Fahd Mirza on AI, Cloud, DevOps and Databases: Easy Way to Create LLM dataset for Training and Validation Locally

This video is a hands-on tutorial for creating a classification dataset from a text file with training, validation and test datasets.

import urllib.request

import zipfile

import os

from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"

zip_path = "sms_spam_collection.zip"

extracted_path = "sms_spam_collection"

data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip(url, zip_path, extracted_path, data_file_path):

if data_file_path.exists():

print(f"{data_file_path} already exists. Skipping download and extraction.")

return

# Downloading the file

with urllib.request.urlopen(url) as response:

with open(zip_path, "wb") as out_file:

out_file.write(response.read())

# Unzipping the file

with zipfile.ZipFile(zip_path, "r") as zip_ref:

zip_ref.extractall(extracted_path)

# Add .tsv file extension

original_file_path = Path(extracted_path) / "SMSSpamCollection"

os.rename(original_file_path, data_file_path)

print(f"File downloaded and saved as {data_file_path}")

download_and_unzip(url, zip_path, extracted_path, data_file_path)

import pandas as pd

df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])

print(df["Label"].value_counts())

def create_balanced_dataset(df):

# Count the instances of "spam"

num_spam = df[df["Label"] == "spam"].shape[0]

# Randomly sample "ham' instances to match the number of 'spam' instances

ham_subset = df[df["Label"] == "ham"].sample(num_spam, random_state=123)

# Combine ham "subset" with "spam"

balanced_df = pd.concat([ham_subset, df[df["Label"] == "spam"]])

return balanced_df

balanced_df = create_balanced_dataset(df)

print(balanced_df["Label"].value_counts())

Fahd Mirza on AI, Cloud, DevOps and Databases