Monday, May 6, 2024

Easy Way to Create LLM dataset for Training and Validation Locally

This video is a hands-on tutorial on creating a classification dataset locally from a raw text file (the SMS Spam Collection), complete with training, validation, and test splits.





import urllib.request
import zipfile
import os
from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"


def download_and_unzip(url, zip_path, extracted_path, data_file_path):
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Downloading the file
    with urllib.request.urlopen(url) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    # Unzipping the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Add .tsv file extension
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")


download_and_unzip(url, zip_path, extracted_path, data_file_path)


import pandas as pd

# Load the tab-separated file into a DataFrame with explicit column names
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
df

# The class counts show the dataset is imbalanced: far more "ham" than "spam"
print(df["Label"].value_counts())


def create_balanced_dataset(df):

    # Count the instances of "spam"
    num_spam = df[df["Label"] == "spam"].shape[0]

    # Randomly sample "ham" instances to match the number of "spam" instances
    ham_subset = df[df["Label"] == "ham"].sample(num_spam, random_state=123)

    # Combine the "ham" subset with all "spam" instances
    balanced_df = pd.concat([ham_subset, df[df["Label"] == "spam"]])

    return balanced_df


balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())
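
The snippet above stops at a balanced DataFrame. To finish with the training, validation, and test datasets mentioned at the start, one option is to shuffle the balanced data and slice it by fraction. The random_split helper, the 70/10/20 fractions, the integer label mapping, and the output file names below are illustrative assumptions rather than part of the snippet above; treat this as a minimal sketch.

# A minimal sketch of the final split step, under the assumptions stated above.

def random_split(df, train_frac, validation_frac):
    # Shuffle the whole DataFrame with a fixed seed so the split is reproducible
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)

    # Compute the boundary indices for each split
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    train_df = df[:train_end]
    validation_df = df[train_end:validation_end]
    test_df = df[validation_end:]  # remainder becomes the test set

    return train_df, validation_df, test_df

# Map the text labels to integers so the splits are ready for model training (assumed encoding)
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

# 70% training, 10% validation, remaining 20% test (assumed fractions)
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

# Persist each split so later training and validation code can load it directly
train_df.to_csv("train.csv", index=False)
validation_df.to_csv("validation.csv", index=False)
test_df.to_csv("test.csv", index=False)

Writing the splits to disk keeps the preprocessing reproducible: the training and validation code can reload the same CSV files instead of re-sampling the data each run.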

Credit: rasbt/LLMs-from-scratch: Implementing a ChatGPT-like LLM from scratch, step by step (github.com)
