Sunday, July 28, 2024

Step-by-Step Guide to Create Free Dataset with Ollama and Llama 3.1 Locally

 This video shows an easy step-by-step guide to generating an aligned preference dataset locally by using Ollama and the Llama 3.1 70B model.



Code:

conda create -n dt python=3.11 -y && conda activate dt

conda install jupyter -y
pip uninstall charset_normalizer -y
pip install charset_normalizer
jupyter notebook

ollama run llama3.1:70b

import json  # BUG FIX: `json` was used here before its import further down the file.

# Path to the instruction-fine-tuning dataset: a JSON list of dicts with
# "instruction", "input", and "output" keys.
json_file = "/home/Ubuntu/Data/ift_dataset.json"

with open(json_file, "r") as file:
    json_data = json.load(file)

# Quick sanity checks: row count and a sample entry.
print("Dataset Rows:", len(json_data))

print(json_data[0])

def format_input(entry):
    """Build an Alpaca-style prompt from a dataset entry.

    Always emits the "### Instruction:" section; appends an
    "### Input:" section only when the entry's "input" field is non-empty.
    """
    prompt = f"### Instruction:\n{entry['instruction']}"
    if entry["input"]:
        prompt += f"\n\n### Input:\n{entry['input']}"
    return prompt
   
import urllib.request
import json
def query_model(prompt, model="llama3.1:70b", url="http://localhost:11434/api/chat", timeout=None):
    """Send a single-turn chat prompt to a local Ollama server and return the reply text.

    Args:
        prompt: User message to send.
        model: Ollama model tag to query.
        url: Ollama chat endpoint.
        timeout: Optional socket timeout in seconds passed to urlopen
            (None preserves the previous blocking behavior).

    Returns:
        The assistant message content from the response.
    """
    data = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        # BUG FIX: /api/chat streams newline-delimited JSON chunks by default,
        # which breaks the single json.loads() below. Request one complete
        # JSON object instead.
        "stream": False,
        # Fixed seed and zero temperature for reproducible outputs.
        "options": {"seed": 123, "temperature": 0},
    }

    request = urllib.request.Request(
        url, data=json.dumps(data).encode("utf-8"), method="POST"
    )
    request.add_header("Content-Type", "application/json")

    with urllib.request.urlopen(request, timeout=timeout) as response:
        response_data = json.loads(response.read().decode("utf-8"))
        return response_data["message"]["content"]
       

import random
def generate_model_responses(json_data):
    """Augment each dataset entry in-place with "chosen"/"rejected" preference pairs.

    For every entry, a rewrite of the original output is requested from the
    model in a randomly chosen tone:
      * "very funny"   -> the humorous rewrite is the *rejected* answer and
        the original output is *chosen*;
      * "very serious" -> the professional rewrite is *chosen* and the
        original output is *rejected*.

    Args:
        json_data: List of dicts with "instruction", "input", and "output"
            keys; mutated in place.
    """
    for entry in json_data:
        # Drop stale tone keys from any previous run so entries start clean.
        entry.pop("impolite", None)
        entry.pop("polite", None)
        entry.pop("humorous", None)

    for entry in json_data:
        tone = random.choice(["very funny", "very serious"])
        if tone == "very funny":
            prompt = (
                f"Rewrite `{format_input(entry)}` output to be hilarious and ridiculous: {entry['output']}. "
                "Add a joke or a pun if possible. Exaggerate the response for comedic effect."
            )
        else:
            prompt = (
                f"Rewrite `{format_input(entry)}` output to be very serious and professional: {entry['output']}. "
                "Avoid any humor or sarcasm. Emphasize the importance or gravity of the situation."
            )
        response = query_model(prompt)

        if tone == "very funny":
            # BUG FIX: the original code discarded `response` here and never
            # set "chosen", leaving funny entries without a preference pair.
            # The humorous rewrite is the dispreferred (rejected) answer.
            entry["chosen"] = entry["output"]
            entry["rejected"] = response
        else:
            # (Removed redundant self-assignments of instruction/input/output.)
            entry["chosen"] = response
            entry["rejected"] = entry["output"]

# Generate the chosen/rejected pairs in place, then persist the augmented
# dataset as pretty-printed JSON in the current working directory.
generate_model_responses(json_data)

with open("preference_dataset.json", "w") as file:
    json.dump(json_data, file, indent=4)            

No comments: