Wednesday, April 24, 2024

Fine-Tune Phi-3 on Local Custom Dataset

 This video is an easy, step-by-step, hands-on tutorial for fine-tuning the Phi-3 LLM locally on your own machine with your own dataset.





from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig

from datasets import load_dataset

from trl import SFTTrainer

from jinja2 import Template

import yaml


# ---- Run configuration ----

# Base model checkpoint on the Hugging Face Hub.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# Name of the fine-tuned model; also used as the local output directory.
NEW_MODEL_NAME = "TinyButMighty"

# Hub dataset with ShareGPT-style "conversations" records.
DATASET_NAME = "macadeliccc/opus_samantha"

SPLIT = "train"

# Maximum token length per sample fed to SFTTrainer.
MAX_SEQ_LENGTH = 2048

num_train_epochs = 1

# NOTE(review): `license` shadows the built-in `license` helper; harmless in a
# script, but worth renaming if this grows into a module.
license = "apache-2.0"

# Hugging Face username used to build the upload repo id.
username = "fahdmirzac"

learning_rate = 1.41e-5

per_device_train_batch_size = 4

gradient_accumulation_steps = 1


# Load the base model and tokenizer. trust_remote_code is needed because the
# Phi-3 checkpoint ships custom modeling code.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Use the configured constants instead of repeating the string literals.
dataset = load_dataset(DATASET_NAME, split=SPLIT)

# End-of-sequence marker appended to every formatted training sample.
# FIX: the original used tokenizer.eos_token_id (an int), which embedded a bare
# number such as "32000" into the text; the token *string* is what belongs here.
EOS_TOKEN = tokenizer.eos_token


def process_dataset(mydata):
    """Flatten ShareGPT-style conversations into single training strings.

    ``mydata["conversations"]`` is a batch of conversations; each conversation
    is a list of turns with ``"from"`` (system/human/gpt) and ``"value"`` keys.
    Every turn is rendered with a role header, and the module-level EOS_TOKEN
    is appended to each finished conversation.
    """
    role_prefix = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"}
    # Currently all suffixes are empty; kept for symmetry / easy extension.
    role_suffix = {"system": "", "human": "", "gpt": ""}

    rendered = []
    for convo in mydata["conversations"]:
        parts = []
        for turn in convo:
            role = turn["from"]
            parts.append(f"{role_prefix[role]} {turn['value']}\n{role_suffix[role]}")
        rendered.append(f"{''.join(parts)}{EOS_TOKEN}")
    return {"text": rendered}


# Apply the formatter over the whole dataset in batches; this adds a "text"
# column containing the flattened conversation strings.
dataset = dataset.map(process_dataset, batched=True)

# Sanity check: print one formatted example.
print(dataset['text'][2])


# Training hyper-parameters.
# FIX: the original hard-coded per_device_train_batch_size=1 and
# learning_rate=2e-5 here while the generated model card advertises the
# constants defined above (4 and 1.41e-5). Reference the constants so the
# card documents what was actually used.
args = TrainingArguments(
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,   # trade extra compute for lower memory
    learning_rate=learning_rate,
    lr_scheduler_type="cosine",
    max_steps=-1,                  # train by epoch count, not step count
    num_train_epochs=num_train_epochs,
    save_strategy="no",            # no intermediate checkpoints; we save at the end
    logging_steps=1,
    output_dir=NEW_MODEL_NAME,
    optim="paged_adamw_32bit",
    bf16=True,                     # bfloat16 mixed precision (hardware must support it)
)


# The dataset was already flattened into a "text" column by dataset.map above,
# so SFTTrainer only needs dataset_text_field.
# FIX: the original also passed formatting_func=process_dataset, which would
# re-format the already-formatted data (and its batched dict signature does not
# match the per-example callable SFTTrainer expects); it is removed.
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
)

# Run supervised fine-tuning.
trainer.train()













from huggingface_hub import ModelCard, ModelCardData, HfApi

from jinja2 import Template


# Jinja2 template for the Hugging Face model card (README.md). The leading
# "---" block is YAML front matter; the rest is Markdown with a usage snippet.
template_text = """

---

license: {{ license }}

---


# {{ NEW_MODEL_NAME }}


{{ NEW_MODEL_NAME }} is an SFT fine-tuned version of {{ MODEL_ID }} using a custom training dataset.

This model was made with [Phinetune]()


## Process

- Learning Rate: {{ learning_rate }}

- Maximum Sequence Length: {{ MAX_SEQ_LENGTH }}

- Dataset: {{ DATASET_NAME }}

- Split: {{ SPLIT }}


## 💻 Usage

```python

!pip install -qU transformers

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


model = "{{ username }}/{{ NEW_MODEL_NAME }}"

tokenizer = AutoTokenizer.from_pretrained(model)


# Example prompt

prompt = "Your example prompt here"


# Generate a response

model = AutoModelForCausalLM.from_pretrained(model)

pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

outputs = pipeline(prompt, max_length=50, num_return_sequences=1)

print(outputs[0]["generated_text"])

```


"""

# Render the model-card template with this run's configuration values.
card_fields = {
    "license": license,
    "NEW_MODEL_NAME": NEW_MODEL_NAME,
    "MODEL_ID": MODEL_ID,
    "learning_rate": learning_rate,
    "MAX_SEQ_LENGTH": MAX_SEQ_LENGTH,
    "DATASET_NAME": DATASET_NAME,
    "SPLIT": SPLIT,
    "username": username,
}
content = Template(template_text.strip()).render(**card_fields)

# Persist the fine-tuned weights and tokenizer to a local <username>/<model> dir.
output_dir = f"{username}/{NEW_MODEL_NAME}"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


# Colab-only helper for reading secrets stored in the notebook environment.
from google.colab import userdata


# Save the model card
# Write the rendered README.md next to the saved weights so the whole folder
# can be uploaded as one unit.
card = ModelCard(content)

card.save(f"{username}/{NEW_MODEL_NAME}/README.md")


# Defined in the secrets tab in Google Colab
api = HfApi(token=userdata.get("HF_TOKEN"))


# Upload merge folder
# Create the target repo if it does not exist yet, then push the local folder.
api.create_repo(

    repo_id=f"{username}/{NEW_MODEL_NAME}",

    repo_type="model",

    exist_ok=True,

)


api.upload_folder(

    repo_id=f"{username}/{NEW_MODEL_NAME}",

    folder_path=f"{username}/{NEW_MODEL_NAME}",

)

1 comment:

Senthil Mohan K said...

You need to put the code on GitHub or in some code-hosting environment and then link it here ....