This video is an easy, step-by-step, hands-on tutorial on fine-tuning the Phi-3 LLM locally on your own system with your own dataset.
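The full script below loads the base model, formats the dataset into plain-text conversations, runs supervised fine-tuning with TRL's SFTTrainer, and then builds a model card and pushes everything to the Hugging Face Hub. Before running it, install the required libraries. This is a minimal setup sketch; the package list is assumed from the imports and the paged_adamw_32bit optimizer used below, and versions are not pinned in the video:

# Assumed environment setup: run in a notebook cell, or drop the leading "!" in a shell.
!pip install -qU transformers datasets trl accelerate bitsandbytes huggingface_hub jinja2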
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
from trl import SFTTrainer
from jinja2 import Template
import yaml
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
NEW_MODEL_NAME = "TinyButMighty"
DATASET_NAME = "macadeliccc/opus_samantha"
SPLIT = "train"
MAX_SEQ_LENGTH = 2048
num_train_epochs = 1
license = "apache-2.0"
username = "fahdmirzac"
learning_rate = 2e-5
per_device_train_batch_size = 1
gradient_accumulation_steps = 1
# Load the base Phi-3 model and its tokenizer
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Load the training split of the dataset
dataset = load_dataset(DATASET_NAME, split=SPLIT)
# EOS token string appended to the end of every formatted conversation
EOS_TOKEN = tokenizer.eos_token
def process_dataset(mydata):
    # Convert each ShareGPT-style conversation into a single training string.
    conversations = mydata["conversations"]
    texts = []
    mapper = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"}
    end_mapper = {"system": "", "human": "", "gpt": ""}
    for c in conversations:
        # Prefix every turn with its role and join the turns into one string.
        text = "".join(f"{mapper[(turn := x['from'])]} {x['value']}\n{end_mapper[turn]}" for x in c)
        texts.append(f"{text}{EOS_TOKEN}")
    return {"text": texts}
# Apply the formatting across the whole dataset and inspect one formatted example
dataset = dataset.map(process_dataset, batched=True)
print(dataset['text'][2])
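Each record in opus_samantha carries a `conversations` list of turns with `from` and `value` keys, and process_dataset flattens every conversation into a single training string with role prefixes. A purely illustrative example of the transformation (the turn contents here are invented, not taken from the dataset):

# Hypothetical batch, shown only to illustrate the expected schema.
example_batch = {"conversations": [[
    {"from": "human", "value": "Hi, how are you?"},
    {"from": "gpt", "value": "I'm doing well, thank you!"},
]]}
print(process_dataset(example_batch)["text"][0])
# Roughly: "\nuser\n Hi, how are you?\n\nassistant\n I'm doing well, thank you!\n" plus the EOS token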
# Training arguments; the hyperparameters mirror the constants defined above
args = TrainingArguments(
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    learning_rate=learning_rate,
    lr_scheduler_type="cosine",
    max_steps=-1,
    num_train_epochs=num_train_epochs,
    save_strategy="no",
    logging_steps=1,
    output_dir=NEW_MODEL_NAME,
    optim="paged_adamw_32bit",
    bf16=True,
)
# The dataset already carries the formatted "text" column, so no formatting_func is needed
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
)
trainer.train()
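After training completes, a quick sanity check before saving and uploading is useful. A minimal sketch, using the same role layout that process_dataset produced above (the prompt text is just an example):

# Quick smoke test of the fine-tuned model before saving/uploading.
from transformers import pipeline
test_prompt = "\nuser\n Tell me something interesting about the ocean.\n\nassistant\n"
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
out = pipe(test_prompt, max_new_tokens=100, do_sample=True, temperature=0.7)
print(out[0]["generated_text"])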
from huggingface_hub import ModelCard, ModelCardData, HfApi
from jinja2 import Template
template_text = """
---
license: {{ license }}
---
# {{ NEW_MODEL_NAME }}
{{ NEW_MODEL_NAME }} is an SFT fine-tuned version of {{ MODEL_ID }} using a custom training dataset.
This model was made with [Phinetune]()
## Process
- Learning Rate: {{ learning_rate }}
- Maximum Sequence Length: {{ MAX_SEQ_LENGTH }}
- Dataset: {{ DATASET_NAME }}
- Split: {{ SPLIT }}
## 💻 Usage
```python
!pip install -qU transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_id = "{{ username }}/{{ NEW_MODEL_NAME }}"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Example prompt
prompt = "Your example prompt here"

# Generate a response
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
outputs = pipe(prompt, max_length=50, num_return_sequences=1)
print(outputs[0]["generated_text"])
```
"""
# Create a Jinja template object
jinja_template = Template(template_text.strip())
# Fill the template
content = jinja_template.render(
    license=license,
    NEW_MODEL_NAME=NEW_MODEL_NAME,
    MODEL_ID=MODEL_ID,
    learning_rate=learning_rate,
    MAX_SEQ_LENGTH=MAX_SEQ_LENGTH,
    DATASET_NAME=DATASET_NAME,
    SPLIT=SPLIT,
    username=username,
)
# Save the fine-tuned model and tokenizer to a local folder before uploading
model.save_pretrained(f"{username}/{NEW_MODEL_NAME}")
tokenizer.save_pretrained(f"{username}/{NEW_MODEL_NAME}")
# Save the model card
card = ModelCard(content)
card.save(f"{username}/{NEW_MODEL_NAME}/README.md")
# HF_TOKEN is defined in the secrets tab in Google Colab; if you are running locally,
# pass your Hugging Face token directly or log in with `huggingface-cli login` instead.
from google.colab import userdata
api = HfApi(token=userdata.get("HF_TOKEN"))
# Create the model repo on the Hub (if it doesn't exist) and upload the saved folder
api.create_repo(
    repo_id=f"{username}/{NEW_MODEL_NAME}",
    repo_type="model",
    exist_ok=True,
)
api.upload_folder(
    repo_id=f"{username}/{NEW_MODEL_NAME}",
    folder_path=f"{username}/{NEW_MODEL_NAME}",
)