Fahd Mirza on AI, Cloud, DevOps and Databases: Easy Tutorial to Fine-Tune Vision Model on Image Data Locally

This video is an easy, step-by-step tutorial on fine-tuning any vision model locally on your own custom image dataset.

Code:

# Create a fresh conda environment named "ft" with Python 3.11 and activate it.
conda create -n ft python=3.11 -y && conda activate ft
# Install (or upgrade) the libraries imported by the script below.
# NOTE: the leading "!" is the notebook shell-escape prefix; omit it when
# running this command in a regular terminal.
!pip install -U transformers datasets trl peft accelerate Pillow torch

from datasets import features, load_dataset
from transformers import AutoModelForVision2Seq, AutoProcessor
import torch
from trl import DPOConfig, DPOTrainer
from peft import LoraConfig
import os

# Preference dataset: take the "train" split, shuffle it with a fixed seed so
# the subset is reproducible, and keep only the first 100 rows.
ds_id = "openbmb/RLAIF-V-Dataset"
dataset = (
    load_dataset(ds_id, split="train")
    .shuffle(seed=42)
    .select(range(100))
)

# Base checkpoint to fine-tune. The processor is created with image splitting
# switched off; the model weights are loaded in bfloat16.
model_id = "HuggingFaceM4/idefics2-8b"

processor = AutoProcessor.from_pretrained(model_id, do_image_splitting=False)
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
)

def format_ds(example):
    """Turn one raw dataset row into the prompt/chosen/rejected layout.

    Reads ``example["question"]``, ``example["chosen"]``,
    ``example["rejected"]`` and ``example["image"]``. Each text is wrapped in
    a single-turn conversation and rendered to a string with the module-level
    ``processor`` chat template (``tokenize=False``). The image is shrunk in
    place so neither side exceeds half of the processor's ``longest_edge``.

    Returns:
        A dict with keys ``images`` (a one-element list holding the resized
        image), ``prompt``, ``chosen`` and ``rejected``.
    """

    def _text_part(text):
        # One text segment of a chat message's content list.
        return {"type": "text", "text": text}

    # Build the three single-turn conversations: the user turn carries the
    # image placeholder plus the question; each answer is an assistant turn.
    conversations = (
        ("prompt", [{"role": "user", "content": [{"type": "image"}, _text_part(example["question"])]}]),
        ("chosen", [{"role": "assistant", "content": [_text_part(example["chosen"])]}]),
        ("rejected", [{"role": "assistant", "content": [_text_part(example["rejected"])]}]),
    )

    # Render every conversation to plain text through the chat template.
    rendered = {
        field: processor.apply_chat_template(messages, tokenize=False)
        for field, messages in conversations
    }

    # Cap the image at half the processor's longest edge to limit memory use;
    # thumbnail() resizes the image object in place.
    edge_limit = processor.image_processor.size["longest_edge"] // 2
    image = example["image"]
    image.thumbnail((edge_limit, edge_limit))

    return {"images": [image], **rendered}

# Apply format_ds to every row, dropping all of the original columns so only
# the keys returned by format_ds remain; one worker process per CPU reported
# by os.cpu_count().
dataset = dataset.map(format_ds, remove_columns=dataset.column_names, num_proc=os.cpu_count())

# Re-declare the "images" column as a sequence of decoded Image features and
# cast the dataset to that schema.
# NOTE(review): presumably needed so the trainer receives decoded images
# rather than raw encoded data - confirm against the TRL version in use.
f = dataset.features
f["images"] = features.Sequence(features.Image(decode=True))
dataset = dataset.cast(f)

# DPO training configuration.
training_args = DPOConfig(
    output_dir="my-idefics2",  # directory name used for training outputs
    bf16=True,  # bfloat16 training, matching the dtype the model was loaded with
    gradient_checkpointing=True,
    # 2 samples per device x 32 accumulation steps = 64 samples per device
    # for each optimizer step.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=32,
    num_train_epochs=1,
    push_to_hub=False,  # keep everything local
    # Use as many workers as os.cpu_count() reports for both dataset
    # preprocessing and data loading.
    dataset_num_proc=os.cpu_count(),
    dataloader_num_workers=os.cpu_count(),
    logging_steps=10,
    )

# Build the DPO trainer around the loaded model, the formatted dataset and a
# LoRA adapter configuration.
trainer = DPOTrainer(
    model,
    ref_model=None,  # not needed when using peft
    args=training_args,
    train_dataset=dataset,
    # The multimodal processor is passed in the tokenizer slot.
    # NOTE(review): the name of this keyword may differ between TRL releases -
    # verify against the installed version.
    tokenizer=processor,
    # LoRA adapters with target_modules="all-linear".
    peft_config=LoraConfig(target_modules="all-linear"),
)

# Start fine-tuning.
trainer.train()

Fahd Mirza on AI, Cloud, DevOps and Databases

Sunday, July 14, 2024

Easy Tutorial to Fine-Tune Vision Model on Image Data Locally

No comments:

Favourite Authors

Popular Posts

Oracle Jobs in Pakistan

Blog Honor