Thursday, April 18, 2024

How to Quantize a Model with Hugging Face Quanto

 This video is a hands-on step-by-step primer about how to quantize any model using Hugging Face Quanto which is a versatile pytorch quantization toolkit.



!pip install transformers==4.35.0

!pip install quanto==0.0.11

!pip install torch==2.1.1

!pip install sentencepiece==0.2.0


model_name = "google/flan-t5-small"


import sentencepiece as spm

from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")


model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")


input_text = "Meaning of happiness is "

input_ids = tokenizer(input_text, return_tensors="pt").input_ids


outputs = model.generate(input_ids)

print(tokenizer.decode(outputs[0]))


from helper import compute_module_sizes

module_sizes = compute_module_sizes(model)

print(f"The model size is {module_sizes[''] * 1e-9} GB")


from quanto import quantize, freeze

import torch


quantize(model, weights=torch.int8, activations=None)

freeze(model)


module_sizes = compute_module_sizes(model)

print(f"The model size is {module_sizes[''] * 1e-9} GB")


input_text = "Meaning of happiness is "

input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = model.generate(input_ids)

print(tokenizer.decode(outputs[0]))

No comments: