Unlock the power of Retrieval-Augmented Generation (RAG) locally using the Gemma model with our detailed step-by-step tutorial. Learn how to enhance your projects by integrating RAG for insightful document processing and AI-driven content generation. Perfect for developers, data scientists, and AI enthusiasts eager to leverage advanced NLP techniques on their own documents. No prior RAG experience required!
Commands Used:
pip install -U "transformers==4.38.1"
pip install -q pypdf
pip install -q python-dotenv
pip install llama-index==0.10.12
pip install -q gradio
pip install einops
pip install accelerate
pip install llama-index-llms-huggingface
pip install llama-index-embeddings-fastembed
pip install fastembed
import logging
import sys
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings
documents = SimpleDirectoryReader("/home/ubuntu/pdfs").load_data()
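# Optional sanity check (not part of the original steps): confirm the PDFs were picked up.
# SimpleDirectoryReader typically returns one Document object per PDF page.
print(f"Loaded {len(documents)} document objects")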
from llama_index.embeddings.fastembed import FastEmbedEmbedding
embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embed_model
Settings.chunk_size = 512
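# Optional quick check (assumed extension, not in the original walkthrough): make sure the
# FastEmbed model downloads and produces vectors; bge-small-en-v1.5 returns 384-dimensional embeddings.
sample_embedding = embed_model.get_text_embedding("What does this PDF cover?")
print(len(sample_embedding))  # expected: 384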
from llama_index.core import PromptTemplate
system_prompt = "You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided."
# Gemma's instruction-tuned models use <start_of_turn>/<end_of_turn> chat markers
query_wrapper_prompt = PromptTemplate(
    "<start_of_turn>user\n{query_str}<end_of_turn>\n<start_of_turn>model\n"
)
from huggingface_hub.hf_api import HfFolder
HfFolder.save_token('<your huggingface token from huggingface.co>')
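# Alternative (assumed, not shown in the original video): keep the token out of the code by
# putting HF_TOKEN=... in a .env file and loading it with python-dotenv (installed above).
import os
from dotenv import load_dotenv
load_dotenv()  # reads the .env file in the current directory
HfFolder.save_token(os.getenv("HF_TOKEN"))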
import torch
llm = HuggingFaceLLM(
    context_window=8192,  # gemma-7b-it supports an 8192-token context
    max_new_tokens=256,
    # temperature only takes effect when sampling is enabled
    generate_kwargs={"temperature": 0.7, "do_sample": True},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="google/gemma-7b-it",
    model_name="google/gemma-7b-it",
    device_map="auto",
    tokenizer_kwargs={"max_length": 8192},  # match the model's context window
    model_kwargs={"torch_dtype": torch.float16},
)
Settings.llm = llm
index = VectorStoreIndex.from_documents(documents)
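# Optional (assumed extension): persist the index to disk so the embeddings are not recomputed
# on every run; the persist_dir path here is only an example.
from llama_index.core import StorageContext, load_index_from_storage
index.storage_context.persist(persist_dir="/home/ubuntu/index_storage")
# On later runs, rebuild the index from storage instead of re-indexing:
# storage_context = StorageContext.from_defaults(persist_dir="/home/ubuntu/index_storage")
# index = load_index_from_storage(storage_context)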
query_engine = index.as_query_engine()
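# Quick test from the console before wiring up the UI (the question text is just an example):
response = query_engine.query("Summarize the main topic of the documents.")
print(response)
# The response also carries the retrieved chunks it was grounded on:
for node in response.source_nodes:
    print(node.node.metadata.get("file_name"), node.score)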
def predict(message, history):
    # history is required by gr.ChatInterface, but the query engine is stateless here
    response = query_engine.query(message)
    return str(response)
import gradio as gr
gr.ChatInterface(predict).launch(share=True)