This video shows the step-by-step process of locally building a RAG pipeline with Mixtral 8x7B to talk to local documents such as PDFs.
Commands Used:
#%%capture
# Install Haystack with the Colab extras (notebook shell command).
!pip install farm-haystack[colab]
from getpass import getpass
# Prompt for the Hugging Face API token without echoing it to the notebook.
HF_TOKEN = getpass("Hugging Face Token")
from haystack.nodes import PreProcessor,PromptModel, PromptTemplate, PromptNode
from google.colab import files
# Open the Colab file-upload widget so the source PDF can be uploaded.
files.upload()
%%capture
!pip install PyPDF2
import PyPDF2
from haystack import Document
pdf_file_path = "e10897.pdf" # Replace with the path to your PDF file
def extract_text_from_pdf(pdf_path):
    """Extract and concatenate the text of every page of a PDF file.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        The text of all pages joined into one string. Pages with no
        extractable text (e.g. image-only pages) contribute nothing.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # extract_text() may return None for image-only pages; `or ""`
        # prevents a TypeError during concatenation. join() avoids the
        # quadratic cost of repeated string `+=`.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Pull the raw text out of the uploaded PDF.
pdf_text = extract_text_from_pdf(pdf_file_path)
# Create the Haystack document (one document holding the whole PDF text;
# it is split into chunks by the PreProcessor below).
doc = Document(
content=pdf_text,
meta={"pdf_path": pdf_file_path}
)
docs = [doc]
# Clean the text and split the single large document into ~500-word,
# sentence-aligned, non-overlapping chunks for retrieval.
processor = PreProcessor(
clean_empty_lines=True,
clean_whitespace=True,
clean_header_footer=True,
split_by="word",
split_length=500,
split_respect_sentence_boundary=True,
split_overlap=0,
language="it",  # the source document is in Italian
)
preprocessed_docs = processor.process(docs)
from haystack.document_stores import InMemoryDocumentStore
# Keyword (BM25) index kept entirely in memory — no external database needed.
document_store = InMemoryDocumentStore(use_bm25=True)
document_store.write_documents(preprocessed_docs)
from haystack import Pipeline
from haystack.nodes import BM25Retriever

# Fetch the two most relevant chunks from the store for each query.
retriever = BM25Retriever(document_store, top_k=2)

# Prompt that restricts the model to the retrieved context and to Italian.
# Fix: the fallback sentence had a misplaced backslash ("\I don't know ...\"),
# which injected a stray `\I` into the prompt; the backslash belongs on the
# opening quote so the model sees a properly quoted sentence.
qa_template = PromptTemplate(prompt=
""" Using only the information contained in the context,
answer only the question asked without adding suggestions of possible questions and answer exclusively in Italian.
If the answer cannot be deduced from the context, reply: \"I don't know because it is not relevant to the Context.\"
Context: {join(documents)};
Question: {query}
""")

prompt_node = PromptNode(
    model_name_or_path="mistralai/Mixtral-8x7B-Instruct-v0.1",
    api_key=HF_TOKEN,  # Hugging Face Inference API token collected earlier
    default_prompt_template=qa_template,
    max_length=500,  # maximum number of tokens to generate
    model_kwargs={"model_max_length": 5000}
)
# Wire the pipeline: Query -> BM25 retriever -> LLM prompt node.
rag_pipeline = Pipeline()
rag_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
rag_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])

from pprint import pprint

# PEP 8 (E731): use a def instead of assigning a lambda to a name.
def print_answer(out):
    """Pretty-print the first generated answer of a pipeline run result."""
    pprint(out["results"][0].strip())

print_answer(rag_pipeline.run(query="What is Oracle DBA?"))
print_answer(rag_pipeline.run(query="Why Lion is king of jungle?"))
No comments:
Post a Comment