QA Using Gemini Langchain ChromaDB

!pip -q install langchain openai tiktoken chromadb pypdf sentence_transformers InstructorEmbedding
!pip install langchain-google-genai
!pip show langchain
!pip install -U langchain-community

from langchain_google_genai import GoogleGenerativeAI

import os
os.environ["GOOGLE_API_KEY"] = "YOUR_GOOGLE_API_KEY"  # replace with your own key; never commit a real key

!wget -q https://www.dropbox.com/s/zoj9rnm7oyeaivb/new_papers.zip
!unzip -q new_papers.zip -d new_papers

from langchain.vectorstores import Chroma


from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Load and process the PDF files

# loader = TextLoader('single_text_file.txt')
loader = DirectoryLoader('./new_papers/new_papers/', glob="./*.pdf",
                         loader_cls=PyPDFLoader)
documents = loader.load()
len(documents)
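PyPDFLoader emits one Document per PDF page, so a quick metadata check (an added sanity check, not in the original) confirms what was loaded:

# Each Document carries the source path and page number in its metadata
print(documents[0].metadata)  # e.g. {'source': '...', 'page': 0}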

# Split the text into chunks


text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)
len(texts)
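With chunk_size=1000 and chunk_overlap=200, adjacent chunks share roughly 200 characters of context; peeking at one chunk (an added check, not in the original) verifies the split behaves as expected:

# Inspect the first chunk to confirm its length and content
print(len(texts[0].page_content))
print(texts[0].page_content[:200])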

# from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
# model_name = "sentence-transformers/all-mpnet-base-v2"
# hf = HuggingFaceEmbeddings(model_name=model_name)

from langchain.embeddings import HuggingFaceInstructEmbeddings

instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
                                                      model_kwargs={"device": "cuda"})

# Embed and store the texts


# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'
## Here is the new embeddings being used
embedding = instructor_embeddings
vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

# Persist the db to disk


vectordb.persist()
vectordb = None
# Now we can load the persisted database from disk, and use it as normal.
vectordb = Chroma(persist_directory=persist_directory,
                  embedding_function=embedding)
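After reloading, a direct similarity search (an added check, not in the original notebook) confirms the persisted index is usable:

# Query the reloaded store directly to confirm it was persisted correctly
hits = vectordb.similarity_search("What is Flash attention?", k=2)
print(len(hits))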

retriever = vectordb.as_retriever()
docs = retriever.get_relevant_documents("What is Flash attention?")
len(docs)
retriever = vectordb.as_retriever(search_kwargs={"k": 3})
retriever.search_type
retriever.search_kwargs
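To see how the retriever ranked its results, the underlying store also exposes scored search (a sketch; in Chroma the score is a distance, so lower means more similar):

# Retrieve documents together with their similarity scores
for doc, score in vectordb.similarity_search_with_score("What is Flash attention?", k=3):
    print(round(score, 3), doc.metadata['source'])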

# create the chain to answer questions


qa_chain = RetrievalQA.from_chain_type(llm=GoogleGenerativeAI(model="gemini-pro"),
                                       chain_type="stuff",
                                       retriever=retriever,
                                       return_source_documents=True)

## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def process_llm_response(llm_response):
    print(wrap_text_preserve_newlines(llm_response['result']))
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

query = "What is Flash attention?"


llm_response = qa_chain(query)
process_llm_response(llm_response)
