{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "mount_file_id": "10E5yOrYsLu1k3zhTg0ji9QG3o690Q_Wh",
      "authorship_tag": "ABX9TyNYWsOTZcD6wahQCgxO8jJj",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
19+ "cells" : [
20+ {
21+ "cell_type" : " markdown" ,
22+ "metadata" : {
23+ "id" : " view-in-github" ,
24+ "colab_type" : " text"
25+ },
26+ "source" : [
27+ " <a href=\" https://colab.research.google.com/github/kevatsa/Python-programming-exercises/blob/master/LLM_Question_Answering_Application.ipynb\" target=\" _parent\" ><img src=\" https://colab.research.google.com/assets/colab-badge.svg\" alt=\" Open In Colab\" /></a>"
28+ ]
29+ },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CrgGY3vJ0nYV"
      },
      "outputs": [],
      "source": [
        "# install dependencies; the requirements.txt path points into this project's Drive folder\n",
        "!pip install -r /content/drive/MyDrive/Frontend/requirements.txt\n",
        "!pip install chromadb\n",
        "!pip install tiktoken\n",
        "!pip install cohere\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%%writefile app.py\n",
        "# NOTE: %%writefile saves this cell to disk (app.py is an assumed filename) instead of\n",
        "# executing it, since a Streamlit app must be launched as a script, not in the kernel\n",
        "import streamlit as st\n",
        "from langchain.embeddings.openai import OpenAIEmbeddings\n",
        "from langchain.vectorstores import Chroma\n",
        "import os\n",
        "\n",
        "\n",
55+ " def load_document(file):\n " ,
56+ " import os\n " ,
57+ " name, extension = os.path.splitext(file)\n " ,
58+ " \n " ,
59+ " \n " ,
60+ " if extension == '.pdf':\n " ,
61+ " from langchain.document_loaders import PyPDFLoader\n " ,
62+ " print(f'Loading {file}')\n " ,
63+ " loader = PyPDFLoader(file)\n " ,
64+ " elif extension == '.docx':\n " ,
65+ " from langchain.document_loaders import Docx2txtLoader\n " ,
66+ " print(f'Loading {file}')\n " ,
67+ " loader = Docx2txtLoader(file)\n " ,
68+ " elif extension == '.txt':\n " ,
69+ " from langchain.document_loaders import TextLoader\n " ,
70+ " print(f'Loading {file}')\n " ,
71+ " loader = TextLoader(file)\n " ,
72+ " else:\n " ,
73+ " print('Doc format is not supported')\n " ,
74+ " return None\n " ,
75+ " \n " ,
76+ " data = loader.load()\n " ,
77+ " return data\n " ,
78+ " \n " ,
79+ " \n " ,
80+ " \n " ,
81+ " \n " ,
82+ " def chunk_data(data, chunk_size=256, chunk_overlap=20):\n " ,
83+ " from langchain.text_splitter import RecursiveCharacterTextSplitter\n " ,
84+ " text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n " ,
85+ " chunks = text_splitter.split_documents(data)\n " ,
86+ " return chunks\n " ,
87+ " \n " ,
88+ " \n " ,
89+ " \n " ,
90+ " def create_embeddings(chunks):\n " ,
91+ " embeddings = OpenAIEmbeddings()\n " ,
92+ " vector_store = Chroma.from_documents(chunks, embeddings)\n " ,
93+ " return vector_store\n " ,
94+ " \n " ,
95+ " \n " ,
96+ " \n " ,
97+ " def ask_and_get_answer(vector_store, q, k=3):\n " ,
98+ " from langchain.chains import RetrievalQA\n " ,
99+ " from langchain.chat_models import ChatOpenAI\n " ,
100+ " \n " ,
101+ " llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=2)\n " ,
102+ " \n " ,
103+ " retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k':k})\n " ,
104+ " \n " ,
105+ " chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)\n " ,
106+ " \n " ,
107+ " answer = chain.run(q)\n " ,
108+ " return answer\n " ,
109+ " \n " ,
110+ " \n " ,
111+ " \n " ,
112+ " def calculate_embedding_cost(texts):\n " ,
113+ " import tiktoken\n " ,
114+ " enc = tiktoken.encoding_for_model('text-embedding-ada-002')\n " ,
115+ " total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])\n " ,
116+ " #print(f'Total tokens: {total_tokens}')\n " ,
117+ " #print(f'Embedding cost in USD: {total_tokens / 1000 * 0.0004:.6f}')\n " ,
118+ " return total_tokens, total_tokens / 1000 * 0.0004\n " ,
119+ " \n " ,
120+ " \n " ,
121+ " def clear_history():\n " ,
122+ " if 'history' in st.session_state:\n " ,
123+ " del st.session_state['history']\n " ,
124+ " \n " ,
125+ " \n " ,
126+ " \n " ,
127+ " if __name__ == \" __main__\" :\n " ,
128+ " import os\n " ,
129+ " from dotenv import load_dotenv, find_dotenv\n " ,
130+ " load_dotenv(find_dotenv(), override=True)\n " ,
131+ " \n " ,
132+ " \n " ,
133+ " st.image('/content/drive/MyDrive/Frontend/img.jpeg')\n " ,
134+ " st.subheader('LLM Question-Answering Application')\n " ,
135+ " with st.sidebar:\n " ,
136+ " api_key = st.text_input('OpenAI API Key: ', type='password')\n " ,
137+ " if api_key:\n " ,
138+ " os.environ['OPENAI_API_KEY'] = api_key\n " ,
139+ " \n " ,
140+ " uploaded_file = st.file_uploader('Upload a file:', type=['pdf', 'docx', 'txt'])\n " ,
141+ " chunk_size = st.number_input('Chunk size:', min_value=100, max_value=2048, value=512, on_change=clear_history)\n " ,
142+ " k = st.number_input('k', min_value=1, max_value=20,value=3, on_change=clear_history)\n " ,
143+ " add_data = st.button('Add Data', on_click=clear_history)\n " ,
144+ " \n " ,
145+ " if uploaded_file and add_data:\n " ,
146+ " with st.spinner('Reading, chunking, embedding file...'):\n " ,
147+ " bytes_data = uploaded_file.read()\n " ,
148+ " file_name = os.path.join('./', uploaded_file.name)\n " ,
149+ " with open(file_name, 'wb') as f:\n " ,
150+ " f.write(bytes_data)\n " ,
151+ " \n " ,
152+ " data = load_document(file_name)\n " ,
153+ " chunks = chunk_data(data, chunk_size=chunk_size)\n " ,
154+ " st.write(f'Chunk size: {chunk_size}, chunks: {len(chunks)}')\n " ,
155+ " \n " ,
156+ " \n " ,
157+ " tokens, embedding_cost = calculate_embedding_cost(chunks)\n " ,
158+ " st.write(f'Embedding cost: ${embedding_cost:.4f}')\n " ,
159+ " \n " ,
160+ " vector_store = create_embeddings(chunks)\n " ,
161+ " \n " ,
162+ " st.session_state.vs = vector_store\n " ,
163+ " \n " ,
164+ " st.success('File uploaded, Chunked and Embedded successfully.')\n " ,
165+ " \n " ,
166+ " \n " ,
167+ " q = st.text_input('Ask a question about the content of your file:')\n " ,
168+ " if q:\n " ,
169+ " if 'vs' in st.session_state:\n " ,
170+ " vector_store = st.session_state.vs\n " ,
171+ " #st.write(f'k: {k}')\n " ,
172+ " answer = ask_and_get_answer(vector_store, q, k)\n " ,
173+ " st.text_area('LLM Answer:', value=answer)\n " ,
174+ " \n " ,
175+ " \n " ,
176+ " st.divider()\n " ,
177+ " if 'history' not in st.session_state:\n " ,
178+ " st.session_state.history = ' '\n " ,
179+ " \n " ,
180+ " value = f'Q: {q} \\ nA: {answer} '\n " ,
181+ " st.session_state.history = f'{value} \\ n {\" -\" * 100} \\ n {st.session_state.history}'\n " ,
182+ " h = st.session_state.history\n " ,
183+ " st.text_area(label='Chat History', value=h, key='history', height=400)\n " ,
184+ " \n " ,
185+ " \n " ,
186+ " \n " ,
187+ " \n "
      ],
      "metadata": {
        "id": "Cw2UIcc22jjQ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# launch the app saved by the %%writefile cell above (app.py is the assumed name);\n",
        "# in Colab the server must still be exposed, e.g. through a tunnel, to be viewed\n",
        "!streamlit run app.py\n"
      ],
      "metadata": {
        "id": "q_02qlJzh2Jd"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}