{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "mount_file_id": "10E5yOrYsLu1k3zhTg0ji9QG3o690Q_Wh",
      "authorship_tag": "ABX9TyNYWsOTZcD6wahQCgxO8jJj",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
19+ "cells" : [
20+ {
21+ "cell_type" : " markdown" ,
22+ "metadata" : {
23+ "id" : " view-in-github" ,
24+ "colab_type" : " text"
25+ },
26+ "source" : [
27+ " <a href=\" https://colab.research.google.com/github/kevatsa/Python-programming-exercises/blob/master/LLM_Question_Answering_Application.ipynb\" target=\" _parent\" ><img src=\" https://colab.research.google.com/assets/colab-badge.svg\" alt=\" Open In Colab\" /></a>"
28+ ]
29+ },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CrgGY3vJ0nYV"
      },
      "outputs": [],
      "source": [
        "# install dependencies; the requirements.txt path points into this project's Drive folder\n",
        "!pip install -r /content/drive/MyDrive/Frontend/requirements.txt\n",
        "!pip install chromadb\n",
        "!pip install tiktoken\n",
        "!pip install cohere\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%%writefile app.py\n",
        "# NOTE: %%writefile saves this cell to disk (app.py is an assumed filename) instead of\n",
        "# executing it, since a Streamlit app must be launched as a script, not in the kernel\n",
        "import streamlit as st\n",
        "from langchain.embeddings.openai import OpenAIEmbeddings\n",
        "from langchain.vectorstores import Chroma\n",
        "import os\n",
        "\n",
        "\n",
55+ " def load_document(file):\n " ,
56+ " import os\n " ,
57+ " name, extension = os.path.splitext(file)\n " ,
58+ " \n " ,
59+ " \n " ,
60+ " if extension == '.pdf':\n " ,
61+ " from langchain.document_loaders import PyPDFLoader\n " ,
62+ " print(f'Loading {file}')\n " ,
63+ " loader = PyPDFLoader(file)\n " ,
64+ " elif extension == '.docx':\n " ,
65+ " from langchain.document_loaders import Docx2txtLoader\n " ,
66+ " print(f'Loading {file}')\n " ,
67+ " loader = Docx2txtLoader(file)\n " ,
68+ " elif extension == '.txt':\n " ,
69+ " from langchain.document_loaders import TextLoader\n " ,
70+ " print(f'Loading {file}')\n " ,
71+ " loader = TextLoader(file)\n " ,
72+ " else:\n " ,
73+ " print('Doc format is not supported')\n " ,
74+ " return None\n " ,
75+ " \n " ,
76+ " data = loader.load()\n " ,
77+ " return data\n " ,
78+ " \n " ,
79+ " \n " ,
80+ " \n " ,
81+ " \n " ,
82+ " def chunk_data(data, chunk_size=256, chunk_overlap=20):\n " ,
83+ " from langchain.text_splitter import RecursiveCharacterTextSplitter\n " ,
84+ " text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n " ,
85+ " chunks = text_splitter.split_documents(data)\n " ,
86+ " return chunks\n " ,
87+ " \n " ,
88+ " \n " ,
89+ " \n " ,
90+ " def create_embeddings(chunks):\n " ,
91+ " embeddings = OpenAIEmbeddings()\n " ,
92+ " vector_store = Chroma.from_documents(chunks, embeddings)\n " ,
93+ " return vector_store\n " ,
94+ " \n " ,
95+ " \n " ,
96+ " \n " ,
97+ " def ask_and_get_answer(vector_store, q, k=3):\n " ,
98+ " from langchain.chains import RetrievalQA\n " ,
99+ " from langchain.chat_models import ChatOpenAI\n " ,
100+ " \n " ,
101+ " llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=2)\n " ,
102+ " \n " ,
103+ " retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k':k})\n " ,
104+ " \n " ,
105+ " chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)\n " ,
106+ " \n " ,
107+ " answer = chain.run(q)\n " ,
108+ " return answer\n " ,
109+ " \n " ,
110+ " \n " ,
111+ " \n " ,
112+ " def calculate_embedding_cost(texts):\n " ,
113+ " import tiktoken\n " ,
114+ " enc = tiktoken.encoding_for_model('text-embedding-ada-002')\n " ,
115+ " total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])\n " ,
116+ " #print(f'Total tokens: {total_tokens}')\n " ,
117+ " #print(f'Embedding cost in USD: {total_tokens / 1000 * 0.0004:.6f}')\n " ,
118+ " return total_tokens, total_tokens / 1000 * 0.0004\n " ,
119+ " \n " ,
120+ " \n " ,
121+ " def clear_history():\n " ,
122+ " if 'history' in st.session_state:\n " ,
123+ " del st.session_state['history']\n " ,
124+ " \n " ,
125+ " \n " ,
126+ " \n " ,
127+ " if __name__ == \" __main__\" :\n " ,
128+ " import os\n " ,
129+ " from dotenv import load_dotenv, find_dotenv\n " ,
130+ " load_dotenv(find_dotenv(), override=True)\n " ,
131+ " \n " ,
132+ " \n " ,
133+ " st.image('/content/drive/MyDrive/Frontend/img.jpeg')\n " ,
134+ " st.subheader('LLM Question-Answering Application')\n " ,
135+ " with st.sidebar:\n " ,
136+ " api_key = st.text_input('OpenAI API Key: ', type='password')\n " ,
137+ " if api_key:\n " ,
138+ " os.environ['OPENAI_API_KEY'] = api_key\n " ,
139+ " \n " ,
140+ " uploaded_file = st.file_uploader('Upload a file:', type=['pdf', 'docx', 'txt'])\n " ,
141+ " chunk_size = st.number_input('Chunk size:', min_value=100, max_value=2048, value=512, on_change=clear_history)\n " ,
142+ " k = st.number_input('k', min_value=1, max_value=20,value=3, on_change=clear_history)\n " ,
143+ " add_data = st.button('Add Data', on_click=clear_history)\n " ,
144+ " \n " ,
145+ " if uploaded_file and add_data:\n " ,
146+ " with st.spinner('Reading, chunking, embedding file...'):\n " ,
147+ " bytes_data = uploaded_file.read()\n " ,
148+ " file_name = os.path.join('./', uploaded_file.name)\n " ,
149+ " with open(file_name, 'wb') as f:\n " ,
150+ " f.write(bytes_data)\n " ,
151+ " \n " ,
152+ " data = load_document(file_name)\n " ,
153+ " chunks = chunk_data(data, chunk_size=chunk_size)\n " ,
154+ " st.write(f'Chunk size: {chunk_size}, chunks: {len(chunks)}')\n " ,
155+ " \n " ,
156+ " \n " ,
157+ " tokens, embedding_cost = calculate_embedding_cost(chunks)\n " ,
158+ " st.write(f'Embedding cost: ${embedding_cost:.4f}')\n " ,
159+ " \n " ,
160+ " vector_store = create_embeddings(chunks)\n " ,
161+ " \n " ,
162+ " st.session_state.vs = vector_store\n " ,
163+ " \n " ,
164+ " st.success('File uploaded, Chunked and Embedded successfully.')\n " ,
165+ " \n " ,
166+ " \n " ,
167+ " q = st.text_input('Ask a question about the content of your file:')\n " ,
168+ " if q:\n " ,
169+ " if 'vs' in st.session_state:\n " ,
170+ " vector_store = st.session_state.vs\n " ,
171+ " #st.write(f'k: {k}')\n " ,
172+ " answer = ask_and_get_answer(vector_store, q, k)\n " ,
173+ " st.text_area('LLM Answer:', value=answer)\n " ,
174+ " \n " ,
175+ " \n " ,
176+ " st.divider()\n " ,
177+ " if 'history' not in st.session_state:\n " ,
178+ " st.session_state.history = ' '\n " ,
179+ " \n " ,
180+ " value = f'Q: {q} \\ nA: {answer} '\n " ,
181+ " st.session_state.history = f'{value} \\ n {\" -\" * 100} \\ n {st.session_state.history}'\n " ,
182+ " h = st.session_state.history\n " ,
183+ " st.text_area(label='Chat History', value=h, key='history', height=400)\n " ,
184+ " \n " ,
185+ " \n " ,
186+ " \n " ,
187+ " \n "
      ],
      "metadata": {
        "id": "Cw2UIcc22jjQ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# launch the app saved by the %%writefile cell above (app.py is the assumed name);\n",
        "# in Colab the server must still be exposed, e.g. through a tunnel, to be viewed\n",
        "!streamlit run app.py\n"
      ],
      "metadata": {
        "id": "q_02qlJzh2Jd"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}