Skip to content

Commit 132c0c8

Browse files
committed
adding Postgresql docker example
1 parent 2c8ae71 commit 132c0c8

File tree

1 file changed

+286
-2
lines changed

1 file changed

+286
-2
lines changed

Notebooks/03_PostgreSQL.ipynb

+286-2
Original file line numberDiff line numberDiff line change
@@ -88,11 +88,295 @@
8888
"df_postgres = psql.read_sql('SELECT * FROM postgres LIMIT 5;', con=conn)\n",
8989
"df_postgres.head()"
9090
]
91+
},
92+
{
93+
"cell_type": "markdown",
94+
"metadata": {},
95+
"source": [
96+
"## A Python Pipeline using PostgreSQL and Docker\n",
97+
"\n",
98+
"Source: [this blog post](https://globoglobito.medium.com/creating-your-first-data-pipeline-with-python-62bfb7a298fe) and [Github](https://github.com/globoglobito/WebScraperPOC/blob/main/scraper.py)"
99+
]
100+
},
101+
{
102+
"cell_type": "markdown",
103+
"metadata": {},
104+
"source": [
105+
"### If Docker is not installed, see https://docs.docker.com/get-docker/ \n",
106+
"\n",
107+
"### To create your docker container for the first time: \n",
108+
"> docker run -d -p 4321:5432 --name PostgresDB -e POSTGRES_PASSWORD=my_password postgres \n",
109+
"\n",
110+
"### To enter into your container: \n",
111+
"> docker exec -it PostgresDB bash \n",
112+
"\n",
113+
"### Once inside your container, to enter postgres: \n",
114+
"> psql -U postgres \n",
115+
"\n",
116+
"### Finally, to create the table: \n",
117+
">CREATE TABLE scraped_data ( \n",
118+
"> date_of_scraping timestamp, \n",
119+
"> seller varchar(20), \n",
120+
"> name varchar(100), \n",
121+
"> price integer, \n",
122+
"> in_stock bool, \n",
123+
"> deal bool, \n",
124+
"> url varchar(100) \n",
125+
">); "
126+
]
127+
},
128+
{
129+
"cell_type": "markdown",
130+
"metadata": {},
131+
"source": [
132+
"### To create your Grafana container:\n",
133+
"> docker run -d -p 3000:3000 --name=grafana grafana/grafana \n",
134+
"\n",
135+
"Once the container is up and running, open your web browser and go to http://localhost:3000/. \n",
136+
"If not working, check the browser config. \n",
137+
"+ in firefox type about:config\n",
138+
"+ search localhost in it and make below flag true\n",
139+
"+ network.dns.native-is-localhost\n",
140+
"\n",
141+
"On the login page, enter admin for username and password.\n",
142+
"+ Click Log In. \n",
143+
"+ Click OK on the prompt, then change your password. \n",
144+
"+ Add your Postgres DB as a data source: \n",
145+
" + host: host.docker.internal:4321\n",
146+
" + Database: postgres\n",
147+
" + user: postgres\n",
148+
" + password: my_password (defined above)\n",
149+
" \n",
150+
"+ Create your own Dashboards"
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"execution_count": null,
156+
"metadata": {},
157+
"outputs": [],
158+
"source": [
159+
"import os\n",
160+
"import sys\n",
161+
"import requests\n",
162+
"from bs4 import BeautifulSoup\n",
163+
"import re\n",
164+
"import datetime\n",
165+
"import psycopg2\n",
166+
"import smtplib\n",
167+
"import ssl\n",
168+
"import logging\n",
169+
"import argparse\n",
170+
"\n",
171+
"timestamp_of_script = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())\n",
172+
"\n",
173+
"# A very basic logger that dumps information into a file.\n",
174+
"log_file = os.path.join(os.getcwd(), \"WebScraper.log\")\n",
175+
"logger = logging.getLogger(\"WebScraper\")\n",
176+
"logger.setLevel(logging.INFO)\n",
177+
"file_logger = logging.FileHandler(log_file, mode='a')\n",
178+
"file_logger.setLevel(logging.INFO)\n",
179+
"logger.addHandler(file_logger)\n",
180+
"\n",
181+
"\n",
182+
"# These are the web pages I decided to scrape for information. The information we need to scrape the data is:\n",
183+
"# The URL of the web page, the class where the name of the GPU is stored, the class where the price is stored, and\n",
184+
"# the class where the buy button is stored (this is how we determine availability; unless there is stock this class won't appear)\n",
185+
"pages_dictionary = {\"coolmod\": [\"https://www.coolmod.com/asus-turbo-geforce-rtx-3090-24gb-gddr6x-tarjeta-grafica\",\n",
186+
" \"-precio\", \"product-first-part\", \"text-price-total\", \"button-buy\"],\n",
187+
" \"coolmod2\": [\"https://www.coolmod.com/evga-geforce-rtx-3090-xc3-black-gaming-24gb-gddr6x-tarjeta-grafica-precio\",\n",
188+
" \"product-first-part\", \"text-price-total\", \"button-buy\"],\n",
189+
" \"coolmod3\": [\"https://www.coolmod.com/evga-geforce-rtx-3090-xc3-gaming-24gb-gddr6x-tarjeta-grafica-precio\",\n",
190+
" \"product-first-part\", \"text-price-total\", \"button-buy\"],\n",
191+
" \"coolmod4\": [\"https://www.coolmod.com/evga-geforce-rtx-3090-xc3-ultra-gaming-24gb-gddr6x-tarjeta-grafica-precio\",\n",
192+
" \"product-first-part\", \"text-price-total\", \"button-buy\"],\n",
193+
" \"ibertronica\": [\"https://www.ibertronica.es/asus-rtx-3090-turbo-24gb-gddr6x\",\n",
194+
" \"mb-3 h2 product-title\", \"col-6 ng-tns-c1-1 ng-star-inserted\",\n",
195+
" \"btn btn-outline-primary btn-block m-0 mb-3\"],\n",
196+
" \"xtremmedia\": [\"https://www.xtremmedia.com/Asus_Turbo_GeForce_RTX_3090_24GB_GDDR6X.html\",\n",
197+
" \"ficha-titulo\", \"offerDetails article-list-pvp\", \"article-carrito2\", \"precio\"],\n",
198+
" \"xtremmedia2\": [\"https://www.xtremmedia.com/EVGA_GeForce_RTX_3090_XC3_Ultra_Gaming_24GB_GDDR6X.html\",\n",
199+
" \"ficha-titulo\", \"offerDetails article-list-pvp\", \"article-carrito2\", \"precio\"],\n",
200+
" \"pccomponentes\": [\"https://www.pccomponentes.com/asus-turbo-geforce-rtx-3090-24gb-gddr6x\", \"h4\",\n",
201+
" \"baseprice\", \"btn btn-primary btn-lg buy GTM-addToCart buy-button js-article-buy\"],\n",
202+
" \"pccomponentes2\": [\"https://www.pccomponentes.com/evga-geforce-rtx-3090-xc3-black-gaming-24gb-gdddr6x\", \"h4\",\n",
203+
" \"baseprice\", \"btn btn-primary btn-lg buy GTM-addToCart buy-button js-article-buy\"],\n",
204+
" \"pccomponentes3\": [\"https://www.pccomponentes.com/evga-geforce-rtx-3090-xc3-gaming-24gb-gddr6x\", \"h4\", \n",
205+
" \"baseprice\", \"btn btn-primary btn-lg buy GTM-addToCart buy-button js-article-buy\"],\n",
206+
" \"pccomponentes4\": [\"https://www.pccomponentes.com/evga-geforce-rtx-3090-xc3-ultra-gaming-24gb-gddr6x\", \"h4\",\n",
207+
" \"baseprice\", \"btn btn-primary btn-lg buy GTM-addToCart buy-button js-article-buy\"]}\n",
208+
"\n",
209+
"\n",
210+
"# Note for docker:\n",
211+
"# You might have an instance of Postgres running on local and it probably uses port 5432 already. We must bind another local port to port 5432 of the container.\n",
212+
"# In this case when building the container we used: docker run -d -p 4321:5432 ...... and so on.\n",
213+
"\n",
214+
"def get_product_details(urls, name_class, price_class, instock_class, alternate_price_class=None):\n",
215+
" \"\"\" Receives 4-5 inputs, and returns a dictionary with the scraped information.\n",
216+
" The function extracts the relevant information of the url provided (price, name, availability),\n",
217+
" it then cleans and formats the information so that it can be dumped into a relational DB\"\"\"\n",
218+
" headers = {\"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)\",\n",
219+
" \"Chrome/88.0.4324.104 Safari/537.36\"}\n",
220+
" details = {\"date_of_scraping\": \"\", \"seller\": \"\", \"name\": \"\", \"price\": 0, \"in_stock\": False, \"deal\": False,\n",
221+
" \"url\": \"\"}\n",
222+
" if urls == \"\":\n",
223+
" logger.warning(f\"URL parameter is empty, skipping this k-v pair\")\n",
224+
" details = None\n",
225+
" else:\n",
226+
" try:\n",
227+
" page = requests.get(urls, headers=headers)\n",
228+
" page.raise_for_status() # to check if we got a correct response (200) else it raises an Exception.\n",
229+
" soup = BeautifulSoup(page.content, features=\"html.parser\")\n",
230+
" timestamp = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())\n",
231+
" seller_raw = re.sub('^.*w\\.', '', urls)\n",
232+
" name = soup.find(class_=name_class)\n",
233+
" price = soup.find(class_=price_class)\n",
234+
" in_stock = soup.find(class_=instock_class)\n",
235+
" if alternate_price_class is not None and price is None:\n",
236+
" price = soup.find(class_=alternate_price_class)\n",
237+
" details[\"date_of_scraping\"] = timestamp\n",
238+
" if \"ibertronica\" in seller_raw:\n",
239+
" details[\"seller\"] = re.sub('\\.es.*', '', seller_raw)\n",
240+
" else:\n",
241+
" details[\"seller\"] = re.sub('\\.com.*', '', seller_raw)\n",
242+
" if name is not None:\n",
243+
" details[\"name\"] = name.get_text()\n",
244+
" details[\"name\"] = re.sub(\"GeForce\", \"\", details[\"name\"])\n",
245+
" details[\"name\"] = re.sub(\"®\", \"\", details[\"name\"])\n",
246+
" details[\"name\"] = re.sub(\" - {2}Tarjeta Gráfica\", \"\", details[\"name\"])\n",
247+
" details[\"name\"] = re.sub(\" {2}\", \" \", details[\"name\"])\n",
248+
" details[\"name\"] = re.sub(\"DDD\", \"DD\", details[\"name\"])\n",
249+
" details[\"name\"] = details[\"name\"].upper()\n",
250+
" details[\"name\"] = re.sub(\"ASUS TURBO RTX 3090\", \"ASUS RTX 3090 TURBO\", details[\"name\"])\n",
251+
" details[\"url\"] = urls\n",
252+
" else:\n",
253+
" details = None\n",
254+
" logger.warning(f\"URL: {urls} not scraped because the name of the product was not found @ {timestamp}\")\n",
255+
" return details\n",
256+
" if price is not None:\n",
257+
" details[\"price\"] = int(re.sub('[^0-9]', '', price.get_text())[0:4])\n",
258+
" if in_stock is not None:\n",
259+
" details[\"in_stock\"] = True\n",
260+
" if int(details[\"price\"]) <= 1800:\n",
261+
" details[\"deal\"] = True\n",
262+
" logger.info(f\"{urls} scraped successfully @ {timestamp}\")\n",
263+
" except Exception as ex:\n",
264+
" logger.warning(f\"Exception caught @ get_product_details :{ex}\")\n",
265+
" details = None\n",
266+
" return details\n",
267+
"\n",
268+
"\n",
269+
"def iterate_webpages(dictionary):\n",
270+
"    \"\"\" Helper function to iterate over our pages dictionary using the get_product_details function\"\"\"\n",
271+
" if not dictionary:\n",
272+
" logger.warning(f\"Nothing to scrape, ending script\")\n",
273+
" sys.exit(1)\n",
274+
" sql_information_list = []\n",
275+
" for key in dictionary:\n",
276+
" query = get_product_details(*dictionary[key])\n",
277+
" if query is not None:\n",
278+
" sql_information_list.append(query)\n",
279+
" if not sql_information_list:\n",
280+
" logger.warning(f\"No information was scraped, terminating {timestamp_of_script}\")\n",
281+
" sys.exit(1)\n",
282+
" return sql_information_list\n",
283+
"\n",
284+
"\n",
285+
"def create_message(scraped_data):\n",
286+
" \"\"\" A simple function that creates the message to be sent in an email if the conditions are met.\"\"\"\n",
287+
" message = \"\"\n",
288+
" for dic in scraped_data:\n",
289+
" if dic[\"in_stock\"] and dic[\"deal\"]:\n",
290+
" line = f\"The item sold by {dic['seller']} is on sale for {dic['price']} euros @ {dic['url']}\\n\"\n",
291+
" message += line\n",
292+
" return message\n",
293+
"\n",
294+
"\n",
295+
"def send_email(message, config):\n",
296+
" \"\"\" This function sends the actual email should the conditions be met.\"\"\"\n",
297+
" try:\n",
298+
" with open(config) as reader:\n",
299+
" lines = reader.read().splitlines()\n",
300+
" port = 465 # For SSL\n",
301+
" smtp_server = lines[0]\n",
302+
" sender_email = lines[1]\n",
303+
" password = lines[2]\n",
304+
" receiver_email = lines[3]\n",
305+
" print(smtp_server, sender_email, password, receiver_email)\n",
306+
"\n",
307+
" message_to_send = f\"Subject: Price Alert \\n\\n {message}\"\n",
308+
" message_to_send = re.sub(r'[^\\x00-\\x7F]+', ' ', message_to_send) # Quick and dirty regex to remove non ascii chars.\n",
309+
"\n",
310+
" context = ssl.create_default_context()\n",
311+
" with smtplib.SMTP_SSL(smtp_server, port, context=context) as server:\n",
312+
" server.login(sender_email, password)\n",
313+
" server.sendmail(sender_email, receiver_email, message_to_send)\n",
314+
" except Exception as ex:\n",
315+
" logger.warning(f\"Exception caught when trying to send an email @ send_email():{ex}\")\n",
316+
"\n",
317+
"\n",
318+
"def do_insert(rec, config):\n",
319+
" \"\"\" This function inserts the scraped data into our Postgres DB, should an exception occur the function will\n",
320+
" rollback the transaction and continue with the rest.\"\"\"\n",
321+
" try:\n",
322+
" with open(config) as reader:\n",
323+
" lines = reader.read().splitlines()\n",
324+
" db_name = lines[0]\n",
325+
" username = lines[1]\n",
326+
" password = lines[2]\n",
327+
" ip_address = lines[3]\n",
328+
" port = lines[4]\n",
329+
" conn = psycopg2.connect(dbname=db_name, user=username, password=password, host=ip_address, port=port)\n",
330+
" cur = conn.cursor()\n",
331+
" except Exception as ex:\n",
332+
" logger.warning(f\"Exception caught when reading config file @ do_insert():{ex}\")\n",
333+
" sys.exit(1)\n",
334+
"\n",
335+
" for dictionary in rec:\n",
336+
" try:\n",
337+
" cols = dictionary.keys()\n",
338+
" cols_str = ','.join(cols)\n",
339+
" values_to_insert = [dictionary[k] for k in cols]\n",
340+
" values_wildcards = ','.join(['%s' for i in range(len(values_to_insert))]) # -> %s,%s,%s,%s,%s,%s,%s\n",
341+
" sql_str = f\"INSERT INTO scraped_data ({cols_str}) VALUES ({values_wildcards}) ON CONFLICT DO NOTHING\"\n",
342+
" cur.execute(sql_str, values_to_insert)\n",
343+
" conn.commit()\n",
344+
" except Exception as ex:\n",
345+
" conn.rollback()\n",
346+
" logger.warning(f\"Exception caught @ do_insert():{ex}\")\n",
347+
" continue\n",
348+
"\n",
349+
"\n",
350+
"def main():\n",
351+
" scraped_data = iterate_webpages(pages_dictionary)\n",
352+
" email = create_message(scraped_data)\n",
353+
" if email:\n",
354+
" send_email(email, config_path)\n",
355+
" do_insert(scraped_data, pg_config_path)\n",
356+
" logger.info(f\"We are done! @ {timestamp_of_script}\")\n",
357+
"\n",
358+
"\n",
359+
"if __name__ == \"__main__\":\n",
360+
"\n",
361+
" parser = argparse.ArgumentParser()\n",
362+
" parser.add_argument(\"email_config_file\",\n",
363+
" type=str,\n",
364+
" help=\"a text file with email_config parameters for sending the email\")\n",
365+
" parser.add_argument(\"postgres_config_file\",\n",
366+
" type=str,\n",
367+
"                        help=\"a text file with postgres_config parameters for connecting to our postgres db\")\n",
368+
" args = parser.parse_args()\n",
369+
" pwd = os.getcwd()\n",
370+
" config_path = os.path.join(pwd, args.email_config_file)\n",
371+
" pg_config_path = os.path.join(pwd, args.postgres_config_file)\n",
372+
"\n",
373+
" main()"
374+
]
91375
}
92376
],
93377
"metadata": {
94378
"kernelspec": {
95-
"display_name": "Python 3",
379+
"display_name": "Python 3 (ipykernel)",
96380
"language": "python",
97381
"name": "python3"
98382
},
@@ -106,7 +390,7 @@
106390
"name": "python",
107391
"nbconvert_exporter": "python",
108392
"pygments_lexer": "ipython3",
109-
"version": "3.7.6"
393+
"version": "3.8.10"
110394
}
111395
},
112396
"nbformat": 4,

0 commit comments

Comments
 (0)