|
88 | 88 | "df_postgres = psql.read_sql('SELECT * FROM postgres LIMIT 5;', con=conn)\n",
|
89 | 89 | "df_postgres.head()"
|
90 | 90 | ]
|
| 91 | + }, |
| 92 | + { |
| 93 | + "cell_type": "markdown", |
| 94 | + "metadata": {}, |
| 95 | + "source": [ |
| 96 | + "## A Python Pipeline using PostgreSQL and Docker\n", |
| 97 | + "\n", |
| 98 | + "Source: [this blog post](https://globoglobito.medium.com/creating-your-first-data-pipeline-with-python-62bfb7a298fe) and [Github](https://github.com/globoglobito/WebScraperPOC/blob/main/scraper.py)" |
| 99 | + ] |
| 100 | + }, |
| 101 | + { |
| 102 | + "cell_type": "markdown", |
| 103 | + "metadata": {}, |
| 104 | + "source": [ |
| 105 | + "### If Docker not installed https://docs.docker.com/get-docker/ \n", |
| 106 | + "\n", |
| 107 | + "### To create your docker container for the first time: \n", |
| 108 | + "> docker run -d -p 4321:5432 --name PostgresDB -e POSTGRES_PASSWORD=my_password postgres \n", |
| 109 | + "\n", |
| 110 | + "### To enter into your your container: \n", |
| 111 | + "> docker exec -it PostgresDB bash \n", |
| 112 | + "\n", |
| 113 | + "### Once inside your container, to enter postgres: \n", |
| 114 | + "> psql -U postgres \n", |
| 115 | + "\n", |
| 116 | + "### Finally, to create the table: \n", |
| 117 | + ">CREATE TABLE scraped_data ( \n", |
| 118 | + "> date_of_scraping timestamp, \n", |
| 119 | + "> seller varchar(20), \n", |
| 120 | + "> name varchar(100), \n", |
| 121 | + "> price integer, \n", |
| 122 | + "> in_stock bool, \n", |
| 123 | + "> deal bool, \n", |
| 124 | + "> url varchar(100) \n", |
| 125 | + ">); " |
| 126 | + ] |
| 127 | + }, |
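| | + { |
| | + "cell_type": "markdown", |
| | + "metadata": {}, |
| | + "source": [ |
| | + "### To check the connection from Python (a minimal sketch, assuming the container above is running with host port 4321, user postgres, and the password from the docker run command): " |
| | + ] |
| | + }, |
| | + { |
| | + "cell_type": "code", |
| | + "execution_count": null, |
| | + "metadata": {}, |
| | + "outputs": [], |
| | + "source": [ |
| | + "import psycopg2\n", |
| | + "\n", |
| | + "# 4321 is the host port we bound to the container's 5432 above.\n", |
| | + "conn = psycopg2.connect(dbname=\"postgres\", user=\"postgres\",\n", |
| | + "                        password=\"my_password\", host=\"localhost\", port=4321)\n", |
| | + "with conn.cursor() as cur:\n", |
| | + "    cur.execute(\"SELECT COUNT(*) FROM scraped_data;\")\n", |
| | + "    print(cur.fetchone())  # (0,) while the table is still empty\n", |
| | + "conn.close()" |
| | + ] |
| | + }, |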
| 128 | + { |
| 129 | + "cell_type": "markdown", |
| 130 | + "metadata": {}, |
| 131 | + "source": [ |
| 132 | + "### To create your Grafana container:\n", |
| 133 | + "> docker run -d -p 3000:3000 --name=grafana grafana/grafana \n", |
| 134 | + "\n", |
| 135 | + "Once the container is up and running, open your web browser and go to http://localhost:3000/. \n", |
| 136 | + "If not working, check the browser config. \n", |
| 137 | + "+ in firefox type about:config\n", |
| 138 | + "+ search localhost in it and make below flag true\n", |
| 139 | + "+ network.dns.native-is-localhost\n", |
| 140 | + "\n", |
| 141 | + "On the login page, enter admin for username and password.\n", |
| 142 | + "+ Click Log In. \n", |
| 143 | + "+ Click OK on the prompt, then change your password. \n", |
| 144 | + "+ Add your Postgres DB as a data source: \n", |
| 145 | + " + host: host.docker.internal:4321\n", |
| 146 | + " + Database: postgres\n", |
| 147 | + " + user: postgres\n", |
| 148 | + " + password: my_password (defined above)\n", |
| 149 | + " \n", |
| 150 | + "+ Create your own Dashboards" |
| 151 | + ] |
| 152 | + }, |
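| | + { |
| | + "cell_type": "markdown", |
| | + "metadata": {}, |
| | + "source": [ |
| | + "An example time-series query for a panel (a sketch against the scraped_data table created above; the seller value is just an illustration): \n", |
| | + "> SELECT date_of_scraping AS \"time\", price FROM scraped_data WHERE seller = 'coolmod' ORDER BY 1; " |
| | + ] |
| | + }, |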
| 153 | + { |
| 154 | + "cell_type": "code", |
| 155 | + "execution_count": null, |
| 156 | + "metadata": {}, |
| 157 | + "outputs": [], |
| 158 | + "source": [ |
| 159 | + "import os\n", |
| 160 | + "import sys\n", |
| 161 | + "import requests\n", |
| 162 | + "from bs4 import BeautifulSoup\n", |
| 163 | + "import re\n", |
| 164 | + "import datetime\n", |
| 165 | + "import psycopg2\n", |
| 166 | + "import smtplib\n", |
| 167 | + "import ssl\n", |
| 168 | + "import logging\n", |
| 169 | + "import argparse\n", |
| 170 | + "\n", |
| 171 | + "timestamp_of_script = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())\n", |
| 172 | + "\n", |
| 173 | + "# A very basic logger that dumps information into a file.\n", |
| 174 | + "log_file = os.path.join(os.getcwd(), \"WebScraper.log\")\n", |
| 175 | + "logger = logging.getLogger(\"WebScraper\")\n", |
| 176 | + "logger.setLevel(logging.INFO)\n", |
| 177 | + "file_logger = logging.FileHandler(log_file, mode='a')\n", |
| 178 | + "file_logger.setLevel(logging.INFO)\n", |
| 179 | + "logger.addHandler(file_logger)\n", |
| 180 | + "\n", |
| 181 | + "\n", |
| 182 | + "# These are the web pages I decided to scrape for information. The information we need to scrape the data is:\n", |
| 183 | + "# The URL of the web page, the class where the name of the GPU is stored, the class where the price is stored, and\n", |
| 184 | + "# the class where the buy button is stored (this is how we determine availability; unless there is stock this class wont appear)\n", |
| 185 | + "pages_dictionary = {\"coolmod\": [\"https://www.coolmod.com/asus-turbo-geforce-rtx-3090-24gb-gddr6x-tarjeta-grafica\",\n", |
| 186 | + " \"-precio\", \"product-first-part\", \"text-price-total\", \"button-buy\"],\n", |
| 187 | + " \"coolmod2\": [\"https://www.coolmod.com/evga-geforce-rtx-3090-xc3-black-gaming-24gb-gddr6x-tarjeta-grafica-precio\",\n", |
| 188 | + " \"product-first-part\", \"text-price-total\", \"button-buy\"],\n", |
| 189 | + " \"coolmod3\": [\"https://www.coolmod.com/evga-geforce-rtx-3090-xc3-gaming-24gb-gddr6x-tarjeta-grafica-precio\",\n", |
| 190 | + " \"product-first-part\", \"text-price-total\", \"button-buy\"],\n", |
| 191 | + " \"coolmod4\": [\"https://www.coolmod.com/evga-geforce-rtx-3090-xc3-ultra-gaming-24gb-gddr6x-tarjeta-grafica-precio\",\n", |
| 192 | + " \"product-first-part\", \"text-price-total\", \"button-buy\"],\n", |
| 193 | + " \"ibertronica\": [\"https://www.ibertronica.es/asus-rtx-3090-turbo-24gb-gddr6x\",\n", |
| 194 | + " \"mb-3 h2 product-title\", \"col-6 ng-tns-c1-1 ng-star-inserted\",\n", |
| 195 | + " \"btn btn-outline-primary btn-block m-0 mb-3\"],\n", |
| 196 | + " \"xtremmedia\": [\"https://www.xtremmedia.com/Asus_Turbo_GeForce_RTX_3090_24GB_GDDR6X.html\",\n", |
| 197 | + " \"ficha-titulo\", \"offerDetails article-list-pvp\", \"article-carrito2\", \"precio\"],\n", |
| 198 | + " \"xtremmedia2\": [\"https://www.xtremmedia.com/EVGA_GeForce_RTX_3090_XC3_Ultra_Gaming_24GB_GDDR6X.html\",\n", |
| 199 | + " \"ficha-titulo\", \"offerDetails article-list-pvp\", \"article-carrito2\", \"precio\"],\n", |
| 200 | + " \"pccomponentes\": [\"https://www.pccomponentes.com/asus-turbo-geforce-rtx-3090-24gb-gddr6x\", \"h4\",\n", |
| 201 | + " \"baseprice\", \"btn btn-primary btn-lg buy GTM-addToCart buy-button js-article-buy\"],\n", |
| 202 | + " \"pccomponentes2\": [\"https://www.pccomponentes.com/evga-geforce-rtx-3090-xc3-black-gaming-24gb-gdddr6x\", \"h4\",\n", |
| 203 | + " \"baseprice\", \"btn btn-primary btn-lg buy GTM-addToCart buy-button js-article-buy\"],\n", |
| 204 | + " \"pccomponentes3\": [\"https://www.pccomponentes.com/evga-geforce-rtx-3090-xc3-gaming-24gb-gddr6x\", \"h4\", \n", |
| 205 | + " \"baseprice\", \"btn btn-primary btn-lg buy GTM-addToCart buy-button js-article-buy\"],\n", |
| 206 | + " \"pccomponentes4\": [\"https://www.pccomponentes.com/evga-geforce-rtx-3090-xc3-ultra-gaming-24gb-gddr6x\", \"h4\",\n", |
| 207 | + " \"baseprice\", \"btn btn-primary btn-lg buy GTM-addToCart buy-button js-article-buy\"]}\n", |
| 208 | + "\n", |
| 209 | + "\n", |
| 210 | + "# Note for docker:\n", |
| 211 | + "# You might have an instance of Postgres running on local and it probably uses port 5432 already. We must bind another local port to port 5432 of the container.\n", |
| 212 | + "# In this case when builfing the container we used : docker run -d -p 4321:5432 ...... and so on.\n", |
| 213 | + "\n", |
| 214 | + "def get_product_details(urls, name_class, price_class, instock_class, alternate_price_class=None):\n", |
| 215 | + " \"\"\" Receives 4-5 inputs, and returns a dictionary with the scraped information.\n", |
| 216 | + " The function extracts the relevant information of the url provided (price, name, availability),\n", |
| 217 | + " it then cleans and formats the information so that it can be dumped into a relational DB\"\"\"\n", |
| 218 | + " headers = {\"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)\",\n", |
| 219 | + " \"Chrome/88.0.4324.104 Safari/537.36\"}\n", |
| 220 | + " details = {\"date_of_scraping\": \"\", \"seller\": \"\", \"name\": \"\", \"price\": 0, \"in_stock\": False, \"deal\": False,\n", |
| 221 | + " \"url\": \"\"}\n", |
| 222 | + " if urls == \"\":\n", |
| 223 | + " logger.warning(f\"URL parameter is empty, skipping this k-v pair\")\n", |
| 224 | + " details = None\n", |
| 225 | + " else:\n", |
| 226 | + " try:\n", |
| 227 | + " page = requests.get(urls, headers=headers)\n", |
| 228 | + " page.raise_for_status() # to check if we got a correct response (200) else it raises an Exception.\n", |
| 229 | + " soup = BeautifulSoup(page.content, features=\"html.parser\")\n", |
| 230 | + " timestamp = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())\n", |
| 231 | + " seller_raw = re.sub('^.*w\\.', '', urls)\n", |
| 232 | + " name = soup.find(class_=name_class)\n", |
| 233 | + " price = soup.find(class_=price_class)\n", |
| 234 | + " in_stock = soup.find(class_=instock_class)\n", |
| 235 | + " if alternate_price_class is not None and price is None:\n", |
| 236 | + " price = soup.find(class_=alternate_price_class)\n", |
| 237 | + " details[\"date_of_scraping\"] = timestamp\n", |
| 238 | + " if \"ibertronica\" in seller_raw:\n", |
| 239 | + " details[\"seller\"] = re.sub('\\.es.*', '', seller_raw)\n", |
| 240 | + " else:\n", |
| 241 | + " details[\"seller\"] = re.sub('\\.com.*', '', seller_raw)\n", |
| 242 | + " if name is not None:\n", |
| 243 | + " details[\"name\"] = name.get_text()\n", |
| 244 | + " details[\"name\"] = re.sub(\"GeForce\", \"\", details[\"name\"])\n", |
| 245 | + " details[\"name\"] = re.sub(\"®\", \"\", details[\"name\"])\n", |
| 246 | + " details[\"name\"] = re.sub(\" - {2}Tarjeta Gráfica\", \"\", details[\"name\"])\n", |
| 247 | + " details[\"name\"] = re.sub(\" {2}\", \" \", details[\"name\"])\n", |
| 248 | + " details[\"name\"] = re.sub(\"DDD\", \"DD\", details[\"name\"])\n", |
| 249 | + " details[\"name\"] = details[\"name\"].upper()\n", |
| 250 | + " details[\"name\"] = re.sub(\"ASUS TURBO RTX 3090\", \"ASUS RTX 3090 TURBO\", details[\"name\"])\n", |
| 251 | + " details[\"url\"] = urls\n", |
| 252 | + " else:\n", |
| 253 | + " details = None\n", |
| 254 | + " logger.warning(f\"URL: {urls} not scraped because the name of the product was not found @ {timestamp}\")\n", |
| 255 | + " return details\n", |
| 256 | + " if price is not None:\n", |
| 257 | + " details[\"price\"] = int(re.sub('[^0-9]', '', price.get_text())[0:4])\n", |
| 258 | + " if in_stock is not None:\n", |
| 259 | + " details[\"in_stock\"] = True\n", |
| 260 | + " if int(details[\"price\"]) <= 1800:\n", |
| 261 | + " details[\"deal\"] = True\n", |
| 262 | + " logger.info(f\"{urls} scraped successfully @ {timestamp}\")\n", |
| 263 | + " except Exception as ex:\n", |
| 264 | + " logger.warning(f\"Exception caught @ get_product_details :{ex}\")\n", |
| 265 | + " details = None\n", |
| 266 | + " return details\n", |
| 267 | + "\n", |
| 268 | + "\n", |
| 269 | + "def iterate_webpages(dictionary):\n", |
| 270 | + " \"\"\" Helper function to iterate over our pages directory using the get_products_details function\"\"\"\n", |
| 271 | + " if not dictionary:\n", |
| 272 | + " logger.warning(f\"Nothing to scrape, ending script\")\n", |
| 273 | + " sys.exit(1)\n", |
| 274 | + " sql_information_list = []\n", |
| 275 | + " for key in dictionary:\n", |
| 276 | + " query = get_product_details(*dictionary[key])\n", |
| 277 | + " if query is not None:\n", |
| 278 | + " sql_information_list.append(query)\n", |
| 279 | + " if not sql_information_list:\n", |
| 280 | + " logger.warning(f\"No information was scraped, terminating {timestamp_of_script}\")\n", |
| 281 | + " sys.exit(1)\n", |
| 282 | + " return sql_information_list\n", |
| 283 | + "\n", |
| 284 | + "\n", |
| 285 | + "def create_message(scraped_data):\n", |
| 286 | + " \"\"\" A simple function that creates the message to be sent in an email if the conditions are met.\"\"\"\n", |
| 287 | + " message = \"\"\n", |
| 288 | + " for dic in scraped_data:\n", |
| 289 | + " if dic[\"in_stock\"] and dic[\"deal\"]:\n", |
| 290 | + " line = f\"The item sold by {dic['seller']} is on sale for {dic['price']} euros @ {dic['url']}\\n\"\n", |
| 291 | + " message += line\n", |
| 292 | + " return message\n", |
| 293 | + "\n", |
| 294 | + "\n", |
| 295 | + "def send_email(message, config):\n", |
| 296 | + " \"\"\" This function sends the actual email should the conditions be met.\"\"\"\n", |
| 297 | + " try:\n", |
| 298 | + " with open(config) as reader:\n", |
| 299 | + " lines = reader.read().splitlines()\n", |
| 300 | + " port = 465 # For SSL\n", |
| 301 | + " smtp_server = lines[0]\n", |
| 302 | + " sender_email = lines[1]\n", |
| 303 | + " password = lines[2]\n", |
| 304 | + " receiver_email = lines[3]\n", |
| 305 | + " print(smtp_server, sender_email, password, receiver_email)\n", |
| 306 | + "\n", |
| 307 | + " message_to_send = f\"Subject: Price Alert \\n\\n {message}\"\n", |
| 308 | + " message_to_send = re.sub(r'[^\\x00-\\x7F]+', ' ', message_to_send) # Quick and dirty regex to remove non ascii chars.\n", |
| 309 | + "\n", |
| 310 | + " context = ssl.create_default_context()\n", |
| 311 | + " with smtplib.SMTP_SSL(smtp_server, port, context=context) as server:\n", |
| 312 | + " server.login(sender_email, password)\n", |
| 313 | + " server.sendmail(sender_email, receiver_email, message_to_send)\n", |
| 314 | + " except Exception as ex:\n", |
| 315 | + " logger.warning(f\"Exception caught when trying to send an email @ send_email():{ex}\")\n", |
| 316 | + "\n", |
| 317 | + "\n", |
| 318 | + "def do_insert(rec, config):\n", |
| 319 | + " \"\"\" This function inserts the scraped data into our Postgres DB, should an exception occur the function will\n", |
| 320 | + " rollback the transaction and continue with the rest.\"\"\"\n", |
| 321 | + " try:\n", |
| 322 | + " with open(config) as reader:\n", |
| 323 | + " lines = reader.read().splitlines()\n", |
| 324 | + " db_name = lines[0]\n", |
| 325 | + " username = lines[1]\n", |
| 326 | + " password = lines[2]\n", |
| 327 | + " ip_address = lines[3]\n", |
| 328 | + " port = lines[4]\n", |
| 329 | + " conn = psycopg2.connect(dbname=db_name, user=username, password=password, host=ip_address, port=port)\n", |
| 330 | + " cur = conn.cursor()\n", |
| 331 | + " except Exception as ex:\n", |
| 332 | + " logger.warning(f\"Exception caught when reading config file @ do_insert():{ex}\")\n", |
| 333 | + " sys.exit(1)\n", |
| 334 | + "\n", |
| 335 | + " for dictionary in rec:\n", |
| 336 | + " try:\n", |
| 337 | + " cols = dictionary.keys()\n", |
| 338 | + " cols_str = ','.join(cols)\n", |
| 339 | + " values_to_insert = [dictionary[k] for k in cols]\n", |
| 340 | + " values_wildcards = ','.join(['%s' for i in range(len(values_to_insert))]) # -> %s,%s,%s,%s,%s,%s,%s\n", |
| 341 | + " sql_str = f\"INSERT INTO scraped_data ({cols_str}) VALUES ({values_wildcards}) ON CONFLICT DO NOTHING\"\n", |
| 342 | + " cur.execute(sql_str, values_to_insert)\n", |
| 343 | + " conn.commit()\n", |
| 344 | + " except Exception as ex:\n", |
| 345 | + " conn.rollback()\n", |
| 346 | + " logger.warning(f\"Exception caught @ do_insert():{ex}\")\n", |
| 347 | + " continue\n", |
| 348 | + "\n", |
| 349 | + "\n", |
| 350 | + "def main():\n", |
| 351 | + " scraped_data = iterate_webpages(pages_dictionary)\n", |
| 352 | + " email = create_message(scraped_data)\n", |
| 353 | + " if email:\n", |
| 354 | + " send_email(email, config_path)\n", |
| 355 | + " do_insert(scraped_data, pg_config_path)\n", |
| 356 | + " logger.info(f\"We are done! @ {timestamp_of_script}\")\n", |
| 357 | + "\n", |
| 358 | + "\n", |
| 359 | + "if __name__ == \"__main__\":\n", |
| 360 | + "\n", |
| 361 | + " parser = argparse.ArgumentParser()\n", |
| 362 | + " parser.add_argument(\"email_config_file\",\n", |
| 363 | + " type=str,\n", |
| 364 | + " help=\"a text file with email_config parameters for sending the email\")\n", |
| 365 | + " parser.add_argument(\"postgres_config_file\",\n", |
| 366 | + " type=str,\n", |
| 367 | + " help=\"a text file with email_config parameters connecting to our postgres db\")\n", |
| 368 | + " args = parser.parse_args()\n", |
| 369 | + " pwd = os.getcwd()\n", |
| 370 | + " config_path = os.path.join(pwd, args.email_config_file)\n", |
| 371 | + " pg_config_path = os.path.join(pwd, args.postgres_config_file)\n", |
| 372 | + "\n", |
| 373 | + " main()" |
| 374 | + ] |
91 | 375 | }
|
92 | 376 | ],
|
93 | 377 | "metadata": {
|
94 | 378 | "kernelspec": {
|
95 |
| - "display_name": "Python 3", |
| 379 | + "display_name": "Python 3 (ipykernel)", |
96 | 380 | "language": "python",
|
97 | 381 | "name": "python3"
|
98 | 382 | },
|
|
106 | 390 | "name": "python",
|
107 | 391 | "nbconvert_exporter": "python",
|
108 | 392 | "pygments_lexer": "ipython3",
|
109 |
| - "version": "3.7.6" |
| 393 | + "version": "3.8.10" |
110 | 394 | }
|
111 | 395 | },
|
112 | 396 | "nbformat": 4,
|
|