Skip to content

Commit 132c0c8

Browse files
committed
adding Postgresql docker example
1 parent 2c8ae71 commit 132c0c8

File tree

1 file changed

+286
-2
lines changed

1 file changed

+286
-2
lines changed

Notebooks/03_PostgreSQL.ipynb

+286-2
Original file line numberDiff line numberDiff line change
@@ -88,11 +88,295 @@
8888
"df_postgres = psql.read_sql('SELECT * FROM postgres LIMIT 5;', con=conn)\n",
8989
"df_postgres.head()"
9090
]
91+
},
92+
{
93+
"cell_type": "markdown",
94+
"metadata": {},
95+
"source": [
96+
"## A Python Pipeline using PostgreSQL and Docker\n",
97+
"\n",
98+
"Source: [this blog post](https://globoglobito.medium.com/creating-your-first-data-pipeline-with-python-62bfb7a298fe) and [Github](https://github.com/globoglobito/WebScraperPOC/blob/main/scraper.py)"
99+
]
100+
},
101+
{
102+
"cell_type": "markdown",
103+
"metadata": {},
104+
"source": [
105+
"### If Docker is not installed, see https://docs.docker.com/get-docker/ \n",
106+
"\n",
107+
"### To create your docker container for the first time: \n",
108+
"> docker run -d -p 4321:5432 --name PostgresDB -e POSTGRES_PASSWORD=my_password postgres \n",
109+
"\n",
110+
"### To enter into your container: \n",
111+
"> docker exec -it PostgresDB bash \n",
112+
"\n",
113+
"### Once inside your container, to enter postgres: \n",
114+
"> psql -U postgres \n",
115+
"\n",
116+
"### Finally, to create the table: \n",
117+
">CREATE TABLE scraped_data ( \n",
118+
"> date_of_scraping timestamp, \n",
119+
"> seller varchar(20), \n",
120+
"> name varchar(100), \n",
121+
"> price integer, \n",
122+
"> in_stock bool, \n",
123+
"> deal bool, \n",
124+
"> url varchar(100) \n",
125+
">); "
126+
]
127+
},
128+
{
129+
"cell_type": "markdown",
130+
"metadata": {},
131+
"source": [
132+
"### To create your Grafana container:\n",
133+
"> docker run -d -p 3000:3000 --name=grafana grafana/grafana \n",
134+
"\n",
135+
"Once the container is up and running, open your web browser and go to http://localhost:3000/. \n",
136+
"If not working, check the browser config. \n",
137+
"+ in firefox type about:config\n",
138+
"+ search localhost in it and make below flag true\n",
139+
"+ network.dns.native-is-localhost\n",
140+
"\n",
141+
"On the login page, enter admin for username and password.\n",
142+
"+ Click Log In. \n",
143+
"+ Click OK on the prompt, then change your password. \n",
144+
"+ Add your Postgres DB as a data source: \n",
145+
" + host: host.docker.internal:4321\n",
146+
" + Database: postgres\n",
147+
" + user: postgres\n",
148+
" + password: my_password (defined above)\n",
149+
" \n",
150+
"+ Create your own Dashboards"
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"execution_count": null,
156+
"metadata": {},
157+
"outputs": [],
158+
"source": [
159+
"import os\n",
160+
"import sys\n",
161+
"import requests\n",
162+
"from bs4 import BeautifulSoup\n",
163+
"import re\n",
164+
"import datetime\n",
165+
"import psycopg2\n",
166+
"import smtplib\n",
167+
"import ssl\n",
168+
"import logging\n",
169+
"import argparse\n",
170+
"\n",
171+
"timestamp_of_script = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())\n",
172+
"\n",
173+
"# A very basic logger that dumps information into a file.\n",
174+
"log_file = os.path.join(os.getcwd(), \"WebScraper.log\")\n",
175+
"logger = logging.getLogger(\"WebScraper\")\n",
176+
"logger.setLevel(logging.INFO)\n",
177+
"file_logger = logging.FileHandler(log_file, mode='a')\n",
178+
"file_logger.setLevel(logging.INFO)\n",
179+
"logger.addHandler(file_logger)\n",
180+
"\n",
181+
"\n",
182+
"# These are the web pages I decided to scrape for information. The information we need to scrape the data is:\n",
183+
"# The URL of the web page, the class where the name of the GPU is stored, the class where the price is stored, and\n",
184+
"# the class where the buy button is stored (this is how we determine availability; unless there is stock this class won't appear)\n",
185+
"pages_dictionary = {\"coolmod\": [\"https://www.coolmod.com/asus-turbo-geforce-rtx-3090-24gb-gddr6x-tarjeta-grafica\",\n",
186+
" \"-precio\", \"product-first-part\", \"text-price-total\", \"button-buy\"],\n",
187+
" \"coolmod2\": [\"https://www.coolmod.com/evga-geforce-rtx-3090-xc3-black-gaming-24gb-gddr6x-tarjeta-grafica-precio\",\n",
188+
" \"product-first-part\", \"text-price-total\", \"button-buy\"],\n",
189+
" \"coolmod3\": [\"https://www.coolmod.com/evga-geforce-rtx-3090-xc3-gaming-24gb-gddr6x-tarjeta-grafica-precio\",\n",
190+
" \"product-first-part\", \"text-price-total\", \"button-buy\"],\n",
191+
" \"coolmod4\": [\"https://www.coolmod.com/evga-geforce-rtx-3090-xc3-ultra-gaming-24gb-gddr6x-tarjeta-grafica-precio\",\n",
192+
" \"product-first-part\", \"text-price-total\", \"button-buy\"],\n",
193+
" \"ibertronica\": [\"https://www.ibertronica.es/asus-rtx-3090-turbo-24gb-gddr6x\",\n",
194+
" \"mb-3 h2 product-title\", \"col-6 ng-tns-c1-1 ng-star-inserted\",\n",
195+
" \"btn btn-outline-primary btn-block m-0 mb-3\"],\n",
196+
" \"xtremmedia\": [\"https://www.xtremmedia.com/Asus_Turbo_GeForce_RTX_3090_24GB_GDDR6X.html\",\n",
197+
" \"ficha-titulo\", \"offerDetails article-list-pvp\", \"article-carrito2\", \"precio\"],\n",
198+
" \"xtremmedia2\": [\"https://www.xtremmedia.com/EVGA_GeForce_RTX_3090_XC3_Ultra_Gaming_24GB_GDDR6X.html\",\n",
199+
" \"ficha-titulo\", \"offerDetails article-list-pvp\", \"article-carrito2\", \"precio\"],\n",
200+
" \"pccomponentes\": [\"https://www.pccomponentes.com/asus-turbo-geforce-rtx-3090-24gb-gddr6x\", \"h4\",\n",
201+
" \"baseprice\", \"btn btn-primary btn-lg buy GTM-addToCart buy-button js-article-buy\"],\n",
202+
" \"pccomponentes2\": [\"https://www.pccomponentes.com/evga-geforce-rtx-3090-xc3-black-gaming-24gb-gdddr6x\", \"h4\",\n",
203+
" \"baseprice\", \"btn btn-primary btn-lg buy GTM-addToCart buy-button js-article-buy\"],\n",
204+
" \"pccomponentes3\": [\"https://www.pccomponentes.com/evga-geforce-rtx-3090-xc3-gaming-24gb-gddr6x\", \"h4\", \n",
205+
" \"baseprice\", \"btn btn-primary btn-lg buy GTM-addToCart buy-button js-article-buy\"],\n",
206+
" \"pccomponentes4\": [\"https://www.pccomponentes.com/evga-geforce-rtx-3090-xc3-ultra-gaming-24gb-gddr6x\", \"h4\",\n",
207+
" \"baseprice\", \"btn btn-primary btn-lg buy GTM-addToCart buy-button js-article-buy\"]}\n",
208+
"\n",
209+
"\n",
210+
"# Note for docker:\n",
211+
"# You might have an instance of Postgres running on local and it probably uses port 5432 already. We must bind another local port to port 5432 of the container.\n",
212+
"# In this case when building the container we used: docker run -d -p 4321:5432 ...... and so on.\n",
213+
"\n",
214+
"def get_product_details(urls, name_class, price_class, instock_class, alternate_price_class=None):\n",
215+
" \"\"\" Receives 4-5 inputs, and returns a dictionary with the scraped information.\n",
216+
" The function extracts the relevant information of the url provided (price, name, availability),\n",
217+
" it then cleans and formats the information so that it can be dumped into a relational DB\"\"\"\n",
218+
" headers = {\"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)\",\n",
219+
" \"Chrome/88.0.4324.104 Safari/537.36\"}\n",
220+
" details = {\"date_of_scraping\": \"\", \"seller\": \"\", \"name\": \"\", \"price\": 0, \"in_stock\": False, \"deal\": False,\n",
221+
" \"url\": \"\"}\n",
222+
" if urls == \"\":\n",
223+
" logger.warning(f\"URL parameter is empty, skipping this k-v pair\")\n",
224+
" details = None\n",
225+
" else:\n",
226+
" try:\n",
227+
" page = requests.get(urls, headers=headers)\n",
228+
" page.raise_for_status() # to check if we got a correct response (200) else it raises an Exception.\n",
229+
" soup = BeautifulSoup(page.content, features=\"html.parser\")\n",
230+
" timestamp = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())\n",
231+
" seller_raw = re.sub('^.*w\\.', '', urls)\n",
232+
" name = soup.find(class_=name_class)\n",
233+
" price = soup.find(class_=price_class)\n",
234+
" in_stock = soup.find(class_=instock_class)\n",
235+
" if alternate_price_class is not None and price is None:\n",
236+
" price = soup.find(class_=alternate_price_class)\n",
237+
" details[\"date_of_scraping\"] = timestamp\n",
238+
" if \"ibertronica\" in seller_raw:\n",
239+
" details[\"seller\"] = re.sub('\\.es.*', '', seller_raw)\n",
240+
" else:\n",
241+
" details[\"seller\"] = re.sub('\\.com.*', '', seller_raw)\n",
242+
" if name is not None:\n",
243+
" details[\"name\"] = name.get_text()\n",
244+
" details[\"name\"] = re.sub(\"GeForce\", \"\", details[\"name\"])\n",
245+
" details[\"name\"] = re.sub(\"®\", \"\", details[\"name\"])\n",
246+
" details[\"name\"] = re.sub(\" - {2}Tarjeta Gráfica\", \"\", details[\"name\"])\n",
247+
" details[\"name\"] = re.sub(\" {2}\", \" \", details[\"name\"])\n",
248+
" details[\"name\"] = re.sub(\"DDD\", \"DD\", details[\"name\"])\n",
249+
" details[\"name\"] = details[\"name\"].upper()\n",
250+
" details[\"name\"] = re.sub(\"ASUS TURBO RTX 3090\", \"ASUS RTX 3090 TURBO\", details[\"name\"])\n",
251+
" details[\"url\"] = urls\n",
252+
" else:\n",
253+
" details = None\n",
254+
" logger.warning(f\"URL: {urls} not scraped because the name of the product was not found @ {timestamp}\")\n",
255+
" return details\n",
256+
" if price is not None:\n",
257+
" details[\"price\"] = int(re.sub('[^0-9]', '', price.get_text())[0:4])\n",
258+
" if in_stock is not None:\n",
259+
" details[\"in_stock\"] = True\n",
260+
" if int(details[\"price\"]) <= 1800:\n",
261+
" details[\"deal\"] = True\n",
262+
" logger.info(f\"{urls} scraped successfully @ {timestamp}\")\n",
263+
" except Exception as ex:\n",
264+
" logger.warning(f\"Exception caught @ get_product_details :{ex}\")\n",
265+
" details = None\n",
266+
" return details\n",
267+
"\n",
268+
"\n",
269+
"def iterate_webpages(dictionary):\n",
270+
"    \"\"\" Helper function to iterate over our pages dictionary using the get_product_details function\"\"\"\n",
271+
" if not dictionary:\n",
272+
" logger.warning(f\"Nothing to scrape, ending script\")\n",
273+
" sys.exit(1)\n",
274+
" sql_information_list = []\n",
275+
" for key in dictionary:\n",
276+
" query = get_product_details(*dictionary[key])\n",
277+
" if query is not None:\n",
278+
" sql_information_list.append(query)\n",
279+
" if not sql_information_list:\n",
280+
" logger.warning(f\"No information was scraped, terminating {timestamp_of_script}\")\n",
281+
" sys.exit(1)\n",
282+
" return sql_information_list\n",
283+
"\n",
284+
"\n",
285+
"def create_message(scraped_data):\n",
286+
" \"\"\" A simple function that creates the message to be sent in an email if the conditions are met.\"\"\"\n",
287+
" message = \"\"\n",
288+
" for dic in scraped_data:\n",
289+
" if dic[\"in_stock\"] and dic[\"deal\"]:\n",
290+
" line = f\"The item sold by {dic['seller']} is on sale for {dic['price']} euros @ {dic['url']}\\n\"\n",
291+
" message += line\n",
292+
" return message\n",
293+
"\n",
294+
"\n",
295+
"def send_email(message, config):\n",
296+
" \"\"\" This function sends the actual email should the conditions be met.\"\"\"\n",
297+
" try:\n",
298+
" with open(config) as reader:\n",
299+
" lines = reader.read().splitlines()\n",
300+
" port = 465 # For SSL\n",
301+
" smtp_server = lines[0]\n",
302+
" sender_email = lines[1]\n",
303+
" password = lines[2]\n",
304+
" receiver_email = lines[3]\n",
305+
" print(smtp_server, sender_email, password, receiver_email)\n",
306+
"\n",
307+
" message_to_send = f\"Subject: Price Alert \\n\\n {message}\"\n",
308+
" message_to_send = re.sub(r'[^\\x00-\\x7F]+', ' ', message_to_send) # Quick and dirty regex to remove non ascii chars.\n",
309+
"\n",
310+
" context = ssl.create_default_context()\n",
311+
" with smtplib.SMTP_SSL(smtp_server, port, context=context) as server:\n",
312+
" server.login(sender_email, password)\n",
313+
" server.sendmail(sender_email, receiver_email, message_to_send)\n",
314+
" except Exception as ex:\n",
315+
" logger.warning(f\"Exception caught when trying to send an email @ send_email():{ex}\")\n",
316+
"\n",
317+
"\n",
318+
"def do_insert(rec, config):\n",
319+
" \"\"\" This function inserts the scraped data into our Postgres DB, should an exception occur the function will\n",
320+
" rollback the transaction and continue with the rest.\"\"\"\n",
321+
" try:\n",
322+
" with open(config) as reader:\n",
323+
" lines = reader.read().splitlines()\n",
324+
" db_name = lines[0]\n",
325+
" username = lines[1]\n",
326+
" password = lines[2]\n",
327+
" ip_address = lines[3]\n",
328+
" port = lines[4]\n",
329+
" conn = psycopg2.connect(dbname=db_name, user=username, password=password, host=ip_address, port=port)\n",
330+
" cur = conn.cursor()\n",
331+
" except Exception as ex:\n",
332+
" logger.warning(f\"Exception caught when reading config file @ do_insert():{ex}\")\n",
333+
" sys.exit(1)\n",
334+
"\n",
335+
" for dictionary in rec:\n",
336+
" try:\n",
337+
" cols = dictionary.keys()\n",
338+
" cols_str = ','.join(cols)\n",
339+
" values_to_insert = [dictionary[k] for k in cols]\n",
340+
" values_wildcards = ','.join(['%s' for i in range(len(values_to_insert))]) # -> %s,%s,%s,%s,%s,%s,%s\n",
341+
" sql_str = f\"INSERT INTO scraped_data ({cols_str}) VALUES ({values_wildcards}) ON CONFLICT DO NOTHING\"\n",
342+
" cur.execute(sql_str, values_to_insert)\n",
343+
" conn.commit()\n",
344+
" except Exception as ex:\n",
345+
" conn.rollback()\n",
346+
" logger.warning(f\"Exception caught @ do_insert():{ex}\")\n",
347+
" continue\n",
348+
"\n",
349+
"\n",
350+
"def main():\n",
351+
" scraped_data = iterate_webpages(pages_dictionary)\n",
352+
" email = create_message(scraped_data)\n",
353+
" if email:\n",
354+
" send_email(email, config_path)\n",
355+
" do_insert(scraped_data, pg_config_path)\n",
356+
" logger.info(f\"We are done! @ {timestamp_of_script}\")\n",
357+
"\n",
358+
"\n",
359+
"if __name__ == \"__main__\":\n",
360+
"\n",
361+
" parser = argparse.ArgumentParser()\n",
362+
" parser.add_argument(\"email_config_file\",\n",
363+
" type=str,\n",
364+
" help=\"a text file with email_config parameters for sending the email\")\n",
365+
" parser.add_argument(\"postgres_config_file\",\n",
366+
" type=str,\n",
367+
"                        help=\"a text file with postgres_config parameters for connecting to our postgres db\")\n",
368+
" args = parser.parse_args()\n",
369+
" pwd = os.getcwd()\n",
370+
" config_path = os.path.join(pwd, args.email_config_file)\n",
371+
" pg_config_path = os.path.join(pwd, args.postgres_config_file)\n",
372+
"\n",
373+
" main()"
374+
]
91375
}
92376
],
93377
"metadata": {
94378
"kernelspec": {
95-
"display_name": "Python 3",
379+
"display_name": "Python 3 (ipykernel)",
96380
"language": "python",
97381
"name": "python3"
98382
},
@@ -106,7 +390,7 @@
106390
"name": "python",
107391
"nbconvert_exporter": "python",
108392
"pygments_lexer": "ipython3",
109-
"version": "3.7.6"
393+
"version": "3.8.10"
110394
}
111395
},
112396
"nbformat": 4,

0 commit comments

Comments
 (0)