{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction to Python \n", "\n", "### Introduction to Scraping" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#import pylab\n", "import re\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "import requests\n", "import urllib\n", "from bs4 import BeautifulSoup as bs\n", "\n", "from IPython.display import HTML\n", "\n", "%matplotlib inline\n", "#%matplotlib notebook" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1 - Using only Pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.1 - Scraping HTML tables with Pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Checking the page: [List of cities and towns in Austria](https://en.wikipedia.org/wiki/List_of_cities_and_towns_in_Austria)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "18" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "url1 = \"https://en.wikipedia.org/wiki/List_of_cities_and_towns_in_Austria\"\n", "list_df_tables = pd.read_html(url1)\n", "len(list_df_tables)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameStatePopulation (as of 2022)[1]
0ViennaVienna1931593
1GrazStyria292630
2LinzUpper Austria207247
3SalzburgSalzburg155331
4InnsbruckTyrol130585
\n", "
" ], "text/plain": [ " Name State Population (as of 2022)[1]\n", "0 Vienna Vienna 1931593\n", "1 Graz Styria 292630\n", "2 Linz Upper Austria 207247\n", "3 Salzburg Salzburg 155331\n", "4 Innsbruck Tyrol 130585" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list_df_tables[0].head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameDesignationPopulation
0AndauMarket town2285
1AntauOther municipality758
2ApetlonMarket town1760
3Bad SauerbrunnOther municipality2219
4Bad TatzmannsdorfOther municipality1554
5BadersdorfOther municipality287
6BaumgartenOther municipality894
7BernsteinMarket town2123
8BildeinOther municipality343
9BocksdorfOther municipality813
\n", "
" ], "text/plain": [ " Name Designation Population\n", "0 Andau Market town 2285\n", "1 Antau Other municipality 758\n", "2 Apetlon Market town 1760\n", "3 Bad Sauerbrunn Other municipality 2219\n", "4 Bad Tatzmannsdorf Other municipality 1554\n", "5 Badersdorf Other municipality 287\n", "6 Baumgarten Other municipality 894\n", "7 Bernstein Market town 2123\n", "8 Bildein Other municipality 343\n", "9 Bocksdorf Other municipality 813" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list_df_tables[1].head(10)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameDesignationPopulation
0Afritz am SeeOther municipality1429
1AlbeckOther municipality995
2AlthofenOther city4759
3ArnoldsteinMarket town7096
4ArriachOther municipality1349
\n", "
" ], "text/plain": [ " Name Designation Population\n", "0 Afritz am See Other municipality 1429\n", "1 Albeck Other municipality 995\n", "2 Althofen Other city 4759\n", "3 Arnoldstein Market town 7096\n", "4 Arriach Other municipality 1349" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list_df_tables[2].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.2 - Read files from URLs with Pandas\n", "\n", "Source: [data.gv.at](https://www.data.gv.at/katalog/dataset/41f8db46-d2d9-48b3-a8b3-138742613c1a)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DatumKategorieTitelBeschreibung
001.Jän.23InternationalNeujahr (staatlicher Feiertag)Beginn des Jahres nach dem gregorianischen Kal...
106.Jän.23ChristentumHeilige Drei Könige (katholisch, evangelisch, ...Beim Dreikönigsfest (auch Epiphanie, das heißt...
206.Jän.23ChristentumOrthodoxer Heiliger Abend (nach julianischem K...Als Symbol für den Stall, den Ort der Geburt J...
307.Jän.23ChristentumOrthodoxes Weihnachten/Orthodoxer Christtag (n...Serbische Bräuche: Am frühen Morgen besucht ei...
422.Jän.23TraditionellChinesisches NeujahrNach dem chinesischen Kalender beginnt das Jah...
502.Feb.23ChristentumMariä Lichtmess / Darstellung des Herrn (katho...Auch \"Einführung Jesu in den Tempel\" genannt. ...
606.Feb.23JudentumTu BiSchwatIn Hebräisch so viel wie: \"15. Tag im Monat Sc...
714.Feb.23InternationalValentinstagAn diesem Tag werden Blumen als Zeichen der Zu...
816.Feb.23IslamHizir Fest14. bis 16. Februar: Hizir Fasten / Festtag (a...
922.Feb.23ChristentumAschermittwoch (katholisch, evangelisch)Für die katholischen Christinnen und Christen ...
\n", "
" ], "text/plain": [ " Datum Kategorie \\\n", "0 01.Jän.23 International \n", "1 06.Jän.23 Christentum \n", "2 06.Jän.23 Christentum \n", "3 07.Jän.23 Christentum \n", "4 22.Jän.23 Traditionell \n", "5 02.Feb.23 Christentum \n", "6 06.Feb.23 Judentum \n", "7 14.Feb.23 International \n", "8 16.Feb.23 Islam \n", "9 22.Feb.23 Christentum \n", "\n", " Titel \\\n", "0 Neujahr (staatlicher Feiertag) \n", "1 Heilige Drei Könige (katholisch, evangelisch, ... \n", "2 Orthodoxer Heiliger Abend (nach julianischem K... \n", "3 Orthodoxes Weihnachten/Orthodoxer Christtag (n... \n", "4 Chinesisches Neujahr \n", "5 Mariä Lichtmess / Darstellung des Herrn (katho... \n", "6 Tu BiSchwat \n", "7 Valentinstag \n", "8 Hizir Fest \n", "9 Aschermittwoch (katholisch, evangelisch) \n", "\n", " Beschreibung \n", "0 Beginn des Jahres nach dem gregorianischen Kal... \n", "1 Beim Dreikönigsfest (auch Epiphanie, das heißt... \n", "2 Als Symbol für den Stall, den Ort der Geburt J... \n", "3 Serbische Bräuche: Am frühen Morgen besucht ei... \n", "4 Nach dem chinesischen Kalender beginnt das Jah... \n", "5 Auch \"Einführung Jesu in den Tempel\" genannt. ... \n", "6 In Hebräisch so viel wie: \"15. Tag im Monat Sc... \n", "7 An diesem Tag werden Blumen als Zeichen der Zu... \n", "8 14. bis 16. Februar: Hizir Fasten / Festtag (a... \n", "9 Für die katholischen Christinnen und Christen ... " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"https://www.wien.gv.at/menschen/integration/ogd/interkultureller-kalender-2023.csv\", \n", " skiprows=5, \n", " usecols=[0,1,2,3],\n", " engine=\"python\",\n", " on_bad_lines=\"skip\",\n", " sep=\";\",\n", " )\n", "df.head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "***\n", "\n", "## 2 - Scraping static pages using only [Requests](https://docs.python-requests.org/en/master/)" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "#### Which one to use: [urllib or requests?](https://stackoverflow.com/questions/2018026/what-are-the-differences-between-the-urllib-urllib2-urllib3-and-requests-modul)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.1 - Getting html content of the page \n", "\n", "Source: [https://www.data.gv.at/](https://www.data.gv.at/suche/?typeFilter%5B%5D=dataset)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "200\n" ] } ], "source": [ "url2 = 'https://www.data.gv.at/suche/?typeFilter%5B%5D=dataset' \n", "content = requests.get(url2)\n", "print(content.status_code)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check [HTTP status codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) " ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'\\n\\n\\n\\n\\t\\n\\n\\nSuche | data.gv.at\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\t\\n\\t\\n\\t\\n\\t\\n\\t\\n\\n\\t\\n\\t\\n\\t\\n\\t\\n\\t\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "

\n", " Seitenbereiche\n", "

\n", " \n", "
\n", "
\n", " \n", "
\n", " \n", " \"Logo\n", " \n", "
\n", "
\n", "
\n", "
\n", " 41146\n", "
\n", "
\n", " 701\n", "
\n", "
\n", " 2390\n", "
\n", "
\n", "
\n", "
\n", " \n", " Datensätze\n", " \n", "
\n", "
\n", " \n", " Anwendungen\n", " \n", "
\n", "
\n", " \n", " Organisationen\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \"Mastodon\n", " \n", " */ moment.updateLocale( 'de_DE', {\"months\":[\"Januar\",\"Februar\",\"M\\u00e4rz\",\"April\",\"Mai\",\"Juni\",\"Juli\",\"August\",\"September\",\"Oktober\",\"November\",\"Dezember\"],\"monthsShort\":[\"Jan\",\"Feb\",\"Mrz\",\"Apr\",\"Mai\",\"Jun\",\"Jul\",\"Aug\",\"Sep\",\"Okt\",\"Nov\",\"Dez\"],\"weekdays\":[\"Sonntag\",\"Montag\",\"Dienstag\",\"Mittwoch\",\"Donnerstag\",\"Freitag\",\"Samstag\"],\"weekdaysShort\":[\"So\",\"Mo\",\"Di\",\"Mi\",\"Do\",\"Fr\",\"Sa\"],\"week\":{\"dow\":1},\"longDateFormat\":{\"LT\":\"H:i\",\"LTS\":null,\"L\":null,\"LL\":\"j. F Y\",\"LLL\":\"j. F Y G:i\",\"LLLL\":null}} ); Seitenbereiche Inhalt (Accesskey 0)\n", "Hauptmenü (Accesskey 1)\n", "Suche (Accesskey 2) \t\t\t\t\t\t\t41146\t\t\t\t\t\t \t\t\t\t\t\t\t701\t\t\t\t\t\t \t\t\t\t\t\t\t2390\t\t\t\t\t\t Datensätze Anwendungen Organisationen de en deen \t\t\t\t\tdocument.getElementById(\"ckan_de\").onclick = function () {\n", "\t\t\t\t\t\tlocation.pathname = location.pathname.replace(/^\\/katalog\\/en\\//, '/katalog/');\n", "\t\t\t\t\t}; \t\t\t\t\tdocument.getElementById(\"ckan_en\").onclick = function () {\n", "\t\t\t\t\t\tlocation.pathname = location.pathname.replace(/^\\/katalog(\\/|\\/de\\/)/, '/katalog/en/');\n", "\t\t\t\t\t};\n", "\t\t\t\t Grußbotschaft des Bundespräsidenten zum 10-jährigen data.gv.at Jubiläum Hauptmenü Daten Datensatz finden\n", "Gesamtbestand aller Datensätze COVID-19 Informationsportal\n", "Ausschreibungen laut BVergG2018\n", "Geodaten – INSPIRE Datensätze mit Anwendungen\n", "Veröffentlichende Stellen\n", "Visualisierung der Daten Anleitung zur Datenveröffentlichung\n", "Daten/Dokumente hinzufügen Anwendungen Anwendung finden\n", "Anwendung einreichen Infos Zielsetzung data.gv.at\n", "Open Data Prinzipien\n", "Entwicklung von Open Data\n", "E-Learning-Module zu Open Data\n", "Open Data Videos Cooperation OGD Österreich\n", "Kooperation OGD D-A-CH-LI Informationen zum BVergG 2018\n", "Leitfaden Open Data Analyse\n", "Leitfaden Open Data Governance\n", "Netiquette\n", "Erklärung zur Barrierefreiheit\n", "Datenschutzerklärung\n", "Impressum News News\n", "Newsletter abonnieren Inhalt \tlet params = (new URL(window.location.href)).searchParams;\n", "\tlet typeFilterInput = document.getElementById('typeFilter');\n", "\twindow.addEventListener('load', function() {\n", "\t\ttypeFilterInput.value = params.has('typeFilter[]') ? params.get('typeFilter[]') : (params.has('typeFilter[0]') ? params.get('typeFilter[0]') : '')\n", "\t});\n", "\t Suche erweiterte Suche 42192 Treffer Vorauswahl \n", "OpenDocument (4307)\n", "\t\t\t\t\t\t\t HighValueDataset (3)\n", "\t\t\t\t\t\t\t Kategorien \n", "Bevölkerung (1)\n", "\t\t\t\t\t\t\t Gesellschaft und Soziales (1)\n", "\t\t\t\t\t\t\t Landwirtschaft, Fischerei, Forstwirtschaft & Lebensmittel (140)\n", "\t\t\t\t\t\t\t Wirtschaft & Finanzen (33975)\n", "\t\t\t\t\t\t\t Bildung, Kultur & Sport (448)\n", "\t\t\t\t\t\t\t Energie (0)\n", "\t\t\t\t\t\t\t Umwelt (4311)\n", "\t\t\t\t\t\t\t Regierung & Öffentliche Verwaltung (948)\n", "\t\t\t\t\t\t\t Gesundheit (149)\n", "\t\t\t\t\t\t\t Internationale Themen (0)\n", "\t\t\t\t\t\t\t Justiz, Gesetze & Öffentliche Sicherheit (0)\n", "\t\t\t\t\t\t\t Regionen & Städte (832)\n", "\t\t\t\t\t\t\t Bevölkerung & Gesellschaft (871)\n", "\t\t\t\t\t\t\t Wissenschaft & Technologie (0)\n", "\t\t\t\t\t\t\t Transport & Verkehr (403)\n", "\t\t\t\t\t\t\t Kunst und Kultur (1)\n", "\t\t\t\t\t\t\t Umwelt (109)\n", "\t\t\t\t\t\t\t Veröffentlichende Stelle \n", "Offenerhaushalt.at (32669)\n", "\t\t\t\t\t\t\t Nationalparks Austria (3325)\n", "\t\t\t\t\t\t\t Stadt Linz (767)\n", "\t\t\t\t\t\t\t Umweltbundesamt GmbH (741)\n", "\t\t\t\t\t\t\t Stadt Wien (590)\n", "\t\t\t\t\t\t\t Gemeinde Engerwitzdorf (514)\n", "\t\t\t\t\t\t\t Statistik Austria (370)\n", "\t\t\t\t\t\t\t Land Oberösterreich (223)\n", "\t\t\t\t\t\t\t Stadt Graz (198)\n", "\t\t\t\t\t\t\t Land Salzburg (195)\n", "\t\t\t\t\t\t\t Alle anzeigen ...Veröffentlichende Stelle×AAGES (1)\n", "\t\t\t\t\t\t\tAGES GmbH (1)\n", "\t\t\t\t\t\t\tAMA (33)\n", "\t\t\t\t\t\t\tAMS Österreich (4)\n", "\t\t\t\t\t\t\tANKÖ Service Ges.m.b.H (1)\n", "\t\t\t\t\t\t\tASFINAG (1)\n", "\t\t\t\t\t\t\tAUSSCHREIBUNG.AT (1)\n", "\t\t\t\t\t\t\tAustriaTech (1)\n", "\t\t\t\t\t\t\tAustria Wirtschaftsservice Gesellschaft mbH (1)\n", "\t\t\t\t\t\t\tBBauamt Ref. IV (1)\n", "\t\t\t\t\t\t\tBetriebsleitung Fusch (1)\n", "\t\t\t\t\t\t\tBEV (44)\n", "\t\t\t\t\t\t\tBildungsdirektion Oberösterreich (1)\n", "\t\t\t\t\t\t\tBKA (25)\n", "\t\t\t\t\t\t\tBMA (10)\n", "\t\t\t\t\t\t\tBMBF (1)\n", "\t\t\t\t\t\t\tBMBWF (7)\n", "\t\t\t\t\t\t\tBMEIA (2)\n", "\t\t\t\t\t\t\tBMF (5)\n", "\t\t\t\t\t\t\tBMI (6)\n", "\t\t\t\t\t\t\tBMLRT (1)\n", "\t\t\t\t\t\t\tBMNT (9)\n", "\t\t\t\t\t\t\tBMSGPK (23)\n", "\t\t\t\t\t\t\tBMVIT (5)\n", "\t\t\t\t\t\t\tBundesbeschaffung Ges.m.b.H (1)\n", "\t\t\t\t\t\t\tBundesdenkmalamt (27)\n", "\t\t\t\t\t\t\tCCAMPUS 02-Fachhochschule der Wirtschaft GmbH (1)\n", "\t\t\t\t\t\t\tCooperation OGD Österreich (11)\n", "\t\t\t\t\t\t\tDDachverband der Sozialversicherungsträger (3)\n", "\t\t\t\t\t\t\tEE-Control (1)\n", "\t\t\t\t\t\t\tEinkaufsmanagement WU Wien (3)\n", "\t\t\t\t\t\t\tEnergie Burgenland AG (1)\n", "\t\t\t\t\t\t\teVergabe.at (1)\n", "\t\t\t\t\t\t\tEVN AG, Beschaffung und Einkauf (1)\n", "\t\t\t\t\t\t\tGGemeinde Altmünster (1)\n", "\t\t\t\t\t\t\tGemeinde Diex (1)\n", "\t\t\t\t\t\t\tGemeinde Edt bei Lambach (1)\n", "\t\t\t\t\t\t\tGemeinde Engerwitzdorf (514)\n", "\t\t\t\t\t\t\tGemeinde Eugendorf (2)\n", "\t\t\t\t\t\t\tGemeinde Gutenberg-Stenzengreith (1)\n", "\t\t\t\t\t\t\tGemeinde Gössendorf (2)\n", "\t\t\t\t\t\t\tGemeinde Hippach, Johann-Sponring-Straße 80, 6283 Hippach (1)\n", "\t\t\t\t\t\t\tGemeinde Kirchham (7)\n", "\t\t\t\t\t\t\tGemeinde Klaffer am Hochficht (1)\n", "\t\t\t\n" ] } ], "source": [ "#print(soup.get_text()[0:10000])\n", "print(re.sub(r\"\\n{2,}\",\" \", soup.get_text()[0:5000]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.1.3 - Getting page information" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Suche | data.gv.at\n", "title\n", "Suche | data.gv.at\n", "head\n", "

Seitenbereiche

\n" ] } ], "source": [ "# title of the page\n", "print(soup.title)\n", "\n", "# get attributes:\n", "print(soup.title.name)\n", "\n", "# get values:\n", "print(soup.title.string)\n", "\n", "# beginning navigation:\n", "print(soup.title.parent.name)\n", "\n", "# getting specific values:\n", "print(soup.p)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 2.1.3.1 - Finding paragraph tags `p` is a fairly common task. \n", "In the case above, we're just finding the first one. What if we wanted to find them all?" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[

Seitenbereiche

,

\"\"

]\n" ] } ], "source": [ "print(soup.find_all('p'))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Seitenbereiche\n", "None\n" ] } ], "source": [ "for paragraph in soup.find_all('p'):\n", " print(paragraph.string)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Seitenbereiche\n", "\n" ] } ], "source": [ "for paragraph in soup.find_all('p'):\n", " print(str(paragraph.text))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The difference between string and text is that string produces a NavigableString object, and text is just typical unicode text. Notice that, if there are child tags in the paragraph item that we're attempting to use .string on, we will get None returned." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 2.1.3.2 - Extracting anchors and links" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[
Inhalt (Accesskey 0),\n", " Hauptmenü (Accesskey 1),\n", " Suche (Accesskey 2),\n", " \"Logo,\n", " \n", " de,\n", " \n", " en,\n", " de,\n", " en,\n", " \n", " \"Mastodon\n", " ,\n", " \n", " \"Twitter\n", " ,\n", " \n", " \"Facebook\n", " ,\n", " \n", " \"Instagram\n", " ,\n", " \n", " \"RSS\n", " ,\n", " Grußbotschaft des Bundespräsidenten zum 10-jährigen data.gv.at Jubiläum,\n", " Hauptmenü,\n", " ,\n", " ,\n", " Datensatz finden,\n", " Gesamtbestand aller Datensätze,\n", "
]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "soup.find_all('a')[0:20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 2.1.3.3 - Extracting only the *href* of anchors and links" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#jump_inhalt\n", "#jump_menu\n", "#jump_suche\n", "/\n", "https://www.data.gv.at/suche/?typeFilter%5B0%5D=dataset\n", "https://www.data.gv.at/en/search/?typeFilter%5B0%5D=dataset\n", "#\n", "#\n", "https://mastodon.social/@datagvat\n", "https://twitter.com/datagvat\n", "https://www.facebook.com/groups/opendataaustria\n", "https://www.instagram.com/data.gv.at/\n", "/infos/rss-feeds/\n", "/2022/09/29/10-jahre-data-gv-at-van-der-bellen/\n", "None\n", "#\n", "https://www.data.gv.at/daten/\n", "https://www.data.gv.at/daten/\n", "/katalog/dataset/metadaten-von-ogd-osterreich\n", "#\n", "https://www.data.gv.at/daten/covid-19/\n", "https://www.data.gv.at/daten/ausschreibungen-laut-bvergg2018/\n", "https://www.data.gv.at/daten/inspire/\n", "#\n", "https://www.data.gv.at/daten/ckan-apps/\n", "https://www.data.gv.at/daten/veroeffentlichende-stellen/\n", "https://www.data.gv.at/2021/06/09/neue-interaktive-visualisierung-verfuegbar-die-daten-von-data-gv-at/\n", "#\n", "https://www.data.gv.at/daten/anleitung-zur-datenveroeffentlichung/\n", "https://www.data.gv.at/daten/daten-hinzufuegen/\n" ] } ], "source": [ "for link in soup.find_all('a')[0:30]:\n", " print(link.get('href'))\n", " #print(link.text)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Now, rather than working with the entire soup, we can specify a new Beautiful Soup object. An example might be:" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "nav = soup.nav" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Next, we can grab the links from just the nav bar:" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#\n", "https://www.data.gv.at/daten/\n", "https://www.data.gv.at/daten/\n", "/katalog/dataset/metadaten-von-ogd-osterreich\n", "#\n", "https://www.data.gv.at/daten/covid-19/\n", "https://www.data.gv.at/daten/ausschreibungen-laut-bvergg2018/\n", "https://www.data.gv.at/daten/inspire/\n", "#\n", "https://www.data.gv.at/daten/ckan-apps/\n", "https://www.data.gv.at/daten/veroeffentlichende-stellen/\n", "https://www.data.gv.at/2021/06/09/neue-interaktive-visualisierung-verfuegbar-die-daten-von-data-gv-at/\n", "#\n", "https://www.data.gv.at/daten/anleitung-zur-datenveroeffentlichung/\n", "https://www.data.gv.at/daten/daten-hinzufuegen/\n", "https://www.data.gv.at/applikationen/\n", "https://www.data.gv.at/applikationen/\n", "https://www.data.gv.at/applikationen/anwendung-einreichen/\n", "https://www.data.gv.at/infos/\n", "https://www.data.gv.at/infos/zielsetzung-data-gv-at/\n", "https://www.data.gv.at/infos/open-data-prinzipien/\n", "https://www.data.gv.at/infos/entwicklung-von-open-data/\n", "https://www.data.gv.at/infos/e-learning-module-zu-open-data/\n", "https://www.data.gv.at/infos/videos/\n", "#\n", "https://www.data.gv.at/infos/cooperation-ogd-oesterreich/\n", "https://www.data.gv.at/infos/ogd-d-a-ch-li/\n", "#\n", "https://www.data.gv.at/infos/bvergg2018/\n", "https://www.data.gv.at/infos/analyse/\n" ] } ], "source": [ "for url in nav.find_all('a')[0:30]:\n", " print(url.get('href'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this case, we're grabbing the first nav tags that we can find (the navigation bar). You could also go for soup.body to get the body section, then grab the .text from there:" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Seitenbereiche\n", "\n" ] } ], "source": [ "body = soup.body\n", "for paragraph in body.find_all('p'):\n", " print(paragraph.text)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finally, sometimes there might be multiple tags with the same names, but different classes, and you might want to grab information from a specific tag with a specific class. For example, our page that we're working with has a div tag with the class of \"body\". We can work with this data like so:" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "for div in soup.find_all('div', class_='header--social'):\n", " print(div)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "for div in soup.find_all('div', class_='header--menu'):\n", " print(div)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 2.1.3.4 - Extracting only *http* and *https* from *href* links" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "https://www.data.gv.at/suche/?typeFilter%5B0%5D=dataset\n", "https://www.data.gv.at/en/search/?typeFilter%5B0%5D=dataset\n", "https://mastodon.social/@datagvat\n", "https://twitter.com/datagvat\n", "https://www.facebook.com/groups/opendataaustria\n", "https://www.instagram.com/data.gv.at/\n", "https://www.data.gv.at/daten/\n", "https://www.data.gv.at/daten/\n", "https://www.data.gv.at/daten/covid-19/\n", "https://www.data.gv.at/daten/ausschreibungen-laut-bvergg2018/\n", "https://www.data.gv.at/daten/inspire/\n", "https://www.data.gv.at/daten/ckan-apps/\n", "https://www.data.gv.at/daten/veroeffentlichende-stellen/\n", "https://www.data.gv.at/2021/06/09/neue-interaktive-visualisierung-verfuegbar-die-daten-von-data-gv-at/\n", "https://www.data.gv.at/daten/anleitung-zur-datenveroeffentlichung/\n", "https://www.data.gv.at/daten/daten-hinzufuegen/\n", "https://www.data.gv.at/applikationen/\n", "https://www.data.gv.at/applikationen/\n", "https://www.data.gv.at/applikationen/anwendung-einreichen/\n", "https://www.data.gv.at/infos/\n", "https://www.data.gv.at/infos/zielsetzung-data-gv-at/\n", "https://www.data.gv.at/infos/open-data-prinzipien/\n", "https://www.data.gv.at/infos/entwicklung-von-open-data/\n", "https://www.data.gv.at/infos/e-learning-module-zu-open-data/\n", "https://www.data.gv.at/infos/videos/\n", "https://www.data.gv.at/infos/cooperation-ogd-oesterreich/\n", "https://www.data.gv.at/infos/ogd-d-a-ch-li/\n", "https://www.data.gv.at/infos/bvergg2018/\n", "https://www.data.gv.at/infos/analyse/\n", "https://www.data.gv.at/infos/governance/\n" ] } ], "source": [ "for link in soup.findAll('a', attrs={'href': re.compile(\"^http[s]?://\")})[0:30]:\n", " print(link.get('href'))" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "### 2.1.4 - Creating a small generic function to extract links:" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "def getLinks(url):\n", " raw_page = requests.get(url)\n", " html_page = raw_page.text\n", " #html_page = urllib.request.urlopen(url)\n", " soup = bs(html_page)\n", " links = []\n", " for link in soup.findAll('a', attrs={'href': re.compile(\"^http[s]?://\")}):\n", " links.append(link.get('href'))\n", " return links" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['https://www.krone.at/', 'https://apps.apple.com/us/app/krone/id633492143?itsct=apps_box_badge&itscg=30200', 'https://play.google.com/store/apps/details?id=at.krone&pcampaignid=pcampaignidMKT-Other-global-all-co-prtnr-py-PartBadge-Mar2515-1', 'https://vorteilswelt.krone.at/', 'https://service.krone.at/?23028&utm_source=krone.at&utm_medium=www&utm_campaign=StartseitenVerlinkung&utm_content=service', 'https://www.krone.at/epaper_weiche', 'https://www.krone.at/community', 'https://www.krone.at/gewinnspiele', 'https://www.krone.at', 'https://www.krone.at/kroneplus', 'https://www.krone.at/oesterreich', 'https://www.krone.at/bundeslaender', 'https://www.krone.at/politik', 'https://www.krone.at/ausland', 'https://www.krone.at/sport', 'https://www.krone.at/wm', 'https://www.krone.at/adabei', 'https://www.krone.at/digital', 'https://www.krone.at/weihnachts-extra', 'https://www.krone.at/motor', 'https://www.krone.at/wirtschaft', 'https://www.krone.at/wissenschaft', 'https://www.krone.at/gericht', 'https://www.krone.at/kolumnen', 'https://www.krone.at/freizeit', 'https://www.krone.at/tierecke', 'https://www.krone.at/gesund', 'https://www.krone.at/kultur', 'https://www.krone.at/kulinarik', 'https://www.krone.at/videos', 'https://www.krone.at/wm', 'https://www.krone.at/ticker/soccer/0/spielplan', 'https://www.krone.at/ticker/soccer/0/tabelle', 'https://www.krone.at/2854385', 'https://www.krone.at/2877502', 'https://www.krone.at/2877580', 'https://fan.at/wm-tippspiel?utm_source=krone&utm_campaign=wm-2022&utm_medium=bettinggame-teaser&utm_content=startpage', 'https://www.krone.at/2877626', 'https://www.krone.at/2877516', 'https://www.krone.at/2877263', 'https://www.krone.at/2877657', 'https://www.krone.at/2877626', 'https://www.krone.at/2877646', 'https://www.krone.at/2877654', 'https://www.krone.at/2877643', 'https://www.krone.at/2877627', 'https://www.krone.at/2877605', 'https://www.krone.at/2877636', 'https://www.krone.at/2877306', 'https://www.krone.at/2877383', 'https://www.krone.at/2877304', 'https://www.krone.at/2877525', 'https://www.krone.at/2877628', 'https://www.krone.at/2877624', 'https://www.krone.at/2877611', 'https://www.krone.at/2877610', 'https://www.krone.at/2877618', 'https://www.krone.at/2877601', 'https://www.krone.at/2877584', 'https://www.krone.at/2877588', 'https://www.krone.at/2877654', 'https://www.krone.at/2877539', 'https://www.krone.at/2877605', 'https://www.krone.at/weihnachts-extra', 'https://www.krone.at/2877318', 'https://www.krone.at/2877601', 'https://www.krone.at/2877531', 'https://www.krone.at/2877212', 'https://www.krone.at/2877566', 'https://www.krone.at/2877584', 'https://www.krone.at/oesterreich', 'https://www.krone.at/politik', 'https://www.krone.at/wirtschaft', 'https://www.krone.at/gericht', 'https://www.krone.at/wetter/oesterreich', 'https://www.krone.at/2877306', 'https://www.krone.at/2877611', 'https://www.krone.at/2877360', 'https://www.krone.at/2852734', 'https://www.krone.at/bundeslaender', 'https://www.krone.at/2877306', 'https://www.krone.at/2877383', 'https://www.krone.at/2877362', 'https://www.krone.at/2877525', 'https://www.krone.at/2877251', 'https://www.krone.at/2877556', 'https://www.krone.at/2877554', 'https://www.krone.at/2877205', 'https://www.krone.at/2877212', 'https://www.krone.at/2877389', 'https://www.krone.at/2877314', 'https://www.krone.at/2877321', 'https://www.krone.at/2877362', 'https://www.krone.at/2876519', 'https://www.krone.at/2877578', 'https://www.krone.at/2877516', 'https://www.krone.at/2877405', 'https://www.krone.at/2877463', 'https://www.krone.at/2877343', 'https://www.krone.at/2876602', 'https://www.krone.at/2877383', 'https://www.krone.at/2877461', 'https://www.krone.at/2877532', 'https://www.krone.at/2877551', 'https://www.krone.at/2877310', 'https://www.krone.at/2877360', 'https://www.krone.at/2877539', 'https://www.krone.at/2877455', 'https://www.krone.at/2877417', 'https://www.krone.at/2877576', 'https://www.krone.at/2877281', 'https://www.krone.at/2877537', 'https://www.krone.at/2877306', 'https://www.krone.at/2877611', 'https://www.krone.at/2877574', 'https://www.krone.at/2877174', 'https://www.krone.at/2877525', 'https://www.krone.at/2877485', 'https://www.krone.at/2877454', 'https://www.krone.at/2877089', 'https://www.krone.at/wien', 'https://www.krone.at/niederoesterreich', 'https://www.krone.at/oberoesterreich', 'https://www.krone.at/burgenland', 'https://www.krone.at/salzburg', 'https://www.krone.at/steiermark', 'https://www.krone.at/kaernten', 'https://www.krone.at/tirol', 'https://www.krone.at/vorarlberg', 'https://www.krone.at/politik', 'https://www.krone.at/2877383', 'https://www.krone.at/2877389', 'https://www.krone.at/ausland', 'https://www.krone.at/video-nachrichten', 'https://www.krone.at/wetter/welt', 'https://www.krone.at/2877582', 'https://www.krone.at/2877573', 'https://www.krone.at/2877359', 'https://www.krone.at/2877413', 'https://www.krone.at/sport', 'https://www.krone.at/fussball', 'https://www.krone.at/motorsport', 'https://www.krone.at/wintersport', 'https://www.krone.at/eishockey', 'https://www.krone.at/sport-mix', 'https://www.krone.at/2877657', 'https://www.krone.at/2877304', 'https://www.krone.at/2877525', 'https://www.krone.at/2877624', 'https://www.krone.at/adabei', 'https://www.krone.at/star-style', 'https://www.krone.at/hollywood', 'https://www.krone.at/royals', 'https://www.krone.at/video-adabei-tv', 'https://www.krone.at/2877645', 'https://www.krone.at/2877627', 'https://www.krone.at/2877610', 'https://www.krone.at/2875806', 'https://www.krone.at/digital', 'https://www.krone.at/web', 'https://www.krone.at/elektronik', 'https://www.krone.at/spiele', 'https://www.krone.at/2877646', 'https://www.krone.at/2877643', 'https://www.krone.at/2877636', 'https://www.krone.at/2876173', 'https://www.krone.at/shopping-tipps', 'https://www.krone.at/haushalt-garten', 'https://www.krone.at/sport-freizeit', 'https://www.krone.at/essen-trinken', 'https://www.krone.at/2874663', 'https://www.krone.at/2853911', 'https://www.krone.at/motor', 'https://www.krone.at/video-motor', 'http://anzeigen.krone.at/anzeigen/Auto_Motor/Autos/~__~/district/~~~/---~S7QytqrOtDI0NLJOtDIEMQ2ADCOr6mKghFJuZp6SdTGQq2RoAWKAhBIrkIRqa0F6DfHoNbBSQtUJFgDps7REaCsGMpQMQOIA/?', 'https://www.krone.at/2876947', 'https://www.krone.at/2876928', 'https://www.krone.at/wissenschaft', 'https://www.krone.at/2877177', 'https://www.krone.at/2876879', 'https://www.krone.at/gesund-fit', 'https://www.krone.at/2875563', 'https://www.krone.at/2876888', 'https://www.krone.at/musik', 'https://ticket.krone.at/de/home', 'https://www.krone.at/kolumnen', 'https://www.krone.at/2877556', 'https://www.krone.at/2876579', 'https://www.krone.at/medien', 'https://tv.krone.at', 'https://www.krone.at/leben', 'https://www.krone.at/2876845', 'https://www.krone.at/2870268', 'https://www.krone.at/reisen-urlaub', 'https://www.krone.at/zukunft', 'https://www.krone.at/2874406', 'https://www.krone.at/2876348', 'https://www.krone.at/community', 'https://www.krone.at/gewinnspiele', 'https://www.krone.at/2869270', 'https://www.krone.at/2876065', 'https://www.krone.at/star-style', 'https://www.krone.at/kulinarik', 'https://www.krone.at/2876528', 'https://www.krone.at/2877251', 'https://www.krone.at/ombudsfrau', 'https://www.krone.at/tierecke', 'https://www.krone.at/video-tierecke', 'https://www.krone.at/2875233', 'https://www.krone.at/2868104', 'https://www.krone.at/bauen-wohnen', 'https://www.dibeo.at/?utm_source=krone.at&utm_medium=referral&utm_campaign=krone-header', 'https://www.krone.at/viral', 'https://www.krone.at/video-viral', 'https://www.krone.at/2874839', 'https://www.krone.at/2876885', 'http://bit.ly/3AFfWzS', 'https://www.krone.at/2876409', 'https://www.krone.at/2876409', 'https://www.krone.at/2875959', 'https://www.krone.at/2874733', 'https://www.krone.at/2876025', 'https://www.krone.at/2877318', 'https://www.krone.at/2876581', 'https://www.krone.at/2876158', 'https://ticket.krone.at/de/advent?utm_source=XmasSpecial&utm_medium=randspalte&utm_campaign=Adventaktion2020', 'https://www.krone.at/videos', 'https://www.krone.at/2877101', 'https://www.krone.at/2877628', 'https://pubads.g.doubleclick.net/gampad/clk?id=5849865560&iu=/6771346/clicktracking.krone.at/clicktracking.krone.at_navi-header', 'https://jackpot.onelink.me/1833068633/d1d6df19', 'https://www.krone.at/games-bubble-shooter', 'https://www.krone.at/games-kartenspiele', 'https://www.krone.at/kreuzwortraetsel', 'https://www.krone.at/sudoku', 'https://www.krone.at/das-freie-wort', 'https://www.krone.at/vergleich/', 'https://www.krone.at/vergleich/dampfgarer-test/', 'https://www.krone.at/vergleich/einbaukuehlschrank-test/', 'https://www.krone.at/vergleich/staubsauger-test/', 'https://www.krone.at/vergleich/friteuse-test/', 'https://www.krone.at/vergleich/gefrierschrank-test/', 'https://www.krone.at/vergleich/geschirrspueler-test/', 'https://www.krone.at/vergleich/induktionsherd-test/', 'https://www.krone.at/vergleich/kaffeemaschine-test/', 'https://www.krone.at/vergleich/akku-staubsauger-test/', 'https://www.krone.at/vergleich/kuechenmaschine-test/', 'https://www.krone.at/vergleich/', 'https://www.krone.at/gutschein/', 'https://www.krone.at/gutschein/otto?utm_source=krone&utm_medium=textlink&utm_campaign=widget', 'https://www.krone.at/gutschein/eis?utm_source=krone&utm_medium=textlink&utm_campaign=widget', 'https://www.krone.at/gutschein/deichmann?utm_source=krone&utm_medium=textlink&utm_campaign=widget', 'https://www.krone.at/gutschein/moemax?utm_source=krone&utm_medium=textlink&utm_campaign=widget', 'https://www.krone.at/gutschein/gutscheinkalender?utm_source=krone&utm_medium=textlink&utm_campaign=widget', 'https://www.krone.at/newsletter', 'https://www.krone.at/newsletter', 'https://www.krone.at/video-tv-programm', 'https://www.krone.at/videos', 'https://vorteilswelt.krone.at/', 'https://service.krone.at/?23028&utm_source=krone.at&utm_medium=www&utm_campaign=StartseitenVerlinkung&utm_content=service', 'https://vorteilswelt.krone.at/?23028&utm_source=krone.at&utm_medium=www&utm_campaign=StartseitenVerlinkung&utm_content=kundenbindung', 'https://www.krone.at/epaper_weiche', 'https://www.krone.at/gewinnspiele', 'https://www.krone.at/community', 'https://www.krone.at/580415', 'https://www.krone.at/392364', 'https://www.krone.at/sales', 'https://www.krone.at/434475', 'https://www.krone.at/1704878', 'https://www.krone.at/37371', 'https://www.krone.at/37371', 'https://www.facebook.com/krone.at', 'https://www.twitter.com/krone_at', 'https://www.youtube.com/user/kronetv', 'https://www.instagram.com/kronen.zeitung']\n" ] } ], "source": [ "print(getLinks(\"https://www.krone.at/\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 2.1.5 - Extracting information from tables" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "url3 = urllib.request.urlopen('https://pythonprogramming.net/parsememcparseface/').read()\n", "soup = bs(url3,'lxml')" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "table = soup.table\n", "#table = soup.find('table')" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "table_rows = table.find_all('tr')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[]\n", "['Python', '932914021', 'Definitely']\n", "['Pascal', '532', 'Unlikely']\n", "['Lisp', '1522', 'Uncertain']\n", "['D#', '12', 'Possibly']\n", "['Cobol', '3', 'No.']\n", "['Fortran', '52124', 'Yes.']\n", "['Haskell', '24', 'lol.']\n" ] } ], "source": [ "for tr in table_rows:\n", " td = tr.find_all('td')\n", " row = [i.text for i in td]\n", " print(row)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "***\n", "\n", "### 3 - Parsing dynamically updated data via javascript" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[General Info](https://stackoverflow.com/questions/8049520/web-scraping-javascript-page-with-python) \n", "[Selenium](https://www.selenium.dev/) \n", "[Dry Scape](https://dryscrape.readthedocs.io/en/latest/) " ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "200\n" ] } ], "source": [ "url4 = \"https://kurier.at/\"\n", "content = requests.get(url4)\n", "print(content.status_code)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "***\n", "\n", "### 4 - Using Requests with Parameters\n", "\n", "[Example](https://towardsdatascience.com/web-scraping-101-in-python-35f8653b1c97) of using [parameters with *requests*](https://docs.python-requests.org/en/master/user/quickstart/) on [IMDB](https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating):" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "After taking a look at the IMDB webpage, we’ll set out to extract (all highlighted in the above screenshot of the page):\n", "\n", "+ Movie title\n", "+ Release year\n", "+ Runtime\n", "+ Audience rating\n", "+ Genre\n", "+ IMDB rating\n", "+ Number of votes\n", "+ Box office earnings\n", "+ Director\n", "+ Primary actors" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [], "source": [ "url3 = 'https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating'\n", "\n", "def get_page_contents(url):\n", " page = requests.get(url, headers={\"Accept-Language\": \"en-US\"})\n", " return bs(page.text, \"html.parser\")\n", "\n", "soup = get_page_contents(url3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extracting text: Movie titles and release year\n", "\n", "Since we’ve already located the HTML tag containing each movie card, we can get a list of all distinct movies and their corresponding HTML by:" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[

\n", " 1.\n", " The Shawshank Redemption\n", " (1994)\n", "

,\n", "

\n", " 2.\n", " The Godfather\n", " (1972)\n", "

,\n", "

\n", " 3.\n", " The Dark Knight\n", " (2008)\n", "

,\n", "

\n", " 4.\n", " The Lord of the Rings: The Return of the King\n", " (2003)\n", "

,\n", "

\n", " 5.\n", " Schindler's List\n", " (1993)\n", "

]" ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies = soup.findAll('h3', class_='lister-item-header') \n", "movies[0:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The findAll method creates a list where each entry contains the HTML that’s captured within the h3 tag and list-item-header class. \n", "By taking a deeper look at the first movies HTML and see that the movie title can be found under the first a tag.\n", "\n", "To capture this attribute we can loop through all movies and either call findAll and grab the first element of the list, or we can use the find method which automatically grabs the first tag it finds. Thus, we can construct a list of all movie titles (with a little help from list comprehensions for efficiency) through:" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Jai Bhim',\n", " 'The Shawshank Redemption',\n", " 'The Godfather',\n", " 'Soorarai Pottru',\n", " 'The Dark Knight',\n", " 'The Godfather: Part II',\n", " '12 Angry Men',\n", " 'The Lord of the Rings: The Return of the King',\n", " 'Pulp Fiction',\n", " \"Schindler's List\"]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titles = [movie.find('a').text for movie in movies]\n", "titles[0:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Release years can be found under the tag span and class lister-item-year text-muted unbold. To grab these, we can follow a similar approach as before:" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['(2021)',\n", " '(1994)',\n", " '(1972)',\n", " '(2020)',\n", " '(2008)',\n", " '(1974)',\n", " '(1957)',\n", " '(2003)',\n", " '(1994)',\n", " '(1993)']" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "release = [movie.find('span', class_='lister-item-year text-muted unbold').text for movie in movies]\n", "release[0:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extracting Numerical Values:\n", "\n", "In the case of IMDB ratings, number of votes and box office earnings we can see that while these may be available as string values, can also grab the actual numerical values from the data-value attribute within each respective tag.\n", "\n", "Here’s the IMDB rating of The Godfather: \n", "\\
" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'9.3'" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "soup.find('div', class_='inline-block ratings-imdb-rating')['data-value']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the case of number of votes and earnings we don’t have a class attribute to filter for. Here are the number of votes and estimated box office earnings:" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['166955', '2524608', '28,341,469', '1738105', '134,966,411', '105969', '2474861', '534,858,444', '1206022', '57,300,000', '745533', '4,360,000', '1742556', '377,845,905', '1945178', '107,928,762', '1290110', '96,898,818', '2219102', '292,576,195', '381254', '34422', '1985641', '37,030,102', '1763818', '315,544,750', '1948453', '330,252,182', '729757', '6,100,000', '80169', '111744', '1574783', '342,551,365', '1819851', '171,479,930', '1092313', '46,836,394', '1226372', '290,475,067', '968073', '112,000,000', '704737', '53,367,844', '33253', '1670884', '188,020,017', '731227', '7,563,397', '711808', '10,055,859', '1318236', '216,540,909', '1228259', '136,801,374', '664080', '57,598,247', '1549651', '100,125,643', '1357805', '130,742,922', '1297940', '322,740,140', '51376', '333835', '269,061', '437425', '27144', '784073', '13,092,000', '813495', '13,182,281', '1268744', '53,089,891', '1262628', '132,384,315', '787132', '32,572,577', '1426161', '187,705,427', '1081233', '6,719,864', '1044282', '23,341,568', '1103636', '19,501,238', '1002551', '422,783,777', '1050193', '204,843,350', '249906', '11,990,401', '259173', '1136081', '210,609,762', '318833', '5,321,508', '642033', '32,000,000', '473008', '36,764,313', '550923', '1,024,560', '232696', '163,245', '178698', '19,181', '80529', '80141', '1,661,096', '1126129', '335,451,311', '37320', '37822', '236671', '5,017,246', '176504', '12,391,761', '460477', '190,241,310', '993360', '858,373,000', '966948', '678,815,482', '453653', '209,726,015', '1459854', '162,805,434', '1607516', '448,139,099', '375858', '6,532,908', '184301', '1,223,869', '1061740', '223,808,164', '377926', '11,286,112', '551409', '707,481', '1187303', '25,544,867', '372970', '2,375,308', '930103', '248,159,971', '964675', '44,017,374', '640271', '83,471,511', '838483', '78,900,000', '41116', '473614', '275,902', '119338', '8,175,000', '190087', '30035', '536,364', '214280', '216460', '288,475', '46927', '898,575', '522938', '159,227,644', '54677', '4,186,168', '39268', '52258', '35364', '39058', '311464', '687,185', '235572', '7,098,492', '167495', '6,857,096']\n" ] } ], "source": [ "votes_earnings = soup.findAll('span', {'name':'nv'})\n", "print([ve['data-value'] for ve in votes_earnings])" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['166955', '28,341,469', '134,966,411', '2474861', '1206022']\n", "['2524608', '1738105', '105969', '534,858,444', '57,300,000']\n" ] } ], "source": [ "votes = []\n", "earnings = []\n", "idx = 0\n", "while idx < len(votes_earnings)-1:\n", " votes.append(votes_earnings[idx]['data-value'])\n", " earnings.append(votes_earnings[idx+1]['data-value'])\n", " idx+=2\n", "print(votes[0:5])\n", "print(earnings[0:5])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Nested Values\n", "\n", "In the case where the data we need is located within multiple levels of generic tags, we’ll need to dig into this nested structure to extract what we need.\n", "\n", "In the case of the movie directors and actors we’ll need to do just that. From inspecting the HTML we see that the director information is located within an initial p tag and thereafter an a tag — both without class attributes making it necessary to unnest the data. We’ll do this by calling find and findAll repeatedly.\n", "\n", "Since the director is the 1st a tag, we can extract this information through:" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "---\n", "IMDb Top 250 Movies chart\n", "IMDb Top 250 Movies chart\n", "---\n", "T.J. Gnanavel\n", "T.J. Gnanavel\n", "Suriya\n", "Suriya\n", "Lijo Mol Jose\n", "Lijo Mol Jose\n", "Manikandan\n", "Manikandan\n", "Rajisha Vijayan\n", "Rajisha Vijayan\n", "---\n", "Frank Darabont\n", "Frank Darabont\n", "Tim Robbins\n", "Tim Robbins\n", "Morgan Freeman\n", "Morgan Freeman\n", "Bob Gunton\n", "Bob Gunton\n", "William Sadler\n", "William Sadler\n" ] } ], "source": [ "for l1 in soup.findAll('p')[0:10]:\n", " if l1.find('a'):\n", " print('---')\n", " for l2 in l1.findAll('a'):\n", " print(l2)\n", " print(l2.text)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "actors = [actor.text for actor in soup.findAll('p')[2].findAll('a')]\n", "actors" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Creating functions to automate information scraping on IMDB:" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "def numeric_value(movie, tag, class_=None, order=None):\n", " if order:\n", " if len(movie.findAll(tag, class_)) > 1:\n", " to_extract = movie.findAll(tag, class_)[order]['data-value']\n", " else:\n", " to_extract = None\n", " else:\n", " to_extract = movie.find(tag, class_)['data-value']\n", "\n", " return to_extract\n", "\n", "\n", "def text_value(movie, tag, class_=None):\n", " if movie.find(tag, class_):\n", " return movie.find(tag, class_).text\n", " else:\n", " return\n", "\n", "\n", "def nested_text_value(movie, tag_1, class_1, tag_2, class_2, order=None):\n", " if not order:\n", " return movie.find(tag_1, class_1).find(tag_2, class_2).text\n", " else:\n", " return [val.text for val in movie.find(tag_1, class_1).findAll(tag_2, class_2)[order]]\n", "\n", "\n", "def extract_attribute(soup, tag_1, class_1='', tag_2='', class_2='',\n", " text_attribute=True, order=None, nested=False):\n", " movies = soup.findAll('div', class_='lister-item-content')\n", " data_list = []\n", " for movie in movies:\n", " if text_attribute:\n", " if nested:\n", " data_list.append(nested_text_value(movie, tag_1, class_1, tag_2, class_2, order))\n", " else:\n", " data_list.append(text_value(movie, tag_1, class_1))\n", " else:\n", " data_list.append(numeric_value(movie, tag_1, class_1, order))\n", "\n", " return data_list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Creating a Dataframe with the information" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TitleReleaseAudience RatingRuntimeGenreIMDB RatingVotesBox Office EarningsDirectorActors
0Jai Bhim(2021)TV-MA164 min\\nCrime, Drama\\n\\n9.3\\n166,955166,955T.J. Gnanavel[Suriya, Lijo Mol Jose, Manikandan, Rajisha Vi...
1The Shawshank Redemption(1994)R142 min\\nDrama\\n\\n9.3\\n2,524,6082,524,608Frank Darabont[Tim Robbins, Morgan Freeman, Bob Gunton, Will...
2The Godfather(1972)R175 min\\nCrime, Drama\\n\\n9.2\\n1,738,1051,738,105Francis Ford Coppola[Marlon Brando, Al Pacino, James Caan, Diane K...
3Soorarai Pottru(2020)TV-MA153 min\\nDrama\\n\\n9.1\\n105,969105,969Sudha Kongara[Suriya, Paresh Rawal, Urvashi, Aparna Balamur...
4The Dark Knight(2008)PG-13152 min\\nAction, Crime, Drama\\n\\n9.0\\n2,474,8612,474,861Christopher Nolan[Christian Bale, Heath Ledger, Aaron Eckhart, ...
\n", "
" ], "text/plain": [ " Title Release Audience Rating Runtime \\\n", "0 Jai Bhim (2021) TV-MA 164 min \n", "1 The Shawshank Redemption (1994) R 142 min \n", "2 The Godfather (1972) R 175 min \n", "3 Soorarai Pottru (2020) TV-MA 153 min \n", "4 The Dark Knight (2008) PG-13 152 min \n", "\n", " Genre IMDB Rating Votes \\\n", "0 \\nCrime, Drama \\n\\n9.3\\n 166,955 \n", "1 \\nDrama \\n\\n9.3\\n 2,524,608 \n", "2 \\nCrime, Drama \\n\\n9.2\\n 1,738,105 \n", "3 \\nDrama \\n\\n9.1\\n 105,969 \n", "4 \\nAction, Crime, Drama \\n\\n9.0\\n 2,474,861 \n", "\n", " Box Office Earnings Director \\\n", "0 166,955 T.J. Gnanavel \n", "1 2,524,608 Frank Darabont \n", "2 1,738,105 Francis Ford Coppola \n", "3 105,969 Sudha Kongara \n", "4 2,474,861 Christopher Nolan \n", "\n", " Actors \n", "0 [Suriya, Lijo Mol Jose, Manikandan, Rajisha Vi... \n", "1 [Tim Robbins, Morgan Freeman, Bob Gunton, Will... \n", "2 [Marlon Brando, Al Pacino, James Caan, Diane K... \n", "3 [Suriya, Paresh Rawal, Urvashi, Aparna Balamur... \n", "4 [Christian Bale, Heath Ledger, Aaron Eckhart, ... " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titles = extract_attribute(soup, 'a')\n", "release = extract_attribute(soup, 'span', 'lister-item-year text-muted unbold')\n", "audience_rating = extract_attribute(soup, 'span', 'certificate')\n", "runtime = extract_attribute(soup, 'span', 'runtime')\n", "genre = extract_attribute(soup, 'span', 'genre')\n", "imdb_rating = extract_attribute(soup, 'div', 'inline-block ratings-imdb-rating', False)\n", "votes = extract_attribute(soup, 'span' , {'name' : 'nv'}, False, 0)\n", "earnings = extract_attribute(soup, 'span' , {'name' : 'nv'}, False, 1)\n", "directors = extract_attribute(soup, 'p', '', 'a', '', True, 0, True)\n", "actors = extract_attribute(soup, 'p', '', 'a', '', True, slice(1, 5, None), True)\n", "\n", "\n", "df_dict = {'Title': titles, 'Release': release, 'Audience Rating': audience_rating,\n", " 'Runtime': runtime, 'Genre': genre, 'IMDB Rating': imdb_rating,\n", " 'Votes': votes, 'Box Office Earnings': earnings, 'Director': directors,\n", " 'Actors': actors}\n", "\n", "df = pd.DataFrame(df_dict)\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Manually creating a Dataframe and starting analysis" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "url4 = \"http://www.hubertiming.com/results/2017GPTR10K\"\n", "html = urllib.request.urlopen(url4)\n", "soup = bs(html, 'lxml')" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Race results for the 2017 Intel Great Place to Run \\ Urban Clash Games!\n" ] } ], "source": [ "# Get the title\n", "title = soup.title\n", "print(title)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'\\n\\n\\n\\n\\n\\n\\nRace results for the 2017 Intel Great Place to Run \\\\ Urban Clash Games!\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n 2017 Intel Great Place to Run 10K \\\\ Urban Clash Games\\n Hillsboro Stadium, Hillsboro, OR \\n June 2nd,'" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Print out the text\n", "text = soup.get_text()\n", "text[0:200]" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[timing@hubertiming.com,\n", " Huber Timing Home,\n", " 5K Individual,\n", " 5K Team,\n", " 10K Team,\n", " Summary,\n", " ,\n", " 10K Results,\n", " Huber Timing,\n", " ,\n", " Dark Mode]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "soup.find_all('a')" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mailto:timing@hubertiming.com\n", "https://www.hubertiming.com\n", "/results/2017GPTR\n", "/results/team/2017GPTR\n", "/results/team/2017GPTR10K\n", "/results/summary/2017GPTR10K\n", "None\n", "#tabs-1\n", "https://www.hubertiming.com/\n", "https://facebook.com/hubertiming/\n", "None\n" ] } ], "source": [ "all_links = soup.find_all(\"a\")\n", "for link in all_links:\n", " print(link.get(\"href\"))" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[\n", "10K:\n", ", \n", "Finishers:\n", "577\n", ", \n", "Male:\n", "414\n", ", \n", "Female:\n", "163\n", ", \n", "Place\n", "Bib\n", "Name\n", "Gender\n", "City\n", "Chip Time\n", "Gun Time\n", "Team\n", ", \n", "1\n", "814\n", "\n", "\n", " JARED WILSON\n", "\n", " \n", "M\n", "TIGARD\n", "36:21\n", "36:24\n", "\n", ", \n", "2\n", "573\n", "\n", "\n", " NATHAN A SUSTERSIC\n", "\n", " \n", "M\n", "PORTLAND\n", "36:42\n", "36:45\n", "\n", "\n", " INTEL TEAM F\n", " \n", ", \n", "3\n", "687\n", "\n", "\n", " FRANCISCO MAYA\n", "\n", " \n", "M\n", "PORTLAND\n", "37:44\n", "37:48\n", "\n", "]\n" ] } ], "source": [ "# Print the first 8 rows for sanity check\n", "rows = soup.find_all('tr')\n", "print(rows[:8])" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[577, 443, \n", "\n", " LIBBY B MITCHELL\n", "\n", " , F, HILLSBORO, 1:41:18, 1:42:10, ]\n" ] }, { "data": { "text/plain": [ "bs4.element.ResultSet" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for row in rows:\n", " row_td = row.find_all('td')\n", "print(row_td)\n", "type(row_td)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[577, 443, \n", "\n", " LIBBY B MITCHELL\n", "\n", " , F, HILLSBORO, 1:41:18, 1:42:10, ]\n" ] } ], "source": [ "str_cells = str(row_td)\n", "cleantext = bs(str_cells, \"lxml\").get_text()\n", "print(cleantext)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Now processing the whole page" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[577, 443, \n", "\n", " LIBBY B MITCHELL\n", "\n", " , F, HILLSBORO, 1:41:18, 1:42:10, ]\n" ] }, { "data": { "text/plain": [ "str" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list_rows = []\n", "for row in rows:\n", " cells = row.find_all('td')\n", " str_cells = str(cells)\n", " clean = re.compile('<.*?>')\n", " clean2 = (re.sub(clean, '',str_cells))\n", " list_rows.append(clean2)\n", "print(clean2)\n", "type(clean2)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
raw
0[]
1[Finishers:, 577]
2[Male:, 414]
3[Female:, 163]
4[]
5[1, 814, \\r\\n\\r\\n JARED WIL...
6[2, 573, \\r\\n\\r\\n NATHAN A ...
7[3, 687, \\r\\n\\r\\n FRANCISCO...
8[4, 623, \\r\\n\\r\\n PAUL MORR...
9[5, 569, \\r\\n\\r\\n DEREK G O...
\n", "
" ], "text/plain": [ " raw\n", "0 []\n", "1 [Finishers:, 577]\n", "2 [Male:, 414]\n", "3 [Female:, 163]\n", "4 []\n", "5 [1, 814, \\r\\n\\r\\n JARED WIL...\n", "6 [2, 573, \\r\\n\\r\\n NATHAN A ...\n", "7 [3, 687, \\r\\n\\r\\n FRANCISCO...\n", "8 [4, 623, \\r\\n\\r\\n PAUL MORR...\n", "9 [5, 569, \\r\\n\\r\\n DEREK G O..." ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(list_rows, columns=['raw'])\n", "df.head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Cleaning the fields" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234567
0[]NoneNoneNoneNoneNoneNoneNone
1[Finishers:577]NoneNoneNoneNoneNoneNone
2[Male:414]NoneNoneNoneNoneNoneNone
3[Female:163]NoneNoneNoneNoneNoneNone
4[]NoneNoneNoneNoneNoneNoneNone
5[1814\\r\\n\\r\\n JARED WILSON\\r\\n\\...MTIGARD36:2136:24]
6[2573\\r\\n\\r\\n NATHAN A SUSTERSI...MPORTLAND36:4236:45\\n\\r\\n INTEL TEAM ...
7[3687\\r\\n\\r\\n FRANCISCO MAYA\\r\\...MPORTLAND37:4437:48]
8[4623\\r\\n\\r\\n PAUL MORROW\\r\\n\\r...MBEAVERTON38:3438:37]
9[5569\\r\\n\\r\\n DEREK G OSBORNE\\r...MHILLSBORO39:2139:24\\n\\r\\n INTEL TEAM ...
\n", "
" ], "text/plain": [ " 0 1 2 \\\n", "0 [] None None \n", "1 [Finishers: 577] None \n", "2 [Male: 414] None \n", "3 [Female: 163] None \n", "4 [] None None \n", "5 [1 814 \\r\\n\\r\\n JARED WILSON\\r\\n\\... \n", "6 [2 573 \\r\\n\\r\\n NATHAN A SUSTERSI... \n", "7 [3 687 \\r\\n\\r\\n FRANCISCO MAYA\\r\\... \n", "8 [4 623 \\r\\n\\r\\n PAUL MORROW\\r\\n\\r... \n", "9 [5 569 \\r\\n\\r\\n DEREK G OSBORNE\\r... \n", "\n", " 3 4 5 6 \\\n", "0 None None None None \n", "1 None None None None \n", "2 None None None None \n", "3 None None None None \n", "4 None None None None \n", "5 M TIGARD 36:21 36:24 \n", "6 M PORTLAND 36:42 36:45 \n", "7 M PORTLAND 37:44 37:48 \n", "8 M BEAVERTON 38:34 38:37 \n", "9 M HILLSBORO 39:21 39:24 \n", "\n", " 7 \n", "0 None \n", "1 None \n", "2 None \n", "3 None \n", "4 None \n", "5 ] \n", "6 \\n\\r\\n INTEL TEAM ... \n", "7 ] \n", "8 ] \n", "9 \\n\\r\\n INTEL TEAM ... " ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 = df['raw'].str.split(',', expand=True)\n", "df1.head(10)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234567
0]NoneNoneNoneNoneNoneNoneNone
1Finishers:577]NoneNoneNoneNoneNoneNone
2Male:414]NoneNoneNoneNoneNoneNone
3Female:163]NoneNoneNoneNoneNoneNone
4]NoneNoneNoneNoneNoneNoneNone
51814\\r\\n\\r\\n JARED WILSON\\r\\n\\...MTIGARD36:2136:24]
62573\\r\\n\\r\\n NATHAN A SUSTERSI...MPORTLAND36:4236:45\\n\\r\\n INTEL TEAM ...
73687\\r\\n\\r\\n FRANCISCO MAYA\\r\\...MPORTLAND37:4437:48]
84623\\r\\n\\r\\n PAUL MORROW\\r\\n\\r...MBEAVERTON38:3438:37]
95569\\r\\n\\r\\n DEREK G OSBORNE\\r...MHILLSBORO39:2139:24\\n\\r\\n INTEL TEAM ...
\n", "
" ], "text/plain": [ " 0 1 2 3 \\\n", "0 ] None None None \n", "1 Finishers: 577] None None \n", "2 Male: 414] None None \n", "3 Female: 163] None None \n", "4 ] None None None \n", "5 1 814 \\r\\n\\r\\n JARED WILSON\\r\\n\\... M \n", "6 2 573 \\r\\n\\r\\n NATHAN A SUSTERSI... M \n", "7 3 687 \\r\\n\\r\\n FRANCISCO MAYA\\r\\... M \n", "8 4 623 \\r\\n\\r\\n PAUL MORROW\\r\\n\\r... M \n", "9 5 569 \\r\\n\\r\\n DEREK G OSBORNE\\r... M \n", "\n", " 4 5 6 \\\n", "0 None None None \n", "1 None None None \n", "2 None None None \n", "3 None None None \n", "4 None None None \n", "5 TIGARD 36:21 36:24 \n", "6 PORTLAND 36:42 36:45 \n", "7 PORTLAND 37:44 37:48 \n", "8 BEAVERTON 38:34 38:37 \n", "9 HILLSBORO 39:21 39:24 \n", "\n", " 7 \n", "0 None \n", "1 None \n", "2 None \n", "3 None \n", "4 None \n", "5 ] \n", "6 \\n\\r\\n INTEL TEAM ... \n", "7 ] \n", "8 ] \n", "9 \\n\\r\\n INTEL TEAM ... " ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1[0] = df1[0].str.strip('[')\n", "df1.head(10)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "col_labels = soup.find_all('th')" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['[Place, Bib, Name, Gender, City, Chip Time, Gun Time, Team]']\n" ] } ], "source": [ "all_header = []\n", "col_str = str(col_labels)\n", "cleantext2 = bs(col_str, \"lxml\").get_text()\n", "all_header.append(cleantext2)\n", "print(all_header)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
0[Place, Bib, Name, Gender, City, Chip Time, Gu...
\n", "
" ], "text/plain": [ " 0\n", "0 [Place, Bib, Name, Gender, City, Chip Time, Gu..." ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = pd.DataFrame(all_header)\n", "df2.head()" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234567
0[PlaceBibNameGenderCityChip TimeGun TimeTeam]
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7\n", "0 [Place Bib Name Gender City Chip Time Gun Time Team]" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3 = df2[0].str.split(',', expand=True)\n", "df3.head()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234567
0[PlaceBibNameGenderCityChip TimeGun TimeTeam]
0]NoneNoneNoneNoneNoneNoneNone
1Finishers:577]NoneNoneNoneNoneNoneNone
2Male:414]NoneNoneNoneNoneNoneNone
3Female:163]NoneNoneNoneNoneNoneNone
4]NoneNoneNoneNoneNoneNoneNone
51814\\r\\n\\r\\n JARED WILSON\\r\\n\\...MTIGARD36:2136:24]
62573\\r\\n\\r\\n NATHAN A SUSTERSI...MPORTLAND36:4236:45\\n\\r\\n INTEL TEAM ...
73687\\r\\n\\r\\n FRANCISCO MAYA\\r\\...MPORTLAND37:4437:48]
84623\\r\\n\\r\\n PAUL MORROW\\r\\n\\r...MBEAVERTON38:3438:37]
\n", "
" ], "text/plain": [ " 0 1 2 \\\n", "0 [Place Bib Name \n", "0 ] None None \n", "1 Finishers: 577] None \n", "2 Male: 414] None \n", "3 Female: 163] None \n", "4 ] None None \n", "5 1 814 \\r\\n\\r\\n JARED WILSON\\r\\n\\... \n", "6 2 573 \\r\\n\\r\\n NATHAN A SUSTERSI... \n", "7 3 687 \\r\\n\\r\\n FRANCISCO MAYA\\r\\... \n", "8 4 623 \\r\\n\\r\\n PAUL MORROW\\r\\n\\r... \n", "\n", " 3 4 5 6 \\\n", "0 Gender City Chip Time Gun Time \n", "0 None None None None \n", "1 None None None None \n", "2 None None None None \n", "3 None None None None \n", "4 None None None None \n", "5 M TIGARD 36:21 36:24 \n", "6 M PORTLAND 36:42 36:45 \n", "7 M PORTLAND 37:44 37:48 \n", "8 M BEAVERTON 38:34 38:37 \n", "\n", " 7 \n", "0 Team] \n", "0 None \n", "1 None \n", "2 None \n", "3 None \n", "4 None \n", "5 ] \n", "6 \\n\\r\\n INTEL TEAM ... \n", "7 ] \n", "8 ] " ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frames = [df3, df1]\n", "\n", "df4 = pd.concat(frames)\n", "df4.head(10)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
[PlaceBibNameGenderCityChip TimeGun TimeTeam]
0[PlaceBibNameGenderCityChip TimeGun TimeTeam]
0]NoneNoneNoneNoneNoneNoneNone
1Finishers:577]NoneNoneNoneNoneNoneNone
2Male:414]NoneNoneNoneNoneNoneNone
3Female:163]NoneNoneNoneNoneNoneNone
\n", "
" ], "text/plain": [ " [Place Bib Name Gender City Chip Time Gun Time Team]\n", "0 [Place Bib Name Gender City Chip Time Gun Time Team]\n", "0 ] None None None None None None None\n", "1 Finishers: 577] None None None None None None\n", "2 Male: 414] None None None None None None\n", "3 Female: 163] None None None None None None" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5 = df4.rename(columns=df4.iloc[0])\n", "df5.head()" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 583 entries, 0 to 581\n", "Data columns (total 8 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 [Place 583 non-null object\n", " 1 Bib 581 non-null object\n", " 2 Name 578 non-null object\n", " 3 Gender 578 non-null object\n", " 4 City 578 non-null object\n", " 5 Chip Time 578 non-null object\n", " 6 Gun Time 578 non-null object\n", " 7 Team] 578 non-null object\n", "dtypes: object(8)\n", "memory usage: 41.0+ KB\n" ] }, { "data": { "text/plain": [ "(583, 8)" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5.info()\n", "df5.shape" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "df6 = df5.dropna(axis=0, how='any')" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
[PlaceBibNameGenderCityChip TimeGun TimeTeam]
51814\\r\\n\\r\\n JARED WILSON\\r\\n\\...MTIGARD36:2136:24]
62573\\r\\n\\r\\n NATHAN A SUSTERSI...MPORTLAND36:4236:45\\n\\r\\n INTEL TEAM ...
73687\\r\\n\\r\\n FRANCISCO MAYA\\r\\...MPORTLAND37:4437:48]
84623\\r\\n\\r\\n PAUL MORROW\\r\\n\\r...MBEAVERTON38:3438:37]
95569\\r\\n\\r\\n DEREK G OSBORNE\\r...MHILLSBORO39:2139:24\\n\\r\\n INTEL TEAM ...
\n", "
" ], "text/plain": [ " [Place Bib Name Gender \\\n", "5 1 814 \\r\\n\\r\\n JARED WILSON\\r\\n\\... M \n", "6 2 573 \\r\\n\\r\\n NATHAN A SUSTERSI... M \n", "7 3 687 \\r\\n\\r\\n FRANCISCO MAYA\\r\\... M \n", "8 4 623 \\r\\n\\r\\n PAUL MORROW\\r\\n\\r... M \n", "9 5 569 \\r\\n\\r\\n DEREK G OSBORNE\\r... M \n", "\n", " City Chip Time Gun Time \\\n", "5 TIGARD 36:21 36:24 \n", "6 PORTLAND 36:42 36:45 \n", "7 PORTLAND 37:44 37:48 \n", "8 BEAVERTON 38:34 38:37 \n", "9 HILLSBORO 39:21 39:24 \n", "\n", " Team] \n", "5 ] \n", "6 \\n\\r\\n INTEL TEAM ... \n", "7 ] \n", "8 ] \n", "9 \\n\\r\\n INTEL TEAM ... " ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df7 = df6.drop(df6.index[0])\n", "df7.head()" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PlaceBibNameGenderCityChip TimeGun TimeTeam
51814\\r\\n\\r\\n JARED WILSON\\r\\n\\...MTIGARD36:2136:24]
62573\\r\\n\\r\\n NATHAN A SUSTERSI...MPORTLAND36:4236:45\\n\\r\\n INTEL TEAM ...
73687\\r\\n\\r\\n FRANCISCO MAYA\\r\\...MPORTLAND37:4437:48]
84623\\r\\n\\r\\n PAUL MORROW\\r\\n\\r...MBEAVERTON38:3438:37]
95569\\r\\n\\r\\n DEREK G OSBORNE\\r...MHILLSBORO39:2139:24\\n\\r\\n INTEL TEAM ...
\n", "
" ], "text/plain": [ " Place Bib Name Gender \\\n", "5 1 814 \\r\\n\\r\\n JARED WILSON\\r\\n\\... M \n", "6 2 573 \\r\\n\\r\\n NATHAN A SUSTERSI... M \n", "7 3 687 \\r\\n\\r\\n FRANCISCO MAYA\\r\\... M \n", "8 4 623 \\r\\n\\r\\n PAUL MORROW\\r\\n\\r... M \n", "9 5 569 \\r\\n\\r\\n DEREK G OSBORNE\\r... M \n", "\n", " City Chip Time Gun Time \\\n", "5 TIGARD 36:21 36:24 \n", "6 PORTLAND 36:42 36:45 \n", "7 PORTLAND 37:44 37:48 \n", "8 BEAVERTON 38:34 38:37 \n", "9 HILLSBORO 39:21 39:24 \n", "\n", " Team \n", "5 ] \n", "6 \\n\\r\\n INTEL TEAM ... \n", "7 ] \n", "8 ] \n", "9 \\n\\r\\n INTEL TEAM ... " ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df7.rename(columns={'[Place': 'Place'},inplace=True)\n", "df7.rename(columns={' Team]': 'Team'},inplace=True)\n", "df7.head()" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PlaceBibNameGenderCityChip TimeGun TimeTeam
51814\\r\\n\\r\\n JARED WILSON\\r\\n\\...MTIGARD36:2136:24
62573\\r\\n\\r\\n NATHAN A SUSTERSI...MPORTLAND36:4236:45\\n\\r\\n INTEL TEAM ...
73687\\r\\n\\r\\n FRANCISCO MAYA\\r\\...MPORTLAND37:4437:48
84623\\r\\n\\r\\n PAUL MORROW\\r\\n\\r...MBEAVERTON38:3438:37
95569\\r\\n\\r\\n DEREK G OSBORNE\\r...MHILLSBORO39:2139:24\\n\\r\\n INTEL TEAM ...
\n", "
" ], "text/plain": [ " Place Bib Name Gender \\\n", "5 1 814 \\r\\n\\r\\n JARED WILSON\\r\\n\\... M \n", "6 2 573 \\r\\n\\r\\n NATHAN A SUSTERSI... M \n", "7 3 687 \\r\\n\\r\\n FRANCISCO MAYA\\r\\... M \n", "8 4 623 \\r\\n\\r\\n PAUL MORROW\\r\\n\\r... M \n", "9 5 569 \\r\\n\\r\\n DEREK G OSBORNE\\r... M \n", "\n", " City Chip Time Gun Time \\\n", "5 TIGARD 36:21 36:24 \n", "6 PORTLAND 36:42 36:45 \n", "7 PORTLAND 37:44 37:48 \n", "8 BEAVERTON 38:34 38:37 \n", "9 HILLSBORO 39:21 39:24 \n", "\n", " Team \n", "5 \n", "6 \\n\\r\\n INTEL TEAM ... \n", "7 \n", "8 \n", "9 \\n\\r\\n INTEL TEAM ... " ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df7['Team'] = df7['Team'].str.strip(']')\n", "df7.head()" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "time_list = df7[' Chip Time'].tolist()\n", "\n", "# You can use a for loop to convert 'Chip Time' to minutes\n", "\n", "time_mins = []\n", "for i in time_list:\n", " try:\n", " h, m, s = i.split(':')\n", " math = (int(h) * 3600 + int(m) * 60 + int(s))/60\n", " except:\n", " h, m = i.split(':')\n", " math = (int(h) * 3600 + int(m) * 60) # + int(s))/60\n", " time_mins.append(math)\n", "#print(time_mins)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PlaceBibNameGenderCityChip TimeGun TimeTeamRunner_mins
51814\\r\\n\\r\\n JARED WILSON\\r\\n\\...MTIGARD36:2136:24130860.0
62573\\r\\n\\r\\n NATHAN A SUSTERSI...MPORTLAND36:4236:45\\n\\r\\n INTEL TEAM ...132120.0
73687\\r\\n\\r\\n FRANCISCO MAYA\\r\\...MPORTLAND37:4437:48135840.0
84623\\r\\n\\r\\n PAUL MORROW\\r\\n\\r...MBEAVERTON38:3438:37138840.0
95569\\r\\n\\r\\n DEREK G OSBORNE\\r...MHILLSBORO39:2139:24\\n\\r\\n INTEL TEAM ...141660.0
\n", "
" ], "text/plain": [ " Place Bib Name Gender \\\n", "5 1 814 \\r\\n\\r\\n JARED WILSON\\r\\n\\... M \n", "6 2 573 \\r\\n\\r\\n NATHAN A SUSTERSI... M \n", "7 3 687 \\r\\n\\r\\n FRANCISCO MAYA\\r\\... M \n", "8 4 623 \\r\\n\\r\\n PAUL MORROW\\r\\n\\r... M \n", "9 5 569 \\r\\n\\r\\n DEREK G OSBORNE\\r... M \n", "\n", " City Chip Time Gun Time \\\n", "5 TIGARD 36:21 36:24 \n", "6 PORTLAND 36:42 36:45 \n", "7 PORTLAND 37:44 37:48 \n", "8 BEAVERTON 38:34 38:37 \n", "9 HILLSBORO 39:21 39:24 \n", "\n", " Team Runner_mins \n", "5 130860.0 \n", "6 \\n\\r\\n INTEL TEAM ... 132120.0 \n", "7 135840.0 \n", "8 138840.0 \n", "9 \\n\\r\\n INTEL TEAM ... 141660.0 " ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df7['Runner_mins'] = time_mins\n", "df7.head()" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Runner_mins
count577.000000
mean97617.157972
std92975.700498
min60.016667
25%68.300000
50%152580.000000
75%186540.000000
max215700.000000
\n", "
" ], "text/plain": [ " Runner_mins\n", "count 577.000000\n", "mean 97617.157972\n", "std 92975.700498\n", "min 60.016667\n", "25% 68.300000\n", "50% 152580.000000\n", "75% 186540.000000\n", "max 215700.000000" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df7.describe(include=[np.number])" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "from pylab import rcParams\n", "rcParams['figure.figsize'] = 15, 5" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "([], [Text(1, 0, 'Runners')])" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAA44AAAEvCAYAAAAKKJ/2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAAWG0lEQVR4nO3df7BmdX0f8PdHVxsGf6GYWwIMazt0MhSV6FaZmOlcfyGaTNDGqrQTtsayJmLTTDJTadoZrMaO7WgaaTu0pDKCUyUm0UoEQUq8cZoJkUUREE3ZURiWQTGuAVcp/uinf+xZebK9+927C88+d/e+XjPP3PN8zvec7+fMztw77z3n+T7V3QEAAID9edyiGwAAAGB9ExwBAAAYEhwBAAAYEhwBAAAYEhwBAAAYEhwBAAAY2rToBtaL448/vjdv3rzoNgDYoL7zne/k2GOPXXQbAGxgN99881929zNX2yc4TjZv3pzt27cvug0ANqiVlZUsLy8vug0ANrCqunt/+zyqCgAAwJDgCAAAwJDgCAAAwJDgCAAAwJDgCAAAwJDgCAAAwJDgCAAAwJDgCAAAwJDgCAAAwJDgCAAAwNCmRTcAAOtFVS26hYXq7kW3AMA65Y4jAEy6e2GvU972iYXOLzQCMCI4AgAAMCQ4AgAAMCQ4AgAAMCQ4AgAAMCQ4AgAAMCQ4AgAAMCQ4AgAAMCQ4AgAAMCQ4AgAAMCQ4AgAAMCQ4AgAAMCQ4AgAAMCQ4AgAAMDS34FhVJ1fVp6vqjqr6YlX986n+9Kq6vqrunH4eN9Wrqi6uqh1VdWtVPW/mXFun8XdW1daZ+vOr6rbpmIurqkZzAAAAcPDmecfxB0l+o7tPS3Jmkguq6rQkFya5obtPTXLD9D5JXpnk1Om1LcklyZ4QmOSiJC9M8oIkF80EwUuSnD9z3NlTfX9zAAAAcJDmFhy7+77u/ty0/e0kX0pyYpJzklw+Dbs8yaun7XOSXNF73JjkaVV1QpJXJLm+u3d197eSXJ/k7GnfU7r7xu7uJFfsc67V5gAAAOAgbTock1TV5iQ/leTPkyx1933Trq8lWZq2T0xyz8xhO6faqL5zlXoGc+zb17bsubuZpaWlrKysHOSVAcBjx98hANaruQfHqnpSkj9M8mvd/eD0McQkSXd3VfU85x/N0d2XJrk0SbZs2dLLy8vzbAUA9u/aq+PvEADr1VxXVa2qJ2RPaPzv3f3Rqfz16THTTD/vn+r3Jjl55vCTptqoftIq9dEcAAAAHKR5rqpaSd6f5Evd/dszu65Ksndl1K1JPj5TP29aXfXMJA9Mj5tel+SsqjpuWhTnrCTXTfserKozp7nO2+dcq80BAADAQZrno6ovSvKLSW6rqlum2m8meXeSj1TVm5LcneR1075rkrwqyY4k303yxiTp7l1V9c4kN03j3tHdu6bttyT5QJJjknxyemUwBwAAAAdpbsGxu/9XktrP7peuMr6TXLCfc12W5LJV6tuTnL5K/ZurzQEAAMDBm+tnHAEAADjyCY4AAAAMCY4AAAAMCY4AAAAMCY4AAAAMCY4AAAAMCY4AAAAMCY4AAAAMCY4AAAAMCY4AAAAMbVp0AwAw67n/5lN54KHvL7qNhdh84dWLbmEhnnrME/KFi85adBsADAiOAKwrDzz0/dz17p9ddBuH3crKSpaXlxfdxkJs1MAMcCTxqCoAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDgiMAAABDcwuOVXVZVd1fVbfP1N5eVfdW1S3T61Uz+/5lVe2oqr+oqlfM1M+eajuq6sKZ+rOq6s+n+u9V1ROn+t+Y3u+Y9m+e1zUCAABsBNXd8zlx1d9PsjvJFd19+lR7e5Ld3f2efcaeluTDSV6Q5CeS/M8kf2fa/b+TvDzJziQ3JTm3u++oqo8k+Wh3X1lV/yXJF7r7kqp6S5LndPcvV9Ubkrymu19/oH63bNnS27dvf/QXDsCj8uzLn73oFliA27betugWADa8qrq5u7estm/TvCbt7s8cxN2+c5Jc2d0PJ/lqVe3InhCZJDu6+ytJUlVXJjmnqr6U5CVJ/tE05vIkb09yyXSut0/1P0jyn6qqel4JGYDH1Le/9O7c9e6fXXQbh93KykqWl5cX3cZCbL7w6kW3AMABLOIzjm+tqlunR1mPm2onJrlnZszOqba/+jOS/FV3/2Cf+l8717T/gWk8AAAAh2Budxz345Ik70zS08/3Jvmlw9zDj1TVtiTbkmRpaSkrKyuLagWAGRvx9/Hu3bs35HXvtZGvHeBIcFiDY3d/fe92Vf1ukk9Mb+9NcvLM0JOmWvZT/2aSp1XVpumu4uz4vefaWVWbkjx1Gr9aP5cmuTTZ8xnHjfqIEMC6cu3VG/KRzY38qOpG/TcHOJIc1kdVq+qEmbevSbJ3xdWrkrxhWhH1WUlOTfLZ7FkM59RpBdUnJnlDkqumzyt+Oslrp+O3Jvn4zLm2TtuvTfLHPt8IAABw6OZ2x7GqPpxkOcnxVbUzyUVJlqvqjOx5VPWuJG9Oku7+4rRK6h1JfpDkgu7+4XSetya5Lsnjk1zW3V+cpnhbkiur6reSfD7J+6f6+5N8cFpgZ1f2hE0AAAAO0TxXVT13lfL7V6ntHf+uJO9apX5NkmtWqX8lj6y8Olv/P0n+4UE1CwAAwH4tYlVVAAAAjiCCIwAAAEOCIwAAAEOCIwAAAEOCIwAAAEOCIwAAAEOCIwAAAEOCIwAAAEOCIwAAAEOCIwAAAEOCIwAAAEOCIwAAAEOCIwAAAEOCIwAAAEOCIwAAAEOCIwAAAEOCIwAAAEOCIwAAAEOCIwAAAENrCo5VdUpVvWzaPqaqnjzftgAAAFgvDhgcq+r8JH+Q5L9OpZOS/I859gQAAMA6spY7jhckeVGSB5Oku+9M8uPzbAoAAID1Yy3B8eHu/t7eN1W1KUnPryUAAADWk7UExz+pqt9MckxVvTzJ7yf5o/m2BQAAwHqxluB4YZJvJLktyZuTXJPkX8+zKQAAANaPTQca0N3/N8nvTi8AAAA2mLWsqvpzVfX5qtpVVQ9W1ber6sHD0RwAAACLd8A7jkl+J8k/SHJbd1sUBwAAYINZy2cc70lyu9AIAACwMa3ljuO/SHJNVf1Jkof3Frv7t+fWFQAAAOvGWoLju5LsTvJjSZ4433YAAABYb9YSHH+iu0+feycAAACsS2v5jOM1VXXW3DsBAABgXVpLcPyVJNdW1UO+jgMAAGDjOeCjqt395MPRCAAAAOvTfoNjVf1kd3+5qp632v7u/tz82gIAAGC9GN1x/PUk25K8d5V9neQlc+kIAACAdWUUHG9Nku5+8WHqBQAAgHVotDjOLx22LgAAAFi31rKqKgAAABvY6FHV5+znazcqSXf3U+bUEwAAAOvIKDje1t0/ddg6AQAAYF3yqCoAAABDo+D4+4etCwAAANat/QbH7v63h7MRAAAA1iePqgIAADAkOAIAADB0wOBYVc+oqv9YVZ+rqpur6n1V9YzD0RwAAACLt5Y7jlcmuT/JLyR5bZJvJPm9Ax1UVZdV1f1VdftM7elVdX1V3Tn9PG6qV1VdXFU7qurWqnrezDFbp/F3VtXWmfrzq+q26ZiLq6pGcwAAAHBo1hIcT+jud3b3V6fXbyVZWsNxH0hy9j61C5Pc0N2nJrlhep8kr0xy6vTaluSSZE8ITHJRkhcmeUGSi2aC4CVJzp857uwDzAEAAMAhWEtw/FRVvaGqHje9XpfkugMd1N2fSbJrn/I5SS6fti9P8uqZ+hW9x41JnlZVJyR5RZLru3tXd38ryfVJzp72PaW7b+zuTnLFPudabQ4AAAAOwVqC4/lJPpTk4STfy55HV99cVd+uqgcPcr6l7r5v2v5aHrlzeWKSe2bG7Zxqo/rOVeqjOQAAADgEmw40oLufPI+Ju7urqudx7rXOUVXbsufR2CwtLWVlZWWe7QCwRhvx9/Hu3bs35HXvtZGvHeBIsN/gWFU/2d1fnl2oZlZ3f+4Q5vt6VZ3Q3fdNj5veP9XvTXLyzLiTptq9SZb3qa9M9ZNWGT+aY7VruDTJpUmyZcuWXl5e3t9QAA6Xa6/ORvx9vLKysiGvO8mG/TcHOJKMHlX99enne1d5vecQ57sqyd6VUbcm+fhM/bxpddUzkzwwPW56XZKzquq4aVGcs5JcN+17sKrOnFZTPW+fc602BwAAAIdgv3ccu3vb9PPFh3Liqvpw9twtPL6qdmbP6qjvTvKRqnpTkruTvG4afk2SVyXZkeS7Sd44zb2rqt6Z5KZp3Du6e++CO2/JnpVbj0nyyemVwRwAAAAcggN+xjFJquqnk2yeHd/dV4yO6e5z97PrpauM7SQX7Oc8lyW5bJX69iSnr1L/5mpzAAAAcGgOGByr6oNJ/naSW5L8cCrv/QoMAAAAjnJrueO4Jclp011BAAAANpi1fI/j7Un+5rwbAQAAYH0afR3HH2XPI6lPTnJHVX02ycN793f3z8+/PQAAABZt9KjqoX7lBgAAAEeRUXC8N8lSd//pbLGqfibJfXPtCgAAgHVj9BnH30ny4Cr1B6Z9AAAAbACj4LjU3bftW5xqm+fWEQAAAOvKKDg+bbDvmMe4DwAAANapUXDcXlXn71usqn+a5Ob5tQQAAMB6Mloc59eSfKyq/nEeCYpbkjwxyWvm3BcAAADrxH6DY3d/PclPV9WLk5w+la/u7j8+LJ0BAACwLozuOCZJuvvTST59GHoBAABgHRp9xhEAAAAERwAAAMYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYERwAAAIYWEhyr6q6quq2qbqmq7VPt6VV1fVXdOf08bqpXVV1cVTuq6taqet7MebZO4++sqq0z9edP598xHVuH/yoBAACODou84/ji7j6ju7dM7y9MckN3n5rkhul9krwyyanTa1uSS5I9QTPJRUlemOQFSS7aGzanMefPHHf2/C8HAADg6LSeHlU9J8nl0/blSV49U7+i97gxydOq6oQkr0hyfXfv6u5vJbk+ydnTvqd0943d3UmumDkXAAAAB2lRwbGTfKqqbq6qbVNtqbvvm7a/lmRp2j4xyT0zx+6caqP6zlXqAAAAHIJNC5r3Z7r73qr68STXV9WXZ3d2d1dVz7uJKbRuS5KlpaWsrKzMe0oA1mAj/j7evXv3hrzuvTbytQMcCRYSHLv73unn/VX1sez5jOLXq+qE7r5vetz0/mn4vUlOnjn8pKl2b5LlfeorU/2kVcav1selSS5Nki1btvTy8vJqwwA4nK69Ohvx9/HKysqGvO4kG/bfHOBIctgfVa2qY6vqyXu3k5yV5PYkVyXZuzLq1iQfn7avSnLetLrqmUkemB5pvS7JWVV13LQozllJrpv2PVhVZ06rqZ43cy4AAAAO0iLuOC4l+dj0DRmbknyou6+tqpuSfKSq3pTk7iSvm8Zfk+RVSXYk+W6SNyZJd++qqncmuWka947u3jVtvyXJB5Ick+ST0wsAAIBDcNiDY3d/JclzV6l/M8lLV6l3kgv2c67Lkly2Sn17ktMfdbMAAACsq6/jAAAAYB0SHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABgSHAEAABg6aoNjVZ1dVX9RVTuq6sJF9wMAAHCkOiqDY1U9Psl/TvLKJKclObeqTltsVwAAAEemozI4JnlBkh3d/ZXu/l6SK5Ocs+CeAAAAjkibFt3AnJyY5J6Z9zuTvHBBvQBwkDZfePVC5r373/3cQuZdL0552ycWMu9Tj3nCQuYFYO2O1uC4JlW1Lcm2JFlaWsrKyspiGwIgHzj72MVNfvanFzb17t2786QnPWlh8y+av8EA69vRGhzvTXLyzPuTptpf092XJrk0SbZs2dLLy8uHpTkA2NfKykr8HQJgvTpaP+N4U5JTq+pZVfXEJG9IctWCewIAADgiHZV3HLv7B1X11iTXJXl8ksu6+4sLbgsAAOCIdFQGxyTp7muSXLPoPgAAAI50R+ujqgAAADxGBEcAAACGBEcAAACGBEcAAACGBEcAAACGBEcAAACGBEcAAACGqrsX3cO6UFXfSHL3ovsAYMM6PslfLroJADa0U7r7mavtEBwBYB2oqu3dvWXRfQDAajyqCgAAwJDgCAAAwJDgCADrw6WLbgAA9sdnHAEAABhyxxEAAIChTYtuAACOVFX1wyS3Zc/f068m+cXu/quFNgUAc+COIwAcuoe6+4zuPj3JriQXLKKJqvIfwQDMleAIAI+NP0tyYpJU1UpVbZm2j6+qu6btf1JVH62qa6vqzqr693sPrqrdVfWuqvpCVd1YVUtT/ZlV9YdVddP0etFUf3tVfbCq/jTJB6vq71bVZ6vqlqq6tapOPczXD8BRTHAEgEepqh6f5KVJrlrD8DOSvD7Js5O8vqpOnurHJrmxu5+b5DNJzp/q70vyH7r77yX5hST/beZcpyV5WXefm+SXk7yvu89IsiXJzkdzTQAwy6MtAHDojqmqW7LnTuOXkly/hmNu6O4HkqSq7khySpJ7knwvySemMTcnefm0/bIkp1XV3uOfUlVPmrav6u6Hpu0/S/KvquqkJB/t7jsP+aoAYB/uOALAoXtousN3SpLKI59x/EEe+Rv7Y/sc8/DM9g/zyH/ifr8f+Y6s2frjkpw5fZbyjO4+sbt3T/u+s/dE3f2hJD+f5KEk11TVSx7VlQHADMERAB6l7v5ukl9N8hvTQjV3JXn+tPu1j/L0n0ryz/a+qaozVhtUVX8ryVe6++IkH0/ynEc5LwD8iOAIAI+B7v58kluTnJvkPUl+pao+n+T4R3nqX02yZVrw5o7s+Szjal6X5Pbp0dnTk1zxKOcFgB+pR56KAQAAgP+fO44AAAAMCY4AAAAMCY4AAAAMCY4AAAAMCY4AAAAMCY4AAAAMCY4AAAAMCY4AAAAM/T+P/9CNe6qt4AAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "df7.boxplot(column='Runner_mins')\n", "plt.grid(True, axis='y')\n", "plt.ylabel('Chip Time')\n", "plt.xticks([1], ['Runners'])" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "x = df7['Runner_mins']\n", "ax = sns.distplot(x, hist=True, kde=True, rug=False, color='m', bins=25, hist_kws={'edgecolor':'black'})\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "f_fuko = df7.loc[df7[' Gender']==' F']['Runner_mins']\n", "m_fuko = df7.loc[df7[' Gender']==' M']['Runner_mins']\n", "sns.distplot(f_fuko, hist=True, kde=True, rug=False, hist_kws={'edgecolor':'black'}, label='Female')\n", "sns.distplot(m_fuko, hist=False, kde=True, rug=False, hist_kws={'edgecolor':'black'}, label='Male')\n", "plt.legend()" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Runner_mins \\\n", " count mean std min 25% \n", " Gender \n", " F 163.0 61844.168609 91100.389419 60.200000 66.550000 \n", " M 414.0 111701.692432 89982.758495 60.016667 69.320833 \n", "\n", " \n", " 50% 75% max \n", " Gender \n", " F 75.216667 173280.0 215700.0 \n", " M 163320.000000 188745.0 215580.0 \n" ] } ], "source": [ "g_stats = df7.groupby(\" Gender\", as_index=True).describe()\n", "print(g_stats)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0.5, 0.98, '')" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "df7.boxplot(column='Runner_mins', by=' Gender')\n", "plt.ylabel('Chip Time')\n", "plt.suptitle(\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Now it is your turn:" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "url5 = 'https://www.4icu.org/at/vienna/'\n", "url6 = 'https://www.opec.org/opec_web/en/press_room/307.htm'" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" } }, "nbformat": 4, "nbformat_minor": 4 }