{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction to Python \n", "\n", "## Data Persistence with Python" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "+ #### _file_\n", "+ #### _pickle_\n", "+ #### _dill_\n", "+ #### _json_" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pickle\n", "import dill " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## [_file/open_](https://docs.python.org/3/tutorial/inputoutput.html#reading-and-writing-files)\n", "\n", "### open() returns a file object, and is most commonly used with two arguments: open(filename, mode). \n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
| Mode | Description |
|------|-------------|
| 'r' | Open a file for reading. (default) |
| 'w' | Open a file for writing. Creates a new file if it does not exist or truncates the file if it exists. |
| 'x' | Open a file for exclusive creation. If the file already exists, the operation fails. |
| 'a' | Open for appending at the end of the file without truncating it. Creates a new file if it does not exist. |
| 't' | Open in text mode. (default) |
| 'b' | Open in binary mode. |
| '+' | Open a file for updating (reading and writing) |
" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [], "source": [ "text = 'My string'\n", "f = open('my_file.txt', mode='w', encoding='utf-8')" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_io.TextIOWrapper name='my_file.txt' mode='w' encoding='utf-8'>\n" ] } ], "source": [ "print(f)" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['_CHUNK_SIZE',\n", " '__class__',\n", " '__del__',\n", " '__delattr__',\n", " '__dict__',\n", " '__dir__',\n", " '__doc__',\n", " '__enter__',\n", " '__eq__',\n", " '__exit__',\n", " '__format__',\n", " '__ge__',\n", " '__getattribute__',\n", " '__gt__',\n", " '__hash__',\n", " '__init__',\n", " '__init_subclass__',\n", " '__iter__',\n", " '__le__',\n", " '__lt__',\n", " '__ne__',\n", " '__new__',\n", " '__next__',\n", " '__reduce__',\n", " '__reduce_ex__',\n", " '__repr__',\n", " '__setattr__',\n", " '__sizeof__',\n", " '__str__',\n", " '__subclasshook__',\n", " '_checkClosed',\n", " '_checkReadable',\n", " '_checkSeekable',\n", " '_checkWritable',\n", " '_finalizing',\n", " 'buffer',\n", " 'close',\n", " 'closed',\n", " 'detach',\n", " 'encoding',\n", " 'errors',\n", " 'fileno',\n", " 'flush',\n", " 'isatty',\n", " 'line_buffering',\n", " 'mode',\n", " 'name',\n", " 'newlines',\n", " 'read',\n", " 'readable',\n", " 'readline',\n", " 'readlines',\n", " 'reconfigure',\n", " 'seek',\n", " 'seekable',\n", " 'tell',\n", " 'truncate',\n", " 'writable',\n", " 'write',\n", " 'write_through',\n", " 'writelines']" ] }, "execution_count": 120, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dir(f)" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "True\n", "False\n" ] } ], "source": [ "print(f.writable())\n", "print(f.closed)" ] }, { "cell_type": "code", 
"execution_count": 122, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6" ] }, "execution_count": 122, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f.write('Hello\\n')" ] }, { "cell_type": "code", "execution_count": 123, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "26" ] }, "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f.write('How are you?\\n')\n", "f.write('one more\\n')\n", "f.write('Ok, bye!\\n')\n", "f.write('empty\\n')\n", "f.write('A small text\\tafter a ')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "After creating the file and writing to it, we should close the file in order for it to be accessible:" ] }, { "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 124, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f.closed" ] }, { "cell_type": "code", "execution_count": 125, "metadata": {}, "outputs": [], "source": [ "f.close()" ] }, { "cell_type": "code", "execution_count": 126, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 126, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f.closed" ] }, { "cell_type": "code", "execution_count": 127, "metadata": {}, "outputs": [], "source": [ "#f.write('Trying to write again!\\n') #error!" 
] }, { "cell_type": "code", "execution_count": 128, "metadata": {}, "outputs": [], "source": [ "g = open('my_file.txt', mode='r', encoding='utf-8')" ] }, { "cell_type": "code", "execution_count": 129, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_io.TextIOWrapper name='my_file.txt' mode='r' encoding='utf-8'>\n" ] } ], "source": [ "print(g)" ] }, { "cell_type": "code", "execution_count": 130, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "False\n" ] } ], "source": [ "print(g.writable())" ] }, { "cell_type": "code", "execution_count": 131, "metadata": {}, "outputs": [], "source": [ "#g.write('something') #error!" ] }, { "cell_type": "code", "execution_count": 132, "metadata": {}, "outputs": [], "source": [ "g.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading the content of a file:\n", "\n", "+ #### read()\n", "+ #### readline()\n", "+ #### readlines() " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading the whole file at once: read()" ] }, { "cell_type": "code", "execution_count": 133, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Hello\n", "How are you?\n", "one more\n", "Ok, bye!\n", "empty\n", "A small text\tafter a \n" ] } ], "source": [ "g = open('my_file.txt', mode='r', encoding='utf-8')\n", "\n", "all_text = g.read()\n", "print(all_text)" ] }, { "cell_type": "code", "execution_count": 134, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "new_attempt = g.read()\n", "print(new_attempt) #the file will seem to be empty, because the read leaves the pointer at the last position" ] }, { "cell_type": "code", "execution_count": 135, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 135, "metadata": {}, "output_type": "execute_result" } ], "source": [ "g.seek(0) #moving to the first position again" ] }, { 
"cell_type": "code", "execution_count": 136, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Hello\n", "How are you?\n", "one more\n", "Ok, bye!\n", "empty\n", "A small text\tafter a \n" ] } ], "source": [ "new_attempt = g.read()\n", "print(new_attempt)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading all lines in a list: readlines()" ] }, { "cell_type": "code", "execution_count": 137, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Hello\\n', 'How are you?\\n', 'one more\\n', 'Ok, bye!\\n', 'empty\\n', 'A small text\\tafter a ']\n" ] } ], "source": [ "g.seek(0)\n", "list_of_lines = g.readlines()\n", "print(list_of_lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading one line at a time: readline()" ] }, { "cell_type": "code", "execution_count": 138, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Hello\n", "\n" ] } ], "source": [ "g.seek(0)\n", "first_line = g.readline()\n", "print(first_line)" ] }, { "cell_type": "code", "execution_count": 139, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "How are you?\n", "\n" ] } ], "source": [ "second_line = g.readline()\n", "print(second_line)" ] }, { "cell_type": "code", "execution_count": 140, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "one more\n", "\n", "Ok, bye!\n", "\n" ] } ], "source": [ "third_line = g.readline()\n", "print(third_line)\n", "fourth_line = g.readline()\n", "print(fourth_line)" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [], "source": [ "g.close()" ] }, { "cell_type": "code", "execution_count": 142, "metadata": {}, "outputs": [], "source": [ "h = open('my_file.txt', mode='a', encoding='utf-8')" ] }, { "cell_type": "code", "execution_count": 143, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "34" ] }, "execution_count": 
143, "metadata": {}, "output_type": "execute_result" } ], "source": [ "h.write('\\nAdded a line\\nand yet another one\\n')" ] }, { "cell_type": "code", "execution_count": 144, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 144, "metadata": {}, "output_type": "execute_result" } ], "source": [ "h.seek(0)" ] }, { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [], "source": [ "#h.read() #error!" ] }, { "cell_type": "code", "execution_count": 146, "metadata": {}, "outputs": [], "source": [ "h.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### The Pythonic way to deal with files:" ] }, { "cell_type": "code", "execution_count": 147, "metadata": {}, "outputs": [], "source": [ "with open('new_file.txt', mode='w', encoding='utf-8') as f:\n", " f.write('Weight\\t\\t72\\n')\n", " f.write('Height\\t\\t183\\n')\n", " f.write('Age\\t\\t44\\n')\n", " f.write('Gender\\t\\tMasculine\\n')\n", " f.write('\\n')" ] }, { "cell_type": "code", "execution_count": 148, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 148, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f.closed" ] }, { "cell_type": "code", "execution_count": 149, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Weight\\t\\t72\\n', 'Height\\t\\t183\\n', 'Age\\t\\t44\\n', 'Gender\\t\\tMasculine\\n', '\\n']\n" ] } ], "source": [ "with open('new_file.txt', mode='r', encoding='utf-8') as f:\n", " list_of_lines = f.readlines()\n", " \n", "print(list_of_lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Deleting files" ] }, { "cell_type": "code", "execution_count": 150, "metadata": {}, "outputs": [], "source": [ "os.remove(\"./new_file.txt\")\n", "os.remove(\"./my_file.txt\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## [_pickle_](https://docs.python.org/3/library/pickle.html)" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ "#### The pickle module implements binary protocols for serializing and de-serializing a Python object structure. “Pickling” is the process whereby a Python object hierarchy is converted into a byte stream, and “unpickling” is the inverse operation, whereby a byte stream (from a binary file or bytes-like object) is converted back into an object hierarchy. Pickling (and unpickling) is alternatively known as “serialization”, “marshalling”, or “flattening”." ] }, { "cell_type": "code", "execution_count": 151, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{1: '6', 2: '2', 3: 'f'}" ] }, "execution_count": 151, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example_dict = {1:\"6\",2:\"2\",3:\"f\"}\n", "example_dict" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Saving" ] }, { "cell_type": "code", "execution_count": 152, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "filehandler = open(\"./dict.pickle\",\"wb\")\n", "print(type(filehandler))\n", "pickle.dump(example_dict, filehandler)\n", "filehandler.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Doing it the Pythonic way" ] }, { "cell_type": "code", "execution_count": 153, "metadata": {}, "outputs": [], "source": [ "with open(\"./dict.pickle\",\"wb\") as f:\n", " pickle.dump(example_dict, f)" ] }, { "cell_type": "code", "execution_count": 154, "metadata": {}, "outputs": [], "source": [ "del(example_dict) #deleting variable in the environment" ] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [], "source": [ "#example_dict #error" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Retrieving" ] }, { "cell_type": "code", "execution_count": 156, "metadata": {}, "outputs": [], "source": [ "with open(\"./dict.pickle\",\"rb\") as f:\n", " example_dict = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 157, 
"metadata": {}, "outputs": [ { "data": { "text/plain": [ "{1: '6', 2: '2', 3: 'f'}" ] }, "execution_count": 157, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example_dict" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Multiple objects:" ] }, { "cell_type": "code", "execution_count": 158, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n" ] } ], "source": [ "a = 12\n", "b = ['one', 'list']\n", "c = ('one','tuple')\n", "d = {1,2,4}\n", "\n", "print(type(a))\n", "print(type(b))\n", "print(type(c))\n", "print(type(d))" ] }, { "cell_type": "code", "execution_count": 159, "metadata": {}, "outputs": [], "source": [ "with open('my_objects.pkl', 'wb') as f:\n", " pickle.dump((a,b,c,d), f)" ] }, { "cell_type": "code", "execution_count": 160, "metadata": {}, "outputs": [], "source": [ "del a\n", "del b\n", "del c\n", "del d" ] }, { "cell_type": "code", "execution_count": 161, "metadata": {}, "outputs": [], "source": [ "#print(type(a))\n", "#print(type(b))\n", "#print(type(c))\n", "#print(type(d)) #error!" 
] }, { "cell_type": "code", "execution_count": 162, "metadata": {}, "outputs": [], "source": [ "with open('my_objects.pkl', 'rb') as f:\n", " a,b,c,d = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 163, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n" ] } ], "source": [ "print(type(a))\n", "print(type(b))\n", "print(type(c))\n", "print(type(d))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### To serialise functions or classes to files, use the module [dill](https://medium.com/@emlynoregan/serialising-all-the-functions-in-python-cd880a63b591) \n", "[Docs](https://dill.readthedocs.io/en/latest/dill.html)" ] }, { "cell_type": "code", "execution_count": 164, "metadata": {}, "outputs": [], "source": [ "def summing(x,y):\n", " return x + y " ] }, { "cell_type": "code", "execution_count": 165, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "print(type(summing))" ] }, { "cell_type": "code", "execution_count": 166, "metadata": {}, "outputs": [], "source": [ "with open(\"my_function.dill\", \"wb\") as f:\n", " dill.dump(summing, f)" ] }, { "cell_type": "code", "execution_count": 167, "metadata": {}, "outputs": [], "source": [ "del summing" ] }, { "cell_type": "code", "execution_count": 168, "metadata": {}, "outputs": [], "source": [ "#summing(3,4) #error" ] }, { "cell_type": "code", "execution_count": 169, "metadata": {}, "outputs": [], "source": [ "with open(\"my_function.dill\", \"rb\") as f:\n", " summing = dill.load(f)" ] }, { "cell_type": "code", "execution_count": 170, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "7" ] }, "execution_count": 170, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summing(3,4)" ] }, { "cell_type": "code", "execution_count": 171, "metadata": {}, "outputs": [], "source": [ "class my_integer(int):\n", " def 
__init__(self, x):\n", " self.x = x\n", " \n", " def __add__(self,y):\n", " return self.x - y" ] }, { "cell_type": "code", "execution_count": 172, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "10\n" ] } ], "source": [ "x = my_integer(10)\n", "print(x)" ] }, { "cell_type": "code", "execution_count": 173, "metadata": {}, "outputs": [], "source": [ "with open('my_class.dill', 'wb') as f:\n", " dill.dump(my_integer, f)" ] }, { "cell_type": "code", "execution_count": 174, "metadata": {}, "outputs": [], "source": [ "del my_integer" ] }, { "cell_type": "code", "execution_count": 176, "metadata": {}, "outputs": [], "source": [ "#x = my_integer(10) #error" ] }, { "cell_type": "code", "execution_count": 177, "metadata": {}, "outputs": [], "source": [ "with open('my_class.dill', 'rb') as f:\n", " my_integer = dill.load(f)" ] }, { "cell_type": "code", "execution_count": 178, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "10\n" ] } ], "source": [ "x = my_integer(10)\n", "print(x)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Deleting files" ] }, { "cell_type": "code", "execution_count": 179, "metadata": {}, "outputs": [], "source": [ "os.remove(\"./dict.pickle\")\n", "os.remove(\"./my_objects.pkl\")\n", "os.remove(\"./my_function.dill\")\n", "os.remove(\"./my_class.dill\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## [_json_](https://docs.python.org/3/library/json.html)\n", "\n", "### JSON can store lists, bools, numbers, tuples and dictionaries. But to be saved into a file, all these structures must be reduced to strings. It is the string version that can be read or written to a file. Python has a JSON module that will help convert these data structures to JSON strings. 
\n", "\n", "+ [JSON](https://www.w3schools.com/whatis/whatis_json.asp) stands for JavaScript Object Notation\n", "+ JSON is a lightweight format for storing and transporting data\n", "+ JSON is often used when data is sent from a server to a web page\n", "+ JSON is \"self-describing\" and easy to understand\n", "\n", "### JSON Syntax Rules\n", "\n", "+ Data is in name/value pairs\n", "+ Data is separated by commas\n", "+ Curly braces hold objects\n", "+ Square brackets hold arrays" ] }, { "cell_type": "code", "execution_count": 180, "metadata": {}, "outputs": [], "source": [ "import json " ] }, { "cell_type": "code", "execution_count": 181, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "30\n" ] } ], "source": [ "# some JSON:\n", "x = '{ \"name\":\"John\", \"age\":30, \"city\":\"New York\"}'\n", "\n", "# parse x:\n", "y = json.loads(x)\n", "\n", "# the result is a Python dictionary:\n", "print(y[\"age\"]) " ] }, { "cell_type": "code", "execution_count": 182, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict" ] }, "execution_count": 182, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(y)" ] }, { "cell_type": "code", "execution_count": 183, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\"name\": \"John\", \"age\": 30, \"city\": \"New York\"}\n" ] } ], "source": [ "# a Python object (dict):\n", "x = {\"name\": \"John\", \"age\": 30, \"city\": \"New York\"}\n", "\n", "# convert into JSON:\n", "y = json.dumps(x)\n", "\n", "# the result is a JSON string:\n", "print(y)" ] }, { "cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "str" ] }, "execution_count": 184, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(y)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convert Python objects into JSON strings, and print the values:" ] }, { "cell_type": "code", 
"execution_count": 185, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\"name\": \"John\", \"age\": 30}\n", "[\"apple\", \"bananas\"]\n", "[\"apple\", \"bananas\"]\n", "\"hello\"\n", "42\n", "31.76\n", "true\n", "false\n", "null\n" ] } ], "source": [ "print(json.dumps({\"name\": \"John\", \"age\": 30}))\n", "print(json.dumps([\"apple\", \"bananas\"]))\n", "print(json.dumps((\"apple\", \"bananas\")))\n", "print(json.dumps(\"hello\"))\n", "print(json.dumps(42))\n", "print(json.dumps(31.76))\n", "print(json.dumps(True))\n", "print(json.dumps(False))\n", "print(json.dumps(None)) " ] }, { "cell_type": "code", "execution_count": 186, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\"name\": \"John\", \"age\": 30, \"married\": true, \"divorced\": false, \"children\": [\"Ann\", \"Billy\"], \"pets\": null, \"cars\": [{\"model\": \"BMW 230\", \"mpg\": 27.5}, {\"model\": \"Ford Edge\", \"mpg\": 24.1}]}\n" ] } ], "source": [ "x = {\n", " \"name\": \"John\",\n", " \"age\": 30,\n", " \"married\": True,\n", " \"divorced\": False,\n", " \"children\": (\"Ann\",\"Billy\"),\n", " \"pets\": None,\n", " \"cars\": [\n", " {\"model\": \"BMW 230\", \"mpg\": 27.5},\n", " {\"model\": \"Ford Edge\", \"mpg\": 24.1}\n", " ]\n", "}\n", "\n", "print(json.dumps(x))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Practical example: Webscraping and saving data:" ] }, { "cell_type": "code", "execution_count": 189, "metadata": {}, "outputs": [], "source": [ "!pip install -U -q requests beautifulsoup4" ] }, { "cell_type": "code", "execution_count": 190, "metadata": { "collapsed": false, "editable": true, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "import requests\n", "import string\n", "from bs4 import BeautifulSoup\n", "from collections import Counter" ] }, { "cell_type": "code", "execution_count": 191, "metadata": { "editable": true }, "outputs": [ { "ename": "SSLError", "evalue": 
"HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /wiki/FIFA_World_Cup (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:997)')))", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mSSLCertVerificationError\u001b[0m Traceback (most recent call last)", "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\urllib3\\connectionpool.py:703\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[1;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[0;32m 702\u001b[0m \u001b[39m# Make the request on the httplib connection object.\u001b[39;00m\n\u001b[1;32m--> 703\u001b[0m httplib_response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_request(\n\u001b[0;32m 704\u001b[0m conn,\n\u001b[0;32m 705\u001b[0m method,\n\u001b[0;32m 706\u001b[0m url,\n\u001b[0;32m 707\u001b[0m timeout\u001b[39m=\u001b[39;49mtimeout_obj,\n\u001b[0;32m 708\u001b[0m body\u001b[39m=\u001b[39;49mbody,\n\u001b[0;32m 709\u001b[0m headers\u001b[39m=\u001b[39;49mheaders,\n\u001b[0;32m 710\u001b[0m chunked\u001b[39m=\u001b[39;49mchunked,\n\u001b[0;32m 711\u001b[0m )\n\u001b[0;32m 713\u001b[0m \u001b[39m# If we're going to release the connection in ``finally:``, then\u001b[39;00m\n\u001b[0;32m 714\u001b[0m \u001b[39m# the response doesn't need to know about the connection. 
Otherwise\u001b[39;00m\n\u001b[0;32m 715\u001b[0m \u001b[39m# it will also try to release it and we'll have a double-release\u001b[39;00m\n\u001b[0;32m 716\u001b[0m \u001b[39m# mess.\u001b[39;00m\n", "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\urllib3\\connectionpool.py:386\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 385\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 386\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_validate_conn(conn)\n\u001b[0;32m 387\u001b[0m \u001b[39mexcept\u001b[39;00m (SocketTimeout, BaseSSLError) \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 388\u001b[0m \u001b[39m# Py2 raises this as a BaseSSLError, Py3 raises it as socket timeout.\u001b[39;00m\n", "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\urllib3\\connectionpool.py:1042\u001b[0m, in \u001b[0;36mHTTPSConnectionPool._validate_conn\u001b[1;34m(self, conn)\u001b[0m\n\u001b[0;32m 1041\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mgetattr\u001b[39m(conn, \u001b[39m\"\u001b[39m\u001b[39msock\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m): \u001b[39m# AppEngine might not have `.sock`\u001b[39;00m\n\u001b[1;32m-> 1042\u001b[0m conn\u001b[39m.\u001b[39;49mconnect()\n\u001b[0;32m 1044\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m conn\u001b[39m.\u001b[39mis_verified:\n", "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\urllib3\\connection.py:414\u001b[0m, in \u001b[0;36mHTTPSConnection.connect\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 412\u001b[0m 
context\u001b[39m.\u001b[39mload_default_certs()\n\u001b[1;32m--> 414\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msock \u001b[39m=\u001b[39m ssl_wrap_socket(\n\u001b[0;32m 415\u001b[0m sock\u001b[39m=\u001b[39;49mconn,\n\u001b[0;32m 416\u001b[0m keyfile\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mkey_file,\n\u001b[0;32m 417\u001b[0m certfile\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcert_file,\n\u001b[0;32m 418\u001b[0m key_password\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mkey_password,\n\u001b[0;32m 419\u001b[0m ca_certs\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mca_certs,\n\u001b[0;32m 420\u001b[0m ca_cert_dir\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mca_cert_dir,\n\u001b[0;32m 421\u001b[0m ca_cert_data\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mca_cert_data,\n\u001b[0;32m 422\u001b[0m server_hostname\u001b[39m=\u001b[39;49mserver_hostname,\n\u001b[0;32m 423\u001b[0m ssl_context\u001b[39m=\u001b[39;49mcontext,\n\u001b[0;32m 424\u001b[0m tls_in_tls\u001b[39m=\u001b[39;49mtls_in_tls,\n\u001b[0;32m 425\u001b[0m )\n\u001b[0;32m 427\u001b[0m \u001b[39m# If we're using all defaults and the connection\u001b[39;00m\n\u001b[0;32m 428\u001b[0m \u001b[39m# is TLSv1 or TLSv1.1 we throw a DeprecationWarning\u001b[39;00m\n\u001b[0;32m 429\u001b[0m \u001b[39m# for the host.\u001b[39;00m\n", "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\urllib3\\util\\ssl_.py:449\u001b[0m, in \u001b[0;36mssl_wrap_socket\u001b[1;34m(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir, key_password, ca_cert_data, tls_in_tls)\u001b[0m\n\u001b[0;32m 448\u001b[0m \u001b[39mif\u001b[39;00m send_sni:\n\u001b[1;32m--> 449\u001b[0m ssl_sock 
\u001b[39m=\u001b[39m _ssl_wrap_socket_impl(\n\u001b[0;32m 450\u001b[0m sock, context, tls_in_tls, server_hostname\u001b[39m=\u001b[39;49mserver_hostname\n\u001b[0;32m 451\u001b[0m )\n\u001b[0;32m 452\u001b[0m \u001b[39melse\u001b[39;00m:\n", "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\urllib3\\util\\ssl_.py:493\u001b[0m, in \u001b[0;36m_ssl_wrap_socket_impl\u001b[1;34m(sock, ssl_context, tls_in_tls, server_hostname)\u001b[0m\n\u001b[0;32m 492\u001b[0m \u001b[39mif\u001b[39;00m server_hostname:\n\u001b[1;32m--> 493\u001b[0m \u001b[39mreturn\u001b[39;00m ssl_context\u001b[39m.\u001b[39;49mwrap_socket(sock, server_hostname\u001b[39m=\u001b[39;49mserver_hostname)\n\u001b[0;32m 494\u001b[0m \u001b[39melse\u001b[39;00m:\n", "File \u001b[1;32mC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.2288.0_x64__qbz5n2kfra8p0\\lib\\ssl.py:513\u001b[0m, in \u001b[0;36mSSLContext.wrap_socket\u001b[1;34m(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)\u001b[0m\n\u001b[0;32m 507\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mwrap_socket\u001b[39m(\u001b[39mself\u001b[39m, sock, server_side\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m,\n\u001b[0;32m 508\u001b[0m do_handshake_on_connect\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,\n\u001b[0;32m 509\u001b[0m suppress_ragged_eofs\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,\n\u001b[0;32m 510\u001b[0m server_hostname\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, session\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m):\n\u001b[0;32m 511\u001b[0m \u001b[39m# SSLSocket class handles server_hostname encoding before it calls\u001b[39;00m\n\u001b[0;32m 512\u001b[0m \u001b[39m# ctx._wrap_socket()\u001b[39;00m\n\u001b[1;32m--> 513\u001b[0m \u001b[39mreturn\u001b[39;00m 
\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msslsocket_class\u001b[39m.\u001b[39;49m_create(\n\u001b[0;32m 514\u001b[0m sock\u001b[39m=\u001b[39;49msock,\n\u001b[0;32m 515\u001b[0m server_side\u001b[39m=\u001b[39;49mserver_side,\n\u001b[0;32m 516\u001b[0m do_handshake_on_connect\u001b[39m=\u001b[39;49mdo_handshake_on_connect,\n\u001b[0;32m 517\u001b[0m suppress_ragged_eofs\u001b[39m=\u001b[39;49msuppress_ragged_eofs,\n\u001b[0;32m 518\u001b[0m server_hostname\u001b[39m=\u001b[39;49mserver_hostname,\n\u001b[0;32m 519\u001b[0m context\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m,\n\u001b[0;32m 520\u001b[0m session\u001b[39m=\u001b[39;49msession\n\u001b[0;32m 521\u001b[0m )\n", "File \u001b[1;32mC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.2288.0_x64__qbz5n2kfra8p0\\lib\\ssl.py:1071\u001b[0m, in \u001b[0;36mSSLSocket._create\u001b[1;34m(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)\u001b[0m\n\u001b[0;32m 1070\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mdo_handshake_on_connect should not be specified for non-blocking sockets\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m-> 1071\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mdo_handshake()\n\u001b[0;32m 1072\u001b[0m \u001b[39mexcept\u001b[39;00m (\u001b[39mOSError\u001b[39;00m, \u001b[39mValueError\u001b[39;00m):\n", "File \u001b[1;32mC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.2288.0_x64__qbz5n2kfra8p0\\lib\\ssl.py:1342\u001b[0m, in \u001b[0;36mSSLSocket.do_handshake\u001b[1;34m(self, block)\u001b[0m\n\u001b[0;32m 1341\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msettimeout(\u001b[39mNone\u001b[39;00m)\n\u001b[1;32m-> 1342\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sslobj\u001b[39m.\u001b[39;49mdo_handshake()\n\u001b[0;32m 1343\u001b[0m \u001b[39mfinally\u001b[39;00m:\n", 
"\u001b[1;31mSSLCertVerificationError\u001b[0m: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:997)", "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[1;31mMaxRetryError\u001b[0m Traceback (most recent call last)", "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\requests\\adapters.py:489\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[1;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[0;32m 488\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m chunked:\n\u001b[1;32m--> 489\u001b[0m resp \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39;49murlopen(\n\u001b[0;32m 490\u001b[0m method\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mmethod,\n\u001b[0;32m 491\u001b[0m url\u001b[39m=\u001b[39;49murl,\n\u001b[0;32m 492\u001b[0m body\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mbody,\n\u001b[0;32m 493\u001b[0m headers\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mheaders,\n\u001b[0;32m 494\u001b[0m redirect\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[0;32m 495\u001b[0m assert_same_host\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[0;32m 496\u001b[0m preload_content\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[0;32m 497\u001b[0m decode_content\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[0;32m 498\u001b[0m retries\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmax_retries,\n\u001b[0;32m 499\u001b[0m timeout\u001b[39m=\u001b[39;49mtimeout,\n\u001b[0;32m 500\u001b[0m )\n\u001b[0;32m 502\u001b[0m \u001b[39m# Send the request.\u001b[39;00m\n\u001b[0;32m 503\u001b[0m \u001b[39melse\u001b[39;00m:\n", "File 
\u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\urllib3\\connectionpool.py:787\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[1;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[0;32m 785\u001b[0m e \u001b[39m=\u001b[39m ProtocolError(\u001b[39m\"\u001b[39m\u001b[39mConnection aborted.\u001b[39m\u001b[39m\"\u001b[39m, e)\n\u001b[1;32m--> 787\u001b[0m retries \u001b[39m=\u001b[39m retries\u001b[39m.\u001b[39;49mincrement(\n\u001b[0;32m 788\u001b[0m method, url, error\u001b[39m=\u001b[39;49me, _pool\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m, _stacktrace\u001b[39m=\u001b[39;49msys\u001b[39m.\u001b[39;49mexc_info()[\u001b[39m2\u001b[39;49m]\n\u001b[0;32m 789\u001b[0m )\n\u001b[0;32m 790\u001b[0m retries\u001b[39m.\u001b[39msleep()\n", "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\urllib3\\util\\retry.py:592\u001b[0m, in \u001b[0;36mRetry.increment\u001b[1;34m(self, method, url, response, error, _pool, _stacktrace)\u001b[0m\n\u001b[0;32m 591\u001b[0m \u001b[39mif\u001b[39;00m new_retry\u001b[39m.\u001b[39mis_exhausted():\n\u001b[1;32m--> 592\u001b[0m \u001b[39mraise\u001b[39;00m MaxRetryError(_pool, url, error \u001b[39mor\u001b[39;00m ResponseError(cause))\n\u001b[0;32m 594\u001b[0m log\u001b[39m.\u001b[39mdebug(\u001b[39m\"\u001b[39m\u001b[39mIncremented Retry for (url=\u001b[39m\u001b[39m'\u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m): \u001b[39m\u001b[39m%r\u001b[39;00m\u001b[39m\"\u001b[39m, url, new_retry)\n", "\u001b[1;31mMaxRetryError\u001b[0m: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /wiki/FIFA_World_Cup (Caused by SSLError(SSLCertVerificationError(1, '[SSL: 
CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:997)')))", "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[1;31mSSLError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn [191], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m page \u001b[39m=\u001b[39m requests\u001b[39m.\u001b[39;49mget(\u001b[39m'\u001b[39;49m\u001b[39mhttps://en.wikipedia.org/wiki/FIFA_World_Cup\u001b[39;49m\u001b[39m'\u001b[39;49m)\n\u001b[0;32m 2\u001b[0m soup \u001b[39m=\u001b[39m BeautifulSoup(page\u001b[39m.\u001b[39mtext, \u001b[39m\"\u001b[39m\u001b[39mlxml\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 3\u001b[0m text \u001b[39m=\u001b[39m soup\u001b[39m.\u001b[39mtext\n", "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\requests\\api.py:73\u001b[0m, in \u001b[0;36mget\u001b[1;34m(url, params, **kwargs)\u001b[0m\n\u001b[0;32m 62\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mget\u001b[39m(url, params\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[0;32m 63\u001b[0m \u001b[39mr\u001b[39m\u001b[39m\"\"\"Sends a GET request.\u001b[39;00m\n\u001b[0;32m 64\u001b[0m \n\u001b[0;32m 65\u001b[0m \u001b[39m :param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 70\u001b[0m \u001b[39m :rtype: requests.Response\u001b[39;00m\n\u001b[0;32m 71\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m---> 73\u001b[0m \u001b[39mreturn\u001b[39;00m request(\u001b[39m\"\u001b[39m\u001b[39mget\u001b[39m\u001b[39m\"\u001b[39m, url, params\u001b[39m=\u001b[39mparams, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n", "File 
\u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\requests\\api.py:59\u001b[0m, in \u001b[0;36mrequest\u001b[1;34m(method, url, **kwargs)\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[39m# By using the 'with' statement we are sure the session is closed, thus we\u001b[39;00m\n\u001b[0;32m 56\u001b[0m \u001b[39m# avoid leaving sockets open which can trigger a ResourceWarning in some\u001b[39;00m\n\u001b[0;32m 57\u001b[0m \u001b[39m# cases, and look like a memory leak in others.\u001b[39;00m\n\u001b[0;32m 58\u001b[0m \u001b[39mwith\u001b[39;00m sessions\u001b[39m.\u001b[39mSession() \u001b[39mas\u001b[39;00m session:\n\u001b[1;32m---> 59\u001b[0m \u001b[39mreturn\u001b[39;00m session\u001b[39m.\u001b[39mrequest(method\u001b[39m=\u001b[39mmethod, url\u001b[39m=\u001b[39murl, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n", "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\requests\\sessions.py:587\u001b[0m, in \u001b[0;36mSession.request\u001b[1;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[0;32m 582\u001b[0m send_kwargs \u001b[39m=\u001b[39m {\n\u001b[0;32m 583\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtimeout\u001b[39m\u001b[39m\"\u001b[39m: timeout,\n\u001b[0;32m 584\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mallow_redirects\u001b[39m\u001b[39m\"\u001b[39m: allow_redirects,\n\u001b[0;32m 585\u001b[0m }\n\u001b[0;32m 586\u001b[0m send_kwargs\u001b[39m.\u001b[39mupdate(settings)\n\u001b[1;32m--> 587\u001b[0m resp \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msend(prep, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39msend_kwargs)\n\u001b[0;32m 589\u001b[0m \u001b[39mreturn\u001b[39;00m resp\n", "File 
\u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\requests\\sessions.py:701\u001b[0m, in \u001b[0;36mSession.send\u001b[1;34m(self, request, **kwargs)\u001b[0m\n\u001b[0;32m 698\u001b[0m start \u001b[39m=\u001b[39m preferred_clock()\n\u001b[0;32m 700\u001b[0m \u001b[39m# Send the request\u001b[39;00m\n\u001b[1;32m--> 701\u001b[0m r \u001b[39m=\u001b[39m adapter\u001b[39m.\u001b[39msend(request, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m 703\u001b[0m \u001b[39m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[0;32m 704\u001b[0m elapsed \u001b[39m=\u001b[39m preferred_clock() \u001b[39m-\u001b[39m start\n", "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\requests\\adapters.py:563\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[1;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[0;32m 559\u001b[0m \u001b[39mraise\u001b[39;00m ProxyError(e, request\u001b[39m=\u001b[39mrequest)\n\u001b[0;32m 561\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(e\u001b[39m.\u001b[39mreason, _SSLError):\n\u001b[0;32m 562\u001b[0m \u001b[39m# This branch is for urllib3 v1.22 and later.\u001b[39;00m\n\u001b[1;32m--> 563\u001b[0m \u001b[39mraise\u001b[39;00m SSLError(e, request\u001b[39m=\u001b[39mrequest)\n\u001b[0;32m 565\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mConnectionError\u001b[39;00m(e, request\u001b[39m=\u001b[39mrequest)\n\u001b[0;32m 567\u001b[0m \u001b[39mexcept\u001b[39;00m ClosedPoolError \u001b[39mas\u001b[39;00m e:\n", "\u001b[1;31mSSLError\u001b[0m: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /wiki/FIFA_World_Cup (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local 
issuer certificate (_ssl.c:997)')))" ] } ], "source": [ "page = requests.get('https://en.wikipedia.org/wiki/FIFA_World_Cup')\n", "soup = BeautifulSoup(page.text, \"lxml\")\n", "text = soup.text\n", "words = text.split()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "editable": true, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "upper = [m for m in words if m.istitle()]\n", "upper_clean = [m.strip(string.punctuation) for m in upper]\n", "upper_clean = [m.strip(string.digits) for m in upper_clean]\n", "upper_clean = [m.strip(string.punctuation) for m in upper_clean]\n", "upper_clean = [m for m in upper_clean if len(m)>1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "editable": true, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "frequencies = Counter(upper_clean)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('World', 295), ('Cup', 261), ('The', 103), ('Retrieved', 91), ('Brazil', 64)]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frequencies.most_common(5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "editable": true, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "with open('Fifa_stats.txt','w') as f:\n", "    for key, value in frequencies.items():\n", "        if value > 4:\n", "            f.write(f'The word {key} appears {value} times\\n')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "editable": true, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "with open('Fifa_stats.txt','r') as f:\n", "    text = f.read()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "editable": true, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout",
"output_type": "stream", "text": [ "The word World appears 295 times\n", "The word Cup appears 261 times\n", "The word Wikipedia appears 5 times\n", "The word November appears 19 times\n", "The word July appears 28 times\n", "The word British appears 7 times\n", "The word English appears 5 times\n", "The word June appears 25 times\n", "The word From appears 7 times\n", "The word Association appears 38 times\n", "The word This appears 11 times\n", "The word For appears 6 times\n", "The word France appears 38 times\n", "The word Brazil appears 64 times\n", "The word The appears 103 times\n", "The word Fédération appears 31 times\n", "The word Internationale appears 31 times\n", "The word Football appears 58 times\n", "The word War appears 7 times\n", "The word Russia appears 14 times\n", "The word In appears 22 times\n", "The word Germany appears 63 times\n", "The word Italy appears 36 times\n", "The word Argentina appears 29 times\n", "The word Uruguay appears 34 times\n", "The word England appears 23 times\n", "The word Spain appears 21 times\n", "The word Olympic appears 15 times\n", "The word Games appears 20 times\n", "The word Mexico appears 26 times\n", "The word Switzerland appears 9 times\n" ] } ], "source": [ "print(text[0:1028])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.10.8 64-bit (microsoft store)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.8" }, "vscode": { "interpreter": { "hash": "a3a5d4f9651fa550758ade4a473ec6b18daeb3d63067db8abd62b64f4493ea9c" } } }, "nbformat": 4, "nbformat_minor": 4 }