{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction to Python \n", "\n", "## Data Persistence with Python" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "+ #### _file_\n", "+ #### _pickle_\n", "+ #### _dill_\n", "+ #### _json_" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pickle\n", "import dill " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## [_file/open_](https://docs.python.org/3/tutorial/inputoutput.html#reading-and-writing-files)\n", "\n", "### open() returns a file object, and is most commonly used with two arguments: open(filename, mode). \n", "\n", "| Mode | Description |\n", "| --- | --- |\n", "| `'r'` | Open a file for reading. (default) |\n", "| `'w'` | Open a file for writing. Creates a new file if it does not exist or truncates the file if it exists. |\n", "| `'x'` | Open a file for exclusive creation. If the file already exists, the operation fails. |\n", "| `'a'` | Open for appending at the end of the file without truncating it. Creates a new file if it does not exist. |\n", "| `'t'` | Open in text mode. (default) |\n", "| `'b'` | Open in binary mode. |\n", "| `'+'` | Open a file for updating (reading and writing) |
" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "text = 'My string'\n", "f = open('my_file.txt', mode='w', encoding='utf-8')" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_io.TextIOWrapper name='my_file.txt' mode='w' encoding='utf-8'>\n" ] } ], "source": [ "print(f)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['_CHUNK_SIZE',\n", " '__class__',\n", " '__del__',\n", " '__delattr__',\n", " '__dict__',\n", " '__dir__',\n", " '__doc__',\n", " '__enter__',\n", " '__eq__',\n", " '__exit__',\n", " '__format__',\n", " '__ge__',\n", " '__getattribute__',\n", " '__gt__',\n", " '__hash__',\n", " '__init__',\n", " '__init_subclass__',\n", " '__iter__',\n", " '__le__',\n", " '__lt__',\n", " '__ne__',\n", " '__new__',\n", " '__next__',\n", " '__reduce__',\n", " '__reduce_ex__',\n", " '__repr__',\n", " '__setattr__',\n", " '__sizeof__',\n", " '__str__',\n", " '__subclasshook__',\n", " '_checkClosed',\n", " '_checkReadable',\n", " '_checkSeekable',\n", " '_checkWritable',\n", " '_finalizing',\n", " 'buffer',\n", " 'close',\n", " 'closed',\n", " 'detach',\n", " 'encoding',\n", " 'errors',\n", " 'fileno',\n", " 'flush',\n", " 'isatty',\n", " 'line_buffering',\n", " 'mode',\n", " 'name',\n", " 'newlines',\n", " 'read',\n", " 'readable',\n", " 'readline',\n", " 'readlines',\n", " 'reconfigure',\n", " 'seek',\n", " 'seekable',\n", " 'tell',\n", " 'truncate',\n", " 'writable',\n", " 'write',\n", " 'write_through',\n", " 'writelines']" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dir(f)" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "True\n", "False\n" ] } ], "source": [ "print(f.writable())\n", "print(f.closed)" ] }, { "cell_type": "code", 
"execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f.write('Hello\\n')" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "26" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f.write('How are you?\\n')\n", "f.write('one more\\n')\n", "f.write('Ok, bye!\\n')\n", "f.write('empty\\n')\n", "f.write('A small text\\tafter a ')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "After creating the file and wrtiting in it, we should close the file in order for it be accessible:" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f.closed" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "f.close()" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f.closed" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "I/O operation on closed file.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Trying to write again!\\n'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m#error!\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mValueError\u001b[0m: I/O 
operation on closed file." ] } ], "source": [ "f.write('Trying to write again!\\n') #error!" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "g = open('my_file.txt', mode='r', encoding='utf-8')" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_io.TextIOWrapper name='my_file.txt' mode='r' encoding='utf-8'>\n" ] } ], "source": [ "print(g)" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "False\n" ] } ], "source": [ "print(g.writable())" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "ename": "UnsupportedOperation", "evalue": "not writable", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mUnsupportedOperation\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'something'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m#error!\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mUnsupportedOperation\u001b[0m: not writable" ] } ], "source": [ "g.write('something') #error!" 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading the content of a file:\n", "\n", "+ #### read()\n", "+ #### readline()\n", "+ #### readlines() " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading the whole file at once: read()" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Hello\n", "How are you?\n", "one more\n", "Ok, bye!\n", "empty\n", "A small text\tafter a \n" ] } ], "source": [ "all_text = g.read()\n", "print(all_text)" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "new_attempt = g.read()\n", "print(new_attempt) #the file will seem to be empty, because the previous read left the pointer at the end of the file" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "g.seek(0) #moving to the first position again" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Hello\n", "How are you?\n", "one more\n", "Ok, bye!\n", "empty\n", "A small text\tafter a \n" ] } ], "source": [ "new_attempt = g.read()\n", "print(new_attempt)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading all lines into a list: readlines()" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Hello\\n', 'How are you?\\n', 'one more\\n', 'Ok, bye!\\n', 'empty\\n', 'A small text\\tafter a ']\n" ] } ], "source": [ "g.seek(0)\n", "list_of_lines = g.readlines()\n", "print(list_of_lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading one line at a time: readline()" ] }, { "cell_type": "code", "execution_count": 
92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Hello\n", "\n" ] } ], "source": [ "g.seek(0)\n", "first_line = g.readline()\n", "print(first_line)" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "How are you?\n", "\n" ] } ], "source": [ "second_line = g.readline()\n", "print(second_line)" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "one more\n", "\n", "Ok, bye!\n", "\n" ] } ], "source": [ "third_line = g.readline()\n", "print(third_line)\n", "fourth_line = g.readline()\n", "print(fourth_line)" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "g.close()" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [], "source": [ "h = open('my_file.txt', mode='a', encoding='utf-8')" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "34" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "h.write('\\nAdded a line\\nand yet another one\\n')" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "h.seek(0)" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "ename": "UnsupportedOperation", "evalue": "not readable", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mUnsupportedOperation\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m 
\u001b[0mh\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m#error!\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mUnsupportedOperation\u001b[0m: not readable" ] } ], "source": [ "h.read() #error!" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "h.close() #close the append handle so the added lines are flushed and the file can be removed later" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### The pythonic way to deal with files:" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [], "source": [ "with open('new_file.txt', mode='w', encoding='utf-8') as f:\n", " f.write('Weight\\t\\t72\\n')\n", " f.write('Height\\t\\t183\\n')\n", " f.write('Age\\t\\t44\\n')\n", " f.write('Gender\\t\\tMasculine\\n')\n", " f.write('\\n')" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f.closed" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Weight\\t\\t72\\n', 'Height\\t\\t183\\n', 'Age\\t\\t44\\n', 'Gender\\t\\tMasculine\\n', '\\n']\n" ] } ], "source": [ "with open('new_file.txt', mode='r', encoding='utf-8') as f:\n", " list_of_lines = f.readlines()\n", " \n", "print(list_of_lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Deleting files" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [], "source": [ "os.remove(\"./new_file.txt\")\n", "os.remove(\"./my_file.txt\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## [_pickle_](https://docs.python.org/3/library/pickle.html)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### The pickle module implements binary protocols for serializing and de-serializing a Python object structure. 
“Pickling” is the process whereby a Python object hierarchy is converted into a byte stream, and “unpickling” is the inverse operation, whereby a byte stream (from a binary file or bytes-like object) is converted back into an object hierarchy. Pickling (and unpickling) is alternatively known as “serialization”, “marshalling”, or “flattening”." ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{1: '6', 2: '2', 3: 'f'}" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example_dict = {1:\"6\",2:\"2\",3:\"f\"}\n", "example_dict" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Saving" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class '_io.BufferedWriter'>\n" ] } ], "source": [ "filehandler = open(\"./dict.pickle\",\"wb\")\n", "print(type(filehandler))\n", "pickle.dump(example_dict, filehandler)\n", "filehandler.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Doing it the Pythonic way" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [], "source": [ "with open(\"./dict.pickle\",\"wb\") as f:\n", " pickle.dump(example_dict, f)" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [], "source": [ "del(example_dict) #deleting variable in the environment" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'example_dict' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mexample_dict\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mNameError\u001b[0m: name 
'example_dict' is not defined" ] } ], "source": [ "example_dict" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Retrieving" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [], "source": [ "with open(\"./dict.pickle\",\"rb\") as f:\n", " example_dict = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{1: '6', 2: '2', 3: 'f'}" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example_dict" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Multiple objects:" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'int'>\n", "<class 'list'>\n", "<class 'tuple'>\n", "<class 'set'>\n" ] } ], "source": [ "a = 12\n", "b = ['one', 'list']\n", "c = ('one','tuple')\n", "d = {1,2,4}\n", "\n", "print(type(a))\n", "print(type(b))\n", "print(type(c))\n", "print(type(d))" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [], "source": [ "with open('my_objects.pkl', 'wb') as f:\n", " pickle.dump((a,b,c,d), f)" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [], "source": [ "del a\n", "del b\n", "del c\n", "del d" ] }, { "cell_type": "code", "execution_count": 116, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'a' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m 
\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0md\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m#error!\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'a' is not defined" ] } ], "source": [ "print(type(a))\n", "print(type(b))\n", "print(type(c))\n", "print(type(d)) #error!" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [], "source": [ "with open('my_objects.pkl', 'rb') as f:\n", " a,b,c,d = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'int'>\n", "<class 'list'>\n", "<class 'tuple'>\n", "<class 'set'>\n" ] } ], "source": [ "print(type(a))\n", "print(type(b))\n", "print(type(c))\n", "print(type(d))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### To serialise functions or classes to files, use the module [dill](https://medium.com/@emlynoregan/serialising-all-the-functions-in-python-cd880a63b591) \n", "[Docs](https://dill.readthedocs.io/en/latest/dill.html)" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [], "source": [ "def summing(x,y):\n", " return x + y " ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'function'>\n" ] } ], "source": [ "print(type(summing))" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, 
"outputs": [], "source": [ "with open(\"my_function.dill\", \"wb\") as f:\n", " dill.dump(summing, f)" ] }, { "cell_type": "code", "execution_count": 122, "metadata": {}, "outputs": [], "source": [ "del summing" ] }, { "cell_type": "code", "execution_count": 123, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'summing' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msumming\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mNameError\u001b[0m: name 'summing' is not defined" ] } ], "source": [ "summing(3,4)" ] }, { "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [], "source": [ "with open(\"my_function.dill\", \"rb\") as f:\n", " summing = dill.load(f)" ] }, { "cell_type": "code", "execution_count": 125, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "7" ] }, "execution_count": 125, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summing(3,4)" ] }, { "cell_type": "code", "execution_count": 132, "metadata": {}, "outputs": [], "source": [ "class my_integer(int):\n", " def __init__(self, x):\n", " self.x = x\n", " \n", " def __add__(self,y):\n", " return self.x - y" ] }, { "cell_type": "code", "execution_count": 140, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "10\n" ] } ], "source": [ "x = my_integer(10)\n", "print(x)" ] }, { "cell_type": "code", "execution_count": 134, "metadata": {}, "outputs": [], "source": [ "with open('my_class.dill', 'wb') as f:\n", " dill.dump(my_integer, f)" ] }, { "cell_type": "code", "execution_count": 135, 
"metadata": {}, "outputs": [], "source": [ "del my_integer" ] }, { "cell_type": "code", "execution_count": 136, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'my_integer' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmy_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mNameError\u001b[0m: name 'my_integer' is not defined" ] } ], "source": [ "x = my_integer(10)" ] }, { "cell_type": "code", "execution_count": 137, "metadata": {}, "outputs": [], "source": [ "with open('my_class.dill', 'rb') as f:\n", " my_integer = dill.load(f)" ] }, { "cell_type": "code", "execution_count": 139, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "10\n" ] } ], "source": [ "x = my_integer(10)\n", "print(x)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Deleting files" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [], "source": [ "os.remove(\"./dict.pickle\")\n", "os.remove(\"./my_objects.pkl\")\n", "os.remove(\"./my_function.dill\")\n", "os.remove(\"./my_class.dill\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## [_json_](https://docs.python.org/3/library/json.html)\n", "\n", "### JSON can store Lists, bools, numbers, tuples and dictionaries. But to be saved into a file, all these structures must be reduced to strings. It is the string version that can be read or written to a file. Python has a JSON module that will help converting the datastructures to JSON strings. 
\n", "\n", "+ [JSON](https://www.w3schools.com/whatis/whatis_json.asp) stands for JavaScript Object Notation\n", "+ JSON is a lightweight format for storing and transporting data\n", "+ JSON is often used when data is sent from a server to a web page\n", "+ JSON is \"self-describing\" and easy to understand\n", "\n", "### JSON Syntax Rules\n", "\n", "+ Data is in name/value pairs\n", "+ Data is separated by commas\n", "+ Curly braces hold objects\n", "+ Square brackets hold arrays" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "import json " ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "30\n" ] } ], "source": [ "# some JSON:\n", "x = '{ \"name\":\"John\", \"age\":30, \"city\":\"New York\"}'\n", "\n", "# parse x:\n", "y = json.loads(x)\n", "\n", "# the result is a Python dictionary:\n", "print(y[\"age\"]) " ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(y)" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\"name\": \"John\", \"age\": 30, \"city\": \"New York\"}\n" ] } ], "source": [ "# a Python object (dict):\n", "x = {\"name\": \"John\", \"age\": 30, \"city\": \"New York\"}\n", "\n", "# convert into JSON:\n", "y = json.dumps(x)\n", "\n", "# the result is a JSON string:\n", "print(y)" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "str" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(y)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convert Python objects into JSON strings, and print the values:" ] }, { "cell_type": "code", 
"execution_count": 64, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\"name\": \"John\", \"age\": 30}\n", "[\"apple\", \"bananas\"]\n", "[\"apple\", \"bananas\"]\n", "\"hello\"\n", "42\n", "31.76\n", "true\n", "false\n", "null\n" ] } ], "source": [ "print(json.dumps({\"name\": \"John\", \"age\": 30}))\n", "print(json.dumps([\"apple\", \"bananas\"]))\n", "print(json.dumps((\"apple\", \"bananas\")))\n", "print(json.dumps(\"hello\"))\n", "print(json.dumps(42))\n", "print(json.dumps(31.76))\n", "print(json.dumps(True))\n", "print(json.dumps(False))\n", "print(json.dumps(None)) " ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\"name\": \"John\", \"age\": 30, \"married\": true, \"divorced\": false, \"children\": [\"Ann\", \"Billy\"], \"pets\": null, \"cars\": [{\"model\": \"BMW 230\", \"mpg\": 27.5}, {\"model\": \"Ford Edge\", \"mpg\": 24.1}]}\n" ] } ], "source": [ "x = {\n", " \"name\": \"John\",\n", " \"age\": 30,\n", " \"married\": True,\n", " \"divorced\": False,\n", " \"children\": (\"Ann\",\"Billy\"),\n", " \"pets\": None,\n", " \"cars\": [\n", " {\"model\": \"BMW 230\", \"mpg\": 27.5},\n", " {\"model\": \"Ford Edge\", \"mpg\": 24.1}\n", " ]\n", "}\n", "\n", "print(json.dumps(x))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Practical example: Webscraping and saving data:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false, "editable": true, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "import requests\n", "import string\n", "from bs4 import BeautifulSoup\n", "from collections import Counter" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "editable": true }, "outputs": [], "source": [ "page = requests.get('https://en.wikipedia.org/wiki/FIFA_World_Cup')\n", "soup = BeautifulSoup(page.text, \"lxml\")\n", "text = soup.text\n", 
"words = text.split()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false, "editable": true, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "upper = [m for m in words if m.istitle()]\n", "upper_clean = [m.strip(string.punctuation) for m in upper]\n", "upper_clean = [m.strip(string.digits) for m in upper_clean]\n", "upper_clean = [m.strip(string.punctuation) for m in upper_clean]\n", "upper_clean = [m for m in upper_clean if len(m)>1]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false, "editable": true, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "frequencies = Counter(upper_clean)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('World', 295), ('Cup', 261), ('The', 103), ('Retrieved', 91), ('Brazil', 64)]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frequencies.most_common(5)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false, "editable": true, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "with open('Fifa_stats.txt','w') as f:\n", " for key, value in frequencies.items():\n", " if value > 4:\n", " f.write(f'The word {key} appears {value} times\\n')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false, "editable": true, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "with open('Fifa_stats.txt','r') as f:\n", " text = f.read()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false, "editable": true, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The word World appears 295 times\n", "The word Cup appears 261 times\n", "The word Wikipedia appears 5 times\n", "The word November appears 19 times\n", "The word July appears 28 times\n", "The word British 
appears 7 times\n", "The word English appears 5 times\n", "The word June appears 25 times\n", "The word From appears 7 times\n", "The word Association appears 38 times\n", "The word This appears 11 times\n", "The word For appears 6 times\n", "The word France appears 38 times\n", "The word Brazil appears 64 times\n", "The word The appears 103 times\n", "The word Fédération appears 31 times\n", "The word Internationale appears 31 times\n", "The word Football appears 58 times\n", "The word War appears 7 times\n", "The word Russia appears 14 times\n", "The word In appears 22 times\n", "The word Germany appears 63 times\n", "The word Italy appears 36 times\n", "The word Argentina appears 29 times\n", "The word Uruguay appears 34 times\n", "The word England appears 23 times\n", "The word Spain appears 21 times\n", "The word Olympic appears 15 times\n", "The word Games appears 20 times\n", "The word Mexico appears 26 times\n", "The word Switzerland appears 9 times\n" ] } ], "source": [ "print(text[0:1028])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }