0% found this document useful (0 votes)
14 views11 pages

SRC 7

The document contains Python code for analyzing TV show data from an IMDb dataset. The code extracts non-adult TV shows from 1970 onwards from a TSV file, cleans the data, and stores it in a CSV or SQLite database. Over the course of 4 files, the code is improved to handle missing data fields, add primary keys, and normalize the data across two tables with a one-to-many relationship.

Uploaded by

Godo Quaran
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
14 views11 pages

SRC 7

The document contains Python code for analyzing TV show data from an IMDb dataset. The code extracts non-adult TV shows from 1970 onwards from a TSV file, cleans the data, and stores it in a CSV or SQLite database. Over the course of 4 files, the code is improved to handle missing data fields, add primary keys, and normalize the data across two tables with a one-to-many relationship.

Uploaded by

Godo Quaran
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 11

favorites0.py

import csv

# Print the "title" column of every response in the survey export.
with open("CS50 2019 - Lecture 7 - Favorite TV Shows (Responses) - Form Responses 1.csv", "r") as file:

    # DictReader keys each row by the column names in the header line
    for response in csv.DictReader(file):
        print(response["title"])
favorites1.py

import csv

# Maps each lowercased title to the number of times it was submitted
counts = {}

# Tally every response in the survey export
with open("CS50 2019 - Lecture 7 - Favorite TV Shows (Responses) - Form Responses 1.csv", "r") as file:
    for response in csv.DictReader(file):

        # Lowercase so differently capitalized entries share one tally
        name = response["title"].lower()

        # First sighting starts from zero
        counts[name] = counts.get(name, 0) + 1

# Report each title with its tally, in first-seen order
for name in counts:
    print(name, counts[name], sep=" | ")
favorites2.py

import csv

# Maps each lowercased title to the number of times it was submitted
counts = {}

# Tally every response in the survey export
with open("CS50 2019 - Lecture 7 - Favorite TV Shows (Responses) - Form Responses 1.csv", "r") as file:
    for response in csv.DictReader(file):

        # Lowercase so differently capitalized entries share one tally
        name = response["title"].lower()

        # First sighting starts from zero
        counts[name] = counts.get(name, 0) + 1

# Report each title with its tally, alphabetically by title
for name in sorted(counts):
    print(name, counts[name], sep=" | ")
favorites3.py

import csv

# Maps each lowercased title to the number of times it was submitted
counts = {}

# Tally every response in the survey export
with open("CS50 2019 - Lecture 7 - Favorite TV Shows (Responses) - Form Responses 1.csv", "r") as file:
    for response in csv.DictReader(file):

        # Lowercase so differently capitalized entries share one tally
        name = response["title"].lower()

        # First sighting starts from zero
        counts[name] = counts.get(name, 0) + 1


# Sort key: given a (title, count) pair, rank by the count
def f(item):
    _, tally = item
    return tally


# Report titles from most to least popular (ties keep first-seen order)
ranked = sorted(counts.items(), key=f, reverse=True)
for name, tally in ranked:
    print(name, tally, sep=" | ")
favorites4.py

import csv

# Maps each lowercased title to the number of times it was submitted
counts = {}

# Tally every response in the survey export
with open("CS50 2019 - Lecture 7 - Favorite TV Shows (Responses) - Form Responses 1.csv", "r") as file:
    for response in csv.DictReader(file):

        # Lowercase so differently capitalized entries share one tally
        name = response["title"].lower()

        # First sighting starts from zero
        counts[name] = counts.get(name, 0) + 1

# Report titles from most to least popular; counts.get looks up each
# title's tally, and the stable sort keeps ties in first-seen order
for name in sorted(counts, key=counts.get, reverse=True):
    print(name, counts[name], sep=" | ")
import0.py

import csv

# Copy every non-adult TV series from IMDb's title.basics.tsv into shows0.csv.
# Source data: https://datasets.imdbws.com/title.basics.tsv.gz (decompress first)

# newline="" on the output file lets the csv module control line endings
# itself; without it, platforms that translate "\n" emit "\r\r\n" rows.
with open("title.basics.tsv", "r") as titles, open("shows0.csv", "w", newline="") as shows:

    # Rows of the TSV keyed by its header names
    reader = csv.DictReader(titles, delimiter="\t")

    # Plain CSV writer for the trimmed-down output
    writer = csv.writer(shows)

    # Write header
    writer.writerow(["tconst", "primaryTitle", "startYear", "genres"])

    # Iterate over TSV file
    for row in reader:

        # Keep only non-adult TV shows
        if row["titleType"] == "tvSeries" and row["isAdult"] == "0":
            writer.writerow([row["tconst"], row["primaryTitle"], row["startYear"], row["genres"]])
import1.py

import csv

# Copy non-adult TV series that started in 1970 or later from IMDb's
# title.basics.tsv into shows1.csv.
# Source data: https://datasets.imdbws.com/title.basics.tsv.gz (decompress first)

# newline="" on the output file lets the csv module control line endings
# itself; without it, platforms that translate "\n" emit "\r\r\n" rows.
with open("title.basics.tsv", "r") as titles, open("shows1.csv", "w", newline="") as shows:

    # Rows of the TSV keyed by its header names
    # (the listing passed the undefined name `tiles` here, a NameError)
    reader = csv.DictReader(titles, delimiter="\t")

    # Plain CSV writer for the trimmed-down output
    writer = csv.writer(shows)

    # Write header
    writer.writerow(["tconst", "primaryTitle", "startYear", "genres"])

    # Iterate over TSV file
    for row in reader:

        # Skip anything that is not a non-adult TV show
        if row["titleType"] != "tvSeries" or row["isAdult"] != "0":
            continue

        # IMDb marks a missing value with the literal two characters \N;
        # this must be checked before int() or the conversion would raise
        if row["startYear"] == "\\N":
            continue

        # Keep shows since 1970
        if int(row["startYear"]) >= 1970:
            writer.writerow([row["tconst"], row["primaryTitle"], row["startYear"], row["genres"]])
import2.py

import csv

# Copy non-adult TV series that started in 1970 or later from IMDb's
# title.basics.tsv into shows2.csv, blanking out missing genres.
# Source data: https://datasets.imdbws.com/title.basics.tsv.gz (decompress first)

# newline="" on the output file lets the csv module control line endings
# itself; without it, platforms that translate "\n" emit "\r\r\n" rows.
with open("title.basics.tsv", "r") as titles, open("shows2.csv", "w", newline="") as shows:

    # Rows of the TSV keyed by its header names
    reader = csv.DictReader(titles, delimiter="\t")

    # Plain CSV writer for the trimmed-down output
    writer = csv.writer(shows)

    # Write header
    writer.writerow(["tconst", "primaryTitle", "startYear", "genres"])

    # Iterate over TSV file
    for row in reader:

        # Skip anything that is not a non-adult TV show
        if row["titleType"] != "tvSeries" or row["isAdult"] != "0":
            continue

        # IMDb marks a missing value with the literal two characters \N;
        # this must be checked before int() or the conversion would raise
        if row["startYear"] == "\\N":
            continue

        # Skip shows from before 1970
        if int(row["startYear"]) < 1970:
            continue

        # Replace the \N placeholder with None, which csv.writer emits as
        # an empty field (done only for rows that are actually written)
        genres = row["genres"] if row["genres"] != "\\N" else None

        # Write row
        writer.writerow([row["tconst"], row["primaryTitle"], row["startYear"], genres])
import3.py

import cs50
import csv

# Load non-adult TV series from 1970 onwards into a single-table SQLite
# database, shows3.db.

# Opening in "w" mode creates the file, or empties an existing one
open("shows3.db", "w").close()
db = cs50.SQL("sqlite:///shows3.db")

# One flat table mirroring the columns kept from the TSV
db.execute("CREATE TABLE shows (tconst TEXT, primaryTitle TEXT, startYear NUMERIC, genres TEXT)")

# Source data: https://datasets.imdbws.com/title.basics.tsv.gz (decompress first)
with open("title.basics.tsv", "r") as titles:

    # Walk the TSV, each row keyed by the header names
    for row in csv.DictReader(titles, delimiter="\t"):

        # Skip anything that is not a non-adult TV show
        if row["titleType"] != "tvSeries" or row["isAdult"] != "0":
            continue

        # The TSV marks a missing year with the literal two characters \N
        if row["startYear"] == "\\N":
            continue

        # Skip shows from before 1970
        startYear = int(row["startYear"])
        if startYear < 1970:
            continue

        # Store a missing genre list as NULL instead of the \N placeholder
        if row["genres"] == "\\N":
            genres = None
        else:
            genres = row["genres"]

        # Insert show
        db.execute("INSERT INTO shows (tconst, primaryTitle, startYear, genres) VALUES(?, ?, ?, ?)",
                   row["tconst"], row["primaryTitle"], startYear, genres)
import4.py

import cs50
import csv

# Load non-adult TV series from 1970 onwards into shows4.db, normalized
# into two tables: one row per show, and one row per (show, genre) pair.

# Opening in "w" mode creates the file, or empties an existing one
open("shows4.db", "w").close()
db = cs50.SQL("sqlite:///shows4.db")

# Create tables; genres.show_id refers back to shows.id (one-to-many)
db.execute("CREATE TABLE shows (id INT, title TEXT, year NUMERIC, PRIMARY KEY(id))")
db.execute("CREATE TABLE genres (show_id INT, genre TEXT, FOREIGN KEY(show_id) REFERENCES shows(id))")

# Source data: https://datasets.imdbws.com/title.basics.tsv.gz (decompress first)
with open("title.basics.tsv", "r") as titles:

    # Rows of the TSV keyed by its header names
    reader = csv.DictReader(titles, delimiter="\t")

    # Iterate over TSV file
    for row in reader:

        # Skip anything that is not a non-adult TV show
        if row["titleType"] != "tvSeries" or row["isAdult"] != "0":
            continue

        # The TSV marks a missing year with the literal two characters \N
        if row["startYear"] == "\\N":
            continue

        # Skip shows from before 1970
        startYear = int(row["startYear"])
        if startYear < 1970:
            continue

        # Drop the first two characters of tconst and use the remaining
        # digits as the integer key (named show_id so the builtin id()
        # is not shadowed)
        show_id = int(row["tconst"][2:])

        # Insert show
        db.execute("INSERT INTO shows (id, title, year) VALUES(?, ?, ?)",
                   show_id, row["primaryTitle"], startYear)

        # Insert one genres row per comma-separated genre, if any
        if row["genres"] != "\\N":
            for genre in row["genres"].split(","):
                db.execute("INSERT INTO genres (show_id, genre) VALUES(?, ?)", show_id, genre)
search.py

import csv

# Look up a show in shows2.csv by title, ignoring case, and print every match.

# Prompt user for title
title = input("Title: ")

# Lowercase once, outside the loop, for the case-insensitive comparison
wanted = title.lower()

# Open CSV file (the handle is not named `input`, which would replace the
# builtin used for the prompt above)
with open("shows2.csv", "r") as shows:

    # Iterate over CSV file, each row keyed by the header names
    for row in csv.DictReader(shows):

        # Print every show whose title matches
        if wanted == row["primaryTitle"].lower():
            print(row["primaryTitle"], row["startYear"], row["genres"], sep=" | ")

You might also like