0% found this document useful (0 votes)
23 views37 pages

Ds 1

Data science questions

Uploaded by

veee.2631
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
23 views37 pages

Ds 1

Data science questions

Uploaded by

veee.2631
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 37

11/20/24, 11:44 PM Untitled45.

ipynb - Colab

1) Construct a revealing visualization of some aspect of


your favorite data set, using:
(a) A well-designed table. (b) A dot and/or line plot. (c) A scatter plot. (d) A heatmap. (e) A bar plot
or pie chart. (f) A histogram. (g) A data map.

# Import necessary libraries


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from google.colab import files


uploaded = files.upload()
data = pd.read_csv("search_engine_data.csv")

Choose Files No file chosen Upload widget is only available when the cell has been executed
in the current browser session. Please rerun this cell to enable.
Saving search engine data csv to search engine data (1) csv

data = pd.read_csv("search_engine_data.csv")

# Well-designed table: preview the first ten rows of three columns.
table_data = data.head(10)[['Date', 'Google', 'bing']]
print(table_data)

# In Jupyter/Colab the Styler renders as HTML; the blue gradient shades each
# numeric cell by magnitude. The bare expression on the last line is what
# makes the notebook display it.
styled_table = table_data.style.background_gradient(cmap='Blues')
styled_table

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 1/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

Date Google bing


0 2009-01 90.25 0.00
1 2009-02 89.69 0.00
2 2009-03 89.14 0.00
3 2009-04 89.91 0.00
4 2009-05 89.86 0.00
5 2009-06 89.85 3.24
6 2009-07 89.28 3.57
7 2009-08 89.62 3.55
8 2009-09 90.60 3.22
9 2009-10 90.86 3.28
Date Google bing

0 2009-01 90.250000 0.000000

1 2009-02 89.690000 0.000000

2 2009-03 89.140000 0.000000

3 2009-04 89.910000 0.000000

4 2009-05 89.860000 0.000000

5 2009-06 89.850000 3.240000

6 2009-07 89.280000 3.570000

7 2009-08 89.620000 3.550000

8 2009-09 90.600000 3.220000

9 2009-10 90.860000 3.280000

# Display a summary of your dataset as a table


# Line plot of the 'Google' column against 'Date'.
# The series is given an explicit label so that plt.legend() has an artist
# to list; without one matplotlib emits "No artists with labels found to put
# in legend" and draws an empty legend.
plt.figure(figsize=(10, 6))
sns.lineplot(data=data, x='Date', y='Google', label='Google')
plt.title("Score Trend Over Years")
plt.xlabel("Year")
plt.ylabel("Score")
plt.legend(title="Category")
plt.grid(True)
plt.show()

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 2/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that art

# Scatter plot of the 'Google' column against 'Date'.
# The points are labelled so that plt.legend() has an entry to show; the
# unlabelled version triggers matplotlib's "No artists with labels found"
# warning.
plt.figure(figsize=(8, 6))
sns.scatterplot(data=data, x='Date', y='Google', sizes=(20, 200), label='Google')
plt.title("Scatter Plot of ColumnX vs ColumnY")
plt.xlabel("ColumnX")
plt.ylabel("ColumnY")
plt.legend(title="Category")
plt.show()

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 3/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that art

# Correlation heatmap over the numeric columns of `data`.
#
# include='number' selects every numeric dtype, so a derived integer column
# is not lost just because its width is not exactly int64/float64 (for
# example the int32 that `.dt.year` can return).
numerical_data = data.select_dtypes(include='number')

# Add the calendar year parsed from 'Date' as an extra numeric feature.
# It is attached to the derived frame only, so `data` keeps its original
# columns for the cells that follow.
if 'Date' in data.columns:
    numerical_data = numerical_data.assign(
        Year=pd.to_datetime(data['Date']).dt.year
    )

# Pairwise correlation between the numeric columns.
correlation_matrix = numerical_data.corr()

# Annotated heatmap; fmt=".2f" prints each coefficient with two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix Heatmap")
plt.show()

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 4/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

# Bar plot of the 'bing' column, one bar per 'Date' value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=data, x='Date', y='bing', ax=ax)
ax.set_title("Bar Plot of Categories")
ax.set_xlabel("Category")
ax.set_ylabel("Value")
plt.show()

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 5/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

# Pie chart for category distribution
#
# Counts how often each distinct value occurs in one column and draws the
# counts as pie slices labelled with the values themselves.
#
# NOTE(review): the column name below is an empty string, which looks like an
# unfilled placeholder. Unless the CSV really has a column named '', this
# lookup raises KeyError. Replace '' with the intended categorical column.


category_counts = data[''].value_counts()
plt.figure(figsize=(8, 8))
# autopct prints each slice's percentage with one decimal place.
plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%', startangle=140)
plt.title("Category Distribution")
plt.show()

# Histogram of the 'Google' column: 20 bins with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data['Google'], bins=20, kde=True, ax=ax)
ax.set_title("Histogram of NumericalColumn")
ax.set_xlabel("Value")
ax.set_ylabel("Frequency")
plt.show()

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 6/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

import matplotlib.pyplot as plt

# Total each column after the first (presumably the first is 'Date'; verify
# against the CSV) and order the totals from largest to smallest so the
# biggest slices come first.
global_market_share = data.iloc[:, 1:].sum().sort_values(ascending=False)

# Pie chart of the column totals; the legend is suppressed because every
# slice already carries its own label.
fig, ax = plt.subplots(figsize=(10, 8))
global_market_share.plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    startangle=140,
    cmap='tab20',
    legend=False,
)
ax.set_title('Global Search Engine Market Share')
ax.set_ylabel('')  # Hide the y-axis label
plt.show()

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 7/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

2) Construct scatter plots for sets of 10, 100, 1000, and


10,000 points. Experiment with the point size to find the
most revealing value for each data set.

import matplotlib.pyplot as plt


import seaborn as sns

# Replace these with numerical columns from your dataset


x_col = 'Google' # x-axis: replace with your desired column
y_col = 'bing' # y-axis: replace with your desired column

# Define subsets
sizes = [10, 100, 1000, 10000]
point_sizes = [50, 20, 10, 5] # Experiment with these for better visualization

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 8/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

# Draw one scatter plot per requested subset size, pairing each size with
# the marker area chosen for it.
for size, marker_area in zip(sizes, point_sizes):
    # iloc slicing does not fail when the frame is shorter than `size`;
    # it simply returns every available row.
    subset_data = data.iloc[:size]
    shown = len(subset_data)

    plt.figure(figsize=(8, 6))
    plt.scatter(subset_data[x_col], subset_data[y_col], s=marker_area, alpha=0.7)

    # Report the number of points actually drawn, and say so explicitly
    # when the data set could not supply the requested amount.
    title = f"Scatter Plot with {shown} Points"
    if shown < size:
        title += f" (requested {size})"
    plt.title(title)

    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.grid(True)
    plt.show()

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 9/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 10/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 11/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 12/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

3) Create ten different versions of line charts for a


particular set of (x, y) points. Which ones are best and
which ones worst? Explain why.

import matplotlib.pyplot as plt


import numpy as np

# Example dataset for demonstration: 50 evenly spaced samples of sin(x)
# on the interval [0, 10].
x = np.linspace(0, 10, 50)  # X-axis points
y = np.sin(x)  # Y-axis points (sine wave)

# Ten style variations of the same line chart. Each dict holds keyword
# arguments for plt.plot plus an optional 'grid' flag that is consumed by
# the loop below and is not passed to matplotlib.
styles = [
    {'color': 'blue', 'linestyle': '-', 'label': 'Default Line'},  # 1. Default line
    {'color': 'green', 'linestyle': '--', 'label': 'Dashed Line'},  # 2. Dashed line
    {'color': 'red', 'linestyle': ':', 'label': 'Dotted Line'},  # 3. Dotted line
    {'color': 'purple', 'linestyle': '-.', 'label': 'Dash-Dot Line'},  # 4. Dash-dot line
    {'color': 'orange', 'linewidth': 2, 'label': 'Thicker Line'},  # 5. Thicker line
    {'color': 'black', 'marker': 'o', 'label': 'Line with Circle Markers'},  # 6. Markers
    {'color': 'cyan', 'linewidth': 1, 'marker': '*', 'label': 'Star Markers'},  # 7. Star markers
    {'color': 'pink', 'linestyle': '-', 'alpha': 0.5, 'label': 'Faded Line'},  # 8. Faded line
    {'color': 'brown', 'linestyle': '-', 'label': 'No Grid', 'grid': False},  # 9. No grid
    {'color': 'teal', 'linestyle': '-', 'label': 'With Grid', 'grid': True},  # 10. Grid
]

# Generate one figure per style.
for i, style in enumerate(styles, start=1):
    plt.figure(figsize=(8, 6))
    plt.plot(
        x,
        y,
        color=style.get('color', 'blue'),
        linestyle=style.get('linestyle', '-'),
        linewidth=style.get('linewidth', 1),
        marker=style.get('marker', None),
        alpha=style.get('alpha', 1),
        # Pass the label through so plt.legend() has an entry to show;
        # omitting it produced "No artists with labels found" warnings.
        label=style.get('label'),
    )

    plt.title(f"Line Chart {i}")
    plt.xlabel("X-axis")
    plt.ylabel("Y-axis")
    # The grid is on unless a style explicitly sets 'grid' to False.
    if style.get('grid', True):
        plt.grid(True)
    plt.legend()
    plt.show()

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 13/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that

WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that

WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 14/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that

WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 15/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that

WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 16/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that

WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 17/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 18/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 19/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

4) Search your favorite news websites until you find ten


interesting charts/plots, ideally half good and half bad.
For each, please critique along the following dimensions.

(a) Does it do a good job or a bad job of presenting the data?

(b) Does the presentation appear to be biased, either


deliberately or accidentally?

(c) Is there chartjunk in the figure?

(d) Are the axes labeled in a clear and informative way?

(e) Is the color used effectively?

(f) How can we make the graphic better?


To conduct this analysis, I will gather charts/plots from news websites, classify them as "good" or
"bad," and critique them based on the specified dimensions. Let me first search for some
examples. Please give me a moment.

Searched 1 site I’ve reviewed several examples of charts and visualizations from news and
science websites. Below are ten examples, split equally into "good" and "bad," critiqued on the
specified dimensions. Here's an overview:

Good Charts Visualization of Real-Time Weather Data on a 3D Globe

(a) Effective in presenting data clearly and interactively. (b) No evident bias; purely data-driven. (c)
Minimal chartjunk, focused design. (d) Axes aren't applicable, but labels are clear. (e) Excellent use
of gradients to represent intensity. (f) Improve by adding key explanations for better accessibility
【7】. Interactive Train Travel Time Map (Netherlands)

(a) Simplifies complex travel-time data effectively. (b) Neutral presentation. (c) No chartjunk. (d)
Clear, precise labeling. (e) Excellent use of color coding for time zones. (f) Add additional context
for international users【7】. City Heatmap Activity (Berlin)

(a) Clearly highlights urban activity patterns. (b) Objective. (c) Minimal distractions. (d) Informative
and appropriately scaled axes. (e) Vibrant but not overwhelming. (f) Add interactivity to explore
specific areas【7】. Air Pollution Coiled Chart (China)

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 20/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

(a) Illustrates improvement effectively. (b) Possible accidental bias favoring improvement. (c)
Aesthetically dense but functional. (d) Labeled effectively. (e) Thoughtful use of soft tones for
trends. (f) Simplify the coiled design to enhance readability【7】. Global Twitter Reply Map

(a) Excellent at visualizing global communication. (b) Neutral. (c) Clean and elegant. (d) Minimal,
but clear context provided. (e) Colors are intuitive for connections. (f) Incorporate filters for data
density【7】. Bad Charts 3D Bar Chart for Population Growth

(a) Misleading perspective distortions. (b) Unintended bias due to poor scaling. (c) Chartjunk in
unnecessary 3D effects. (d) Axis labels hard to interpret. (e) Overly bright, difficult to read. (f)
Redesign as a simple 2D chart【7】. Glucose Levels and Activity Chart (Diabetic Study)

(a) Overly dense; hard to discern trends. (b) Neutral. (c) Cluttered design. (d) Axis labels unclear in
parts. (e) Color choices don't help distinguish key data. (f) Simplify layers and highlight key
correlations【7】. Hurricane Instagram Photos Chart

(a) Ineffective at presenting meaningful insights. (b) Possible accidental emphasis on social
media usage. (c) Excessive visuals without purpose. (d) Poor labeling of axes. (e) Overuse of
bright, distracting colors. (f) Focus on relevant data and simplify design【7】. Daily Routine
Visualization

(a) Too complex to interpret at a glance. (b) None. (c) Distracting artistic elements. (d) Labeled
inconsistently. (e) Mixed palette reduces clarity. (f) Remove artistic overlays, focus on utility【7】.
Sunset Streets (NYC)

(a) Poorly scaled; lacks practical use. (b) Accidental bias toward aesthetics. (c) Contains
unnecessary details. (d) Axis labels unclear. (e) Overly bright, lacks depth. (f) Make design
functional, reduce decorative elements【7】.

Double-click (or enter) to edit

5) Find five laughably bad visualizations, and explain why


they are both bad and amusing.
Here are five "laughably bad" visualizations, with explanations of why they are both bad and
amusing:

1. 3D Pie Chart

Why it’s bad: Pie charts are already hard enough to interpret, but a 3D pie chart takes the confusion
to a whole new level. The 3D effect distorts the proportions of the slices, making it nearly
impossible to compare them accurately. Why it’s amusing: It's like trying to read a map with a giant
smudge over the place you're trying to find. You don’t even know if the pieces are true to size
because the angles are messing with your perception. It’s like asking someone to guess the size of
a cake by looking at a warped photo of it.

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 21/22
11/20/24, 11:44 PM Untitled45.ipynb - Colab

2. Bar Chart with Random Height Bars Why it’s bad: Imagine a bar chart where the bars are
randomly scattered across the graph—some very tall, some tiny, with no logic behind the
numbers they represent. Why it’s amusing: It's like they took a perfectly good idea (a bar
chart) and decided, "Let’s make it as random as possible, because why not?" This chart turns
data analysis into an exercise in complete confusion—like someone trying to explain a map
using abstract art.

3. "Data" Filled with 3D Clip Art Why it’s bad: Some visualizations use 3D clip art instead of
actual data points, like putting a picture of a house in place of an actual number. The chart
has pictures of things like houses or cars instead of bar heights or plotted points. Why it’s
amusing: It’s like trying to make sense of a spreadsheet that’s been replaced with emojis. It
looks cool, but you have absolutely no idea what’s going on. Imagine trying to get an answer
from a data visualization that looks like a picture book.

4. Unlabeled Axis on a Line Graph Why it’s bad: A line graph without axis labels is like a race
without a finish line. Without labels or a title, you’re left to guess what the lines represent,
making the entire chart pointless. Why it’s amusing: It's like showing someone a picture of a
desert with no context, and saying, "Look, here's the data!" You have no idea what the graph
is about or even what units you're looking at—it's like a puzzle with no clues.

5. A Venn Diagram with Too Many Circles Why it’s bad: Venn diagrams are meant to show
relationships between two or three categories. But sometimes you’ll see a Venn diagram
with 10 or 20 circles, all overlapping, making it impossible to figure out where the
commonalities lie. Why it’s amusing: It’s like a child trying to draw a simple diagram but
adding so many circles that it turns into a chaotic mess. The result is a confusing swirl of
overlap where logic is lost, and all you can do is laugh at the absurdity.

7) Analyze the Real Estate DataSet. Where is the


most/least expensive real estate located? What is the
relationship between sales price and gross square feet?

from google.colab import files


uploaded = files.upload()

Choose Files No file chosen Upload widget is only available when the cell has been executed
in the current browser session. Please rerun this cell to enable.
Saving Bengaluru House Data csv to Bengaluru House Data csv

data = pd.read_csv("Bengaluru_House_Data.csv")

import pandas as pd

https://colab.research.google.com/drive/19ampz5xV5VES2zya95Vg5q8TPhhIMi5E#scrollTo=09dd5HyzTQuA&printMode=true 22/22
11/20/24, 11:38 PM Untitled37.ipynb - Colab

q6) Analyze the movie data set (of your choice). Which types of movies are most likely to succeed in the market?

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files # Only needed for Google Colab; skip if not using Colab
uploaded = files.upload() # Opens a file dialog for upload

# Assuming the dataset is named 'movie_dataset.csv'


movies = pd.read_csv('movie_dataset.csv')

# Display basic information about the dataset


print(movies.info())
print(movies.head())

https://colab.research.google.com/drive/1kEwkjoBG6JG9jir-bTUWtQ6QGGfElfEP?authuser=1#printMode=true 1/12
11/20/24, 11:38 PM Untitled37.ipynb - Colab

Choose Files No file chosen Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to
enable.
Saving movie_dataset.csv to movie_dataset (3).csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 index 4803 non-null int64
1 budget 4803 non-null int64
2 genres 4775 non-null object
3 homepage 1712 non-null object
4 id 4803 non-null int64
5 keywords 4391 non-null object
6 original_language 4803 non-null object
7 original_title 4803 non-null object
8 overview 4800 non-null object
9 popularity 4803 non-null float64
10 production_companies 4803 non-null object
11 production_countries 4803 non-null object
12 release_date 4802 non-null object
13 revenue 4803 non-null int64
14 runtime 4801 non-null float64
15 spoken_languages 4803 non-null object
16 status 4803 non-null object
17 tagline 3959 non-null object
18 title 4803 non-null object
19 vote_average 4803 non-null float64
20 vote_count 4803 non-null int64
21 cast 4760 non-null object
22 crew 4803 non-null object
23 director 4773 non-null object
dtypes: float64(3), int64(5), object(16)
memory usage: 900.7+ KB
None
index budget genres \
0 0 237000000 Action Adventure Fantasy Science Fiction
1 1 300000000 Adventure Fantasy Action
2 2 245000000 Action Adventure Crime
3 3 250000000 Action Crime Drama Thriller
4 4 260000000 Action Adventure Science Fiction

homepage id \
0 http://www.avatarmovie.com/ 19995
1 http://disney.go.com/disneypictures/pirates/ 285
2 http://www.sonypictures.com/movies/spectre/ 206647
3 http://www.thedarkknightrises.com/ 49026
4 http://movies.disney.com/john-carter 49529

keywords original_language \
0 culture clash future space war space colony so... en
1 ocean drug abuse exotic island east india trad... en
2 spy based on novel secret agent sequel mi6 en
3 dc comics crime fighter terrorist secret ident... en
4 based on novel mars medallion space travel pri... en

original_title \
0 Avatar
1 Pirates of the Caribbean: At World's End
2 Spectre
3 The Dark Knight Rises
4 John Carter

overview popularity ... runtime \


0 In the 22nd century, a paraplegic Marine is di... 150.437577 ... 162.0
1 Captain Barbossa, long believed to be dead, ha... 139.082615 ... 169.0
2 A cryptic message from Bond’s past sends him o... 107.376788 ... 148.0
3 Following the death of District Attorney Harve... 112.312950 ... 165.0
4 John Carter is a war-weary, former military ca... 43.926995 ... 132.0

spoken_languages status \
0 [{"iso_639_1": "en", "name": "English"}, {"iso... Released
1 [{"iso_639_1": "en", "name": "English"}] Released
2 [{"iso_639_1": "fr", "name": "Fran\u00e7ais"},... Released
3 [{"iso_639_1": "en", "name": "English"}] Released
4 [{"iso_639_1": "en", "name": "English"}] Released

tagline \
0 Enter the World of Pandora.
1 At the end of the world, the adventure begins.
2 A Plan No One Escapes
3 The Legend Ends
4 Lost in our world, found in another.

https://colab.research.google.com/drive/1kEwkjoBG6JG9jir-bTUWtQ6QGGfElfEP?authuser=1#printMode=true 2/12
11/20/24, 11:38 PM Untitled37.ipynb - Colab
title vote_average vote_count \
0 Avatar 7.2 11800
1 Pirates of the Caribbean: At World's End 6.9 4500
2 Spectre 6.3 4466
3 The Dark Knight Rises 7.6 9106
4 John Carter 6.1 2124

cast \
0 Sam Worthington Zoe Saldana Sigourney Weaver S...
1 Johnny Depp Orlando Bloom Keira Knightley Stel...
2 Daniel Craig Christoph Waltz L\u00e9a Seydoux ...
3 Christian Bale Michael Caine Gary Oldman Anne ...
4 Taylor Kitsch Lynn Collins Samantha Morton Wil...

crew director
0 [{'name': 'Stephen E. Rivkin', 'gender': 0, 'd... James Cameron
1 [{'name': 'Dariusz Wolski', 'gender': 2, 'depa... Gore Verbinski
2 [{'name': 'Thomas Newman', 'gender': 2, 'depar... Sam Mendes
3 [{'name': 'Hans Zimmer', 'gender': 2, 'departm... Christopher Nolan
4 [{'name': 'Andrew Stanton', 'gender': 2, 'depa... Andrew Stanton

[5 rows x 24 columns]

# Step 1: Handle missing values
# Tally the null entries in every column and print the per-column counts.
missing_per_column = movies.isnull().sum()
print(missing_per_column)

# Rows without a 'budget' or 'revenue' value cannot feed the analysis below,
# so they are dropped from `movies` in place.
movies.dropna(subset=['budget', 'revenue'], inplace=True)

index 0
budget 0
genres 28
homepage 3091
id 0
keywords 412
original_language 0
original_title 0
overview 3
popularity 0
production_companies 0
production_countries 0
release_date 1
revenue 0
runtime 2
spoken_languages 0
status 0
tagline 844
title 0
vote_average 0
vote_count 0
cast 43
crew 0
director 30
dtype: int64

# Step 2: Create a success metric
#
# Profit margin = (revenue - budget) / budget.
# Dividing by a budget of 0 would give +/-inf (or NaN when revenue is also
# 0), and a single inf poisons any later mean. Non-positive budgets are
# therefore masked to NaN, so the margin for those rows is NaN and pandas
# aggregations skip it by default.
# NOTE(review): this presumes a 0 budget means "unknown" rather than a
# genuinely free production; confirm against the data source.
movies['profit_margin'] = (
    (movies['revenue'] - movies['budget'])
    / movies['budget'].where(movies['budget'] > 0)
)

# Step 4: Visualize the success by genre
# One bar per index entry of `genre_success`, with height taken from its
# 'average_revenue' column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=genre_success.index, y='average_revenue', data=genre_success, ax=ax)
plt.xticks(rotation=90)  # vertical tick labels so long names do not overlap
ax.set_title('Average Revenue by Genre')
ax.set_xlabel('Genre')
ax.set_ylabel('Average Revenue')
plt.show()

https://colab.research.google.com/drive/1kEwkjoBG6JG9jir-bTUWtQ6QGGfElfEP?authuser=1#printMode=true 3/12
11/20/24, 11:38 PM Untitled37.ipynb - Colab

# Step 5: Analyze the relationship between budget and revenue
# Scatter of 'revenue' against 'budget'; alpha=0.6 keeps overlapping points
# distinguishable.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='budget', y='revenue', data=movies, alpha=0.6, ax=ax)
ax.set_title('Budget vs. Revenue')
ax.set_xlabel('Budget')
ax.set_ylabel('Revenue')
plt.show()

https://colab.research.google.com/drive/1kEwkjoBG6JG9jir-bTUWtQ6QGGfElfEP?authuser=1#printMode=true 4/12
11/20/24, 11:38 PM Untitled37.ipynb - Colab

# Step 6: Analyze the relationship between popularity and revenue
# Scatter of 'revenue' against 'popularity' to see whether the two rise
# together.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='popularity', y='revenue', data=movies, alpha=0.6, ax=ax)
ax.set_title('Popularity vs. Revenue')
ax.set_xlabel('Popularity')
ax.set_ylabel('Revenue')
plt.show()

# Step 8: Visualize the success by original language


plt.figure(figsize=(12,6))
sns.barplot(x=language_success.index, y='average_revenue', data=language_success)
plt.xticks(rotation=90)
plt.title('Average Revenue by Original Language')
plt.xlabel('Original Language')

https://colab.research.google.com/drive/1kEwkjoBG6JG9jir-bTUWtQ6QGGfElfEP?authuser=1#printMode=true 5/12
11/20/24, 11:38 PM Untitled37.ipynb - Colab
plt.ylabel('Average Revenue')
plt.show()

# Step 9: Analyze the success based on movie runtime
# Scatter of 'revenue' against 'runtime'.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='runtime', y='revenue', data=movies, alpha=0.6, ax=ax)
ax.set_title('Runtime vs. Revenue')
ax.set_xlabel('Runtime (minutes)')
ax.set_ylabel('Revenue')
plt.show()

# Display top 10 directors


print(director_success)

# Step 11: Visualize top 10 directors by average revenue


lt fi (fi i (12 6))
https://colab.research.google.com/drive/1kEwkjoBG6JG9jir-bTUWtQ6QGGfElfEP?authuser=1#printMode=true 6/12
11/20/24, 11:38 PM Untitled37.ipynb - Colab
# One bar per index entry of `director_success`, with height taken from its
# 'average_revenue' column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=director_success.index, y='average_revenue', data=director_success, ax=ax)
plt.xticks(rotation=90)  # vertical tick labels so the names stay readable
ax.set_title('Top 10 Directors by Average Revenue')
ax.set_xlabel('Director')
ax.set_ylabel('Average Revenue')
plt.show()

average_revenue average_profit_margin
director
Chris Buck 1.274219e+09 7.494793
Kyle Balda 1.156731e+09 14.631499
Lee Unkrich 1.066970e+09 4.334849
Joss Whedon 9.879437e+08 3.307678
Chris Renaud 8.759583e+08 10.679444
James Cameron 8.405099e+08 6.560511
Roger Allers 7.882418e+08 16.516484
Tim Miller 7.831130e+08 12.501948
Colin Trevorrow 7.587683e+08 6.716957
Robert Stromberg 7.585398e+08 3.214110

q9) Build an interactive exploration widget for your favorite data set, using appropriate libraries and tools. Start simple, but be as creative as you
want to be.

# Import necessary libraries


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact, widgets
from google.colab import files # For file upload in Google Colab

# Step 1: Upload the dataset


uploaded = files.upload()

# Step 2: Load the dataset


# Replace with the actual file name of your dataset
df = pd.read_csv('social-media-impact-on-suicide-rates.csv')

# Display basic information about the dataset


print("\nDataset Information:")

https://colab.research.google.com/drive/1kEwkjoBG6JG9jir-bTUWtQ6QGGfElfEP?authuser=1#printMode=true 7/12
11/20/24, 11:38 PM Untitled37.ipynb - Colab
print(df.info())
print("\nPreview of the dataset:")
print(df.head())

# Step 3: Data Cleaning and Validation


# Check for missing values
print("\nChecking for missing values:")
print(df.isnull().sum())

# Ensure all percentage-change columns are numeric. errors='coerce' turns any
# value that cannot be parsed into NaN instead of raising.
columns_to_check = [
    'Suicide Rate % change since 2010',
    'Twitter user count % change since 2010',
    'Facebook user count % change since 2010',
]
for col in columns_to_check:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Fill missing values with each numeric column's median. This runs after the
# coercion above so that NaNs introduced by unparseable entries are filled
# too, and so that columns that only become numeric through coercion get a
# median at all.
df.fillna(df.median(numeric_only=True), inplace=True)

Choose Files No file chosen Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to
enable.
Saving social-media-impact-on-suicide-rates.csv to social-media-impact-on-suicide-rates (1).csv

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 year 30 non-null int64
1 sex 30 non-null object
2 Suicide Rate % change since 2010 30 non-null float64
3 Twitter user count % change since 2010 30 non-null float64
4 Facebook user count % change since 2010 30 non-null float64
dtypes: float64(3), int64(1), object(1)
memory usage: 1.3+ KB
None

Preview of the dataset:


year sex Suicide Rate % change since 2010 \
0 2010 BTSX 100.000000
1 2010 FMLE 100.000000
2 2010 MLE 100.000000
3 2011 BTSX 98.681894
4 2011 FMLE 98.183773

Twitter user count % change since 2010 \


0 100.000000
1 100.000000
2 100.000000
3 216.666667
4 216.666667

Facebook user count % change since 2010


0 100.000000
1 100.000000
2 100.000000
3 138.980263
4 138.980263

Checking for missing values:


year 0
sex 0
Suicide Rate % change since 2010 0
Twitter user count % change since 2010 0
Facebook user count % change since 2010 0
dtype: int64

# Step 4: Correlation matrix of the numeric columns.
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only numeric columns: object-typed columns such as 'sex' cannot be
# correlated. The string 'number' selects the same dtypes as np.number
# without requiring numpy to be imported in this cell.
numeric_df = df.select_dtypes(include='number')

if numeric_df.empty:
    print("No numeric columns available for correlation analysis.")
else:
    print("\nCorrelation Matrix:")
    correlation_matrix = numeric_df.corr()
    print(correlation_matrix)

    # Draw the heatmap exactly once, and only inside this branch:
    # correlation_matrix does not exist when there are no numeric columns.
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title("Correlation Matrix")
    plt.show()

https://colab.research.google.com/drive/1kEwkjoBG6JG9jir-bTUWtQ6QGGfElfEP?authuser=1#printMode=true 9/12
11/20/24, 11:38 PM Untitled37.ipynb - Colab

Correlation Matrix:
year \
year 1.000000
Suicide Rate % change since 2010 -0.963467
Twitter user count % change since 2010 0.917765
Facebook user count % change since 2010 0.998266

Suicide Rate % change since 2010 \


year -0.963467
Suicide Rate % change since 2010 1.000000
Twitter user count % change since 2010 -0.845738
Facebook user count % change since 2010 -0.967949

Twitter user count % change since 2010 \


year 0.917765
Suicide Rate % change since 2010 -0.845738
Twitter user count % change since 2010 1.000000
Facebook user count % change since 2010 0.903146

Facebook user count % change since 2010


year 0.998266
Suicide Rate % change since 2010 -0.967949
Twitter user count % change since 2010 0.903146
Facebook user count % change since 2010 1.000000

https://colab.research.google.com/drive/1kEwkjoBG6JG9jir-bTUWtQ6QGGfElfEP?authuser=1#printMode=true 10/12
11/20/24, 11:38 PM Untitled37.ipynb - Colab

# Step 5: Advanced Visualization - Trends Over Time


def plot_trends():
    """Plot the three '% change since 2010' series from ``df`` against year.

    Draws the suicide-rate, Twitter and Facebook columns as marked lines on
    one 12x6 figure, then shows it. Takes no arguments and returns nothing;
    it reads the module-level ``df``.
    """
    # (column in df, legend label) pairs, in drawing order.
    series = (
        ('Suicide Rate % change since 2010', 'Suicide Rate % Change'),
        ('Twitter user count % change since 2010', 'Twitter User % Change'),
        ('Facebook user count % change since 2010', 'Facebook User % Change'),
    )

    plt.figure(figsize=(12, 6))
    for column, legend_label in series:
        sns.lineplot(data=df, x='year', y=column, label=legend_label, marker='o')

    plt.title("Social Media Usage vs. Suicide Rates (2010-2019)", fontsize=16)
    plt.xlabel("Year", fontsize=12)
    plt.ylabel("Percentage Change", fontsize=12)
    plt.legend()
    plt.grid(True)
    plt.show()


plot_trends()

https://colab.research.google.com/drive/1kEwkjoBG6JG9jir-bTUWtQ6QGGfElfEP?authuser=1#printMode=true 11/12
11/20/24, 11:38 PM Untitled37.ipynb - Colab

# Step 6: Interactive Exploration Widget


def explore_relationship(y_axis):
    """Scatter one social-media metric against the suicide-rate column.

    Args:
        y_axis: Name of the ``df`` column to compare. Despite the parameter
            name, this column is placed on the horizontal axis; the suicide
            rate is on the vertical axis.

    Points are coloured and sized by 'year'. The figure is shown and nothing
    is returned.
    """
    suicide_col = 'Suicide Rate % change since 2010'
    scatter_kwargs = dict(hue='year', size='year', palette='viridis')

    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x=y_axis, y=suicide_col, **scatter_kwargs)

    plt.title("Impact of {} on Suicide Rates".format(y_axis), fontsize=14)
    plt.xlabel(y_axis, fontsize=12)
    plt.ylabel(suicide_col, fontsize=12)
    # Anchor the legend outside the axes, to the right of the plot area.
    plt.legend(title='Year', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True)
    plt.show()

# Columns offered in the dropdown; the first one is the initial selection.
metric_options = [
    'Twitter user count % change since 2010',
    'Facebook user count % change since 2010',
]
metric_dropdown = widgets.Dropdown(
    options=metric_options,
    value=metric_options[0],
    description='Select Metric:',
)

# Pass the dropdown as explore_relationship's y_axis argument.
interact(explore_relationship, y_axis=metric_dropdown)

Select Metric: Twitter user count % change sin

https://colab.research.google.com/drive/1kEwkjoBG6JG9jir-bTUWtQ6QGGfElfEP?authuser=1#printMode=true 12/12
8) Analyze the Olympic data set. What can you say about the
relationship between a country’s population and the number
of medals it wins?

from google.colab import files


# Ask Colab for a file upload; the result is kept in `uploaded`.
uploaded = files.upload() # This will allow you to select the file from your computer
Choose Files no files selected Upload widget is only available when the cell has been
executed in the current browser session. Please rerun this cell to enable.
Saving Paris 2024 Olympics_Nations Medals.csv to Paris 2024 Olympics_Nations Me

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the uploaded Paris 2024 medal table into a DataFrame.
medals_csv = 'Paris 2024 Olympics_Nations Medals.csv'
data = pd.read_csv(medals_csv)

# Print the leading rows as a quick sanity check of the columns.
first_rows = data.head()
print(first_rows)

NOC Gold Silver Bronze Total


0 USA 40 44 42 126
1 CHN 40 27 24 91
2 JPN 20 12 13 45
3 AUS 18 19 16 53
4 FRA 16 26 22 64

# The medal table identifies each nation by its 'NOC' code (it has no
# 'Country' column), so the population table below is keyed by the same codes.
# Only a small five-nation example is supplied here.
population_data = {
    'Country': ['USA', 'CHN', 'JPN', 'AUS', 'FRA'],
    'Population': [331002651, 1393409038, 126476461, 25499884, 65273511],  # Example figures
}

# Convert the population data into a DataFrame.
population_df = pd.DataFrame(population_data)

# Left-join population onto the medal table (NOC code == 'Country' code).
# how='left' keeps every medal row; nations absent from population_df get
# NaN in 'Country' and 'Population'. The merge is performed once only.
merged_data = pd.merge(data, population_df, left_on='NOC', right_on='Country', how='left')

# Medals per capita = total medals / population (NaN where population is unknown).
merged_data['Medals_per_Capita'] = merged_data['Total'] / merged_data['Population']

# Preview the merged data.
print(merged_data[['Country', 'Total', 'Population', 'Medals_per_Capita']])

# Only nations with a known population can be placed on the x-axis, so the
# rows left as NaN by the merge are dropped explicitly before plotting.
plot_data = merged_data.dropna(subset=['Population'])

# NOTE(review): the original scatterplot calls were cut off after `palette`
# and `hue='Count`; hue='Country' and palette='viridis' (the palette used by
# the file's other scatter plot) are reconstructions - confirm against the
# original notebook.

# Visualizing the relationship between population and total medals.
plt.figure(figsize=(12, 6))
sns.scatterplot(x='Population', y='Total', data=plot_data, hue='Country', palette='viridis')
plt.title('Population vs Total Medals (Paris 2024 Olympics)')
plt.xlabel('Population')
plt.ylabel('Total Medals')
plt.xscale('log')  # Use log scale for better visualization
plt.yscale('log')  # Log scale for total medals
plt.show()

# Visualizing the relationship between population and medals per capita.
plt.figure(figsize=(12, 6))
sns.scatterplot(x='Population', y='Medals_per_Capita', data=plot_data, hue='Country', palette='viridis')
plt.title('Population vs Medals per Capita (Paris 2024 Olympics)')
plt.xlabel('Population')
plt.ylabel('Medals per Capita')
plt.xscale('log')  # Log scale for population
plt.show()
Country Total Population Medals_per_Capita
0 USA 126 3.310027e+08 3.806616e-07
1 CHN 91 1.393409e+09 6.530746e-08
2 JPN 45 1.264765e+08 3.557974e-07
3 AUS 53 2.549988e+07 2.078441e-06
4 FRA 64 6.527351e+07 9.804896e-07
.. ... ... ... ...
86 NaN 1 NaN NaN
87 NaN 1 NaN NaN
88 NaN 1 NaN NaN
89 NaN 1 NaN NaN
90 NaN 1 NaN NaN

[91 rows x 4 columns]

Double-click (or enter) to edit

You might also like