netflix-data-analysis
July 30, 2023
1 Exploratory Data Analysis of the Netflix Userbase
[189]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
# Load the raw user-base export; the path is relative to the working directory.
data = pd.read_csv("Netflix Userbase.csv")

# Seed NumPy's global RNG so the random preview below is repeatable.
np.random.seed(1)

# Report the frame's dimensions.
n_rows, n_cols = data.shape
print(f"In this dataset {n_rows} rows and {n_cols} columns")

# Preview five randomly drawn rows as the cell output.
data.sample(n=5)
In this dataset 2500 rows and 10 columns
[189]: User ID Subscription Type Monthly Revenue Join Date Last Payment Date \
1406 1407 Basic 14 31-10-22 04-07-23
297 298 Standard 14 03-11-22 27-06-23
1276 1277 Premium 11 30-09-22 04-07-23
2368 2369 Standard 11 24-09-22 13-07-23
438 439 Standard 12 23-10-22 27-06-23
Country Age Gender Device Plan Duration
1406 Canada 45 Female Smart TV 1 Month
297 Mexico 41 Male Tablet 1 Month
1276 Brazil 30 Male Smartphone 1 Month
2368 Australia 40 Female Smartphone 1 Month
438 Spain 37 Female Smart TV 1 Month
[190]: '''
We do not need the User ID column for this analysis,
so we drop it.
'''
# Drop the identifier column by rebinding instead of mutating in place.
# errors="ignore" keeps the cell re-runnable: executing it a second time no
# longer raises KeyError because "User ID" is already gone.
data = data.drop(columns="User ID", errors="ignore")
data.head()
1
[190]: Subscription Type Monthly Revenue Join Date Last Payment Date \
0 Basic 10 15-01-22 10-06-23
1 Premium 15 05-09-21 22-06-23
2 Standard 12 28-02-23 27-06-23
3 Standard 12 10-07-22 26-06-23
4 Basic 10 01-05-23 28-06-23
Country Age Gender Device Plan Duration
0 United States 28 Male Smartphone 1 Month
1 Canada 35 Female Tablet 1 Month
2 United Kingdom 42 Male Smart TV 1 Month
3 Australia 51 Female Laptop 1 Month
4 Germany 33 Male Smartphone 1 Month
[191]: data.dtypes  # dtype of every column; the two date columns are still plain strings (object) here
[191]: Subscription Type object
Monthly Revenue int64
Join Date object
Last Payment Date object
Country object
Age int64
Gender object
Device object
Plan Duration object
dtype: object
[192]: data.isnull().sum()  # number of missing values in each column
[192]: Subscription Type 0
Monthly Revenue 0
Join Date 0
Last Payment Date 0
Country 0
Age 0
Gender 0
Device 0
Plan Duration 0
dtype: int64
[193]: #check the duplicate values
# Collect every row that exactly repeats an earlier row.
duplicates = data.loc[data.duplicated()]

# Report the outcome; the repeated rows are printed only when some exist.
if not duplicates.empty:
    print("Duplicates Found")
    print(duplicates)
else:
    print("No duplicates Found!")
No duplicates Found!
[194]: '''
Convert the date columns to datetime format
'''
# The CSV stores dates as day-month-two-digit-year strings (e.g. "31-10-22").
# Without an explicit format pandas reads a value month-first whenever its
# day part is <= 12, so "10-06-23" became 6 October 2023 instead of
# 10 June 2023 while "15-01-22" was still read day-first. Pinning the format
# parses every row the same way.
DATE_FORMAT = "%d-%m-%y"
data["Join Date"] = pd.to_datetime(data["Join Date"], format=DATE_FORMAT)
data["Last Payment Date"] = pd.to_datetime(
    data["Last Payment Date"], format=DATE_FORMAT
)

# Derive calendar features from the parsed dates.
# NOTE: "Join Year " and "Join Month " keep their trailing space on purpose;
# the line plot at the end of the notebook selects "Join Month " by that
# exact name.
data["Join Year "] = data["Join Date"].dt.year
data["Join Month "] = data["Join Date"].dt.month
data["Last Payment Year"] = data["Last Payment Date"].dt.year
data["Last Payment Month"] = data["Last Payment Date"].dt.month

# Whole days between the join date and the moment this cell runs, so the
# value changes from one execution day to the next.
data["Account till"] = (pd.to_datetime("today") - data["Join Date"]).dt.days
data.head()
[194]: Subscription Type Monthly Revenue Join Date Last Payment Date \
0 Basic 10 2022-01-15 2023-10-06
1 Premium 15 2021-05-09 2023-06-22
2 Standard 12 2023-02-28 2023-06-27
3 Standard 12 2022-10-07 2023-06-26
4 Basic 10 2023-01-05 2023-06-28
Country Age Gender Device Plan Duration Join Year \
0 United States 28 Male Smartphone 1 Month 2022
1 Canada 35 Female Tablet 1 Month 2021
2 United Kingdom 42 Male Smart TV 1 Month 2023
3 Australia 51 Female Laptop 1 Month 2022
4 Germany 33 Male Smartphone 1 Month 2023
Join Month Last Payment Year Last Payment Month Account till
0 1 2023 10 562
1 5 2023 6 813
2 2 2023 6 153
3 10 2023 6 297
4 1 2023 6 207
# Cell [195]: reduce the plan-duration text (e.g. "1 Month") to its month count.
data = data.rename(columns={"Plan Duration": "Plan Duration(months)"})

# Extract the leading number of each row with a vectorised string operation.
# Assigning a scalar to data[col] inside a row loop would broadcast one row's
# value (the first character of row 0) to every row. The regex also keeps
# multi-digit durations intact. astype(str) makes the cell safe to re-run on
# values that were already reduced; the result stays a string column so the
# downstream count plots that use it as hue behave as before.
data["Plan Duration(months)"] = (
    data["Plan Duration(months)"]
    .astype(str)
    .str.extract(r"(\d+)", expand=False)
)
data.head()
[195]: Subscription Type Monthly Revenue Join Date Last Payment Date \
0 Basic 10 2022-01-15 2023-10-06
1 Premium 15 2021-05-09 2023-06-22
2 Standard 12 2023-02-28 2023-06-27
3 Standard 12 2022-10-07 2023-06-26
4 Basic 10 2023-01-05 2023-06-28
Country Age Gender Device Plan Duration(months) Join Year \
0 United States 28 Male Smartphone 1 2022
1 Canada 35 Female Tablet 1 2021
2 United Kingdom 42 Male Smart TV 1 2023
3 Australia 51 Female Laptop 1 2022
4 Germany 33 Male Smartphone 1 2023
Join Month Last Payment Year Last Payment Month Account till
0 1 2023 10 562
1 5 2023 6 813
2 2 2023 6 153
3 10 2023 6 297
4 1 2023 6 207
[196]: """Add new columns filled with dummy (indicator) values
derived from the existing categorical features
"""
#Feature encoding
# Categorical source column -> prefix of its one-hot indicator columns.
DUMMY_PREFIXES = {
    "Subscription Type": "subscription type",
    "Country": "Country_type",
    "Gender": "Gender_type",
    "Device": "Device_type",
}

# Feature encoding: build one indicator frame per categorical column.
encoded_frames = [
    pd.get_dummies(data[column], prefix=prefix)
    for column, prefix in DUMMY_PREFIXES.items()
]
derived_columns = [name for frame in encoded_frames for name in frame.columns]
derived_columns.append("Age Bins")

# Remove derived columns left over from an earlier run of this cell before
# appending the fresh ones, so re-executing it cannot create duplicate
# indicator columns.
data = pd.concat(
    [data.drop(columns=derived_columns, errors="ignore")] + encoded_frames,
    axis=1,
)

# Feature transform: bucket ages into labelled ranges. right=False makes each
# bin include its lower edge and exclude its upper edge, which is what the
# labels describe: age 28 belongs to "28-37" and age 18 to "18-27". With the
# default right-closed bins, age 28 was labelled "18-27". The open-ended last
# edge keeps "58+" valid for any age.
data["Age Bins"] = pd.cut(
    data["Age"],
    bins=[0, 18, 28, 38, 48, 58, float("inf")],
    labels=["<18", "18-27", "28-37", "38-47", "48-57", "58+"],
    right=False,
)
data.head()
4
[196]: Subscription Type Monthly Revenue Join Date Last Payment Date \
0 Basic 10 2022-01-15 2023-10-06
1 Premium 15 2021-05-09 2023-06-22
2 Standard 12 2023-02-28 2023-06-27
3 Standard 12 2022-10-07 2023-06-26
4 Basic 10 2023-01-05 2023-06-28
Country Age Gender Device Plan Duration(months) Join Year \
0 United States 28 Male Smartphone 1 2022
1 Canada 35 Female Tablet 1 2021
2 United Kingdom 42 Male Smart TV 1 2023
3 Australia 51 Female Laptop 1 2022
4 Germany 33 Male Smartphone 1 2023
… Country_type_Spain Country_type_United Kingdom \
0 … 0 0
1 … 0 0
2 … 0 1
3 … 0 0
4 … 0 0
Country_type_United States Gender_type_Female Gender_type_Male \
0 1 0 1
1 0 1 0
2 0 0 1
3 0 1 0
4 0 0 1
Device_type_Laptop Device_type_Smart TV Device_type_Smartphone \
0 0 0 1
1 0 0 0
2 0 1 0
3 1 0 0
4 0 0 1
Device_type_Tablet Age Bins
0 0 18-27
1 1 28-37
2 0 38-47
3 0 48-57
4 0 28-37
[5 rows x 34 columns]
[197]: data.info()  # per-column non-null counts and dtypes, including the columns added above
<class 'pandas.core.frame.DataFrame'>
5
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 34 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Subscription Type 2500 non-null object
1 Monthly Revenue 2500 non-null int64
2 Join Date 2500 non-null datetime64[ns]
3 Last Payment Date 2500 non-null datetime64[ns]
4 Country 2500 non-null object
5 Age 2500 non-null int64
6 Gender 2500 non-null object
7 Device 2500 non-null object
8 Plan Duration(months) 2500 non-null object
9 Join Year 2500 non-null int64
10 Join Month 2500 non-null int64
11 Last Payment Year 2500 non-null int64
12 Last Payment Month 2500 non-null int64
13 Account till 2500 non-null int64
14 subscription type_Basic 2500 non-null uint8
15 subscription type_Premium 2500 non-null uint8
16 subscription type_Standard 2500 non-null uint8
17 Country_type_Australia 2500 non-null uint8
18 Country_type_Brazil 2500 non-null uint8
19 Country_type_Canada 2500 non-null uint8
20 Country_type_France 2500 non-null uint8
21 Country_type_Germany 2500 non-null uint8
22 Country_type_Italy 2500 non-null uint8
23 Country_type_Mexico 2500 non-null uint8
24 Country_type_Spain 2500 non-null uint8
25 Country_type_United Kingdom 2500 non-null uint8
26 Country_type_United States 2500 non-null uint8
27 Gender_type_Female 2500 non-null uint8
28 Gender_type_Male 2500 non-null uint8
29 Device_type_Laptop 2500 non-null uint8
30 Device_type_Smart TV 2500 non-null uint8
31 Device_type_Smartphone 2500 non-null uint8
32 Device_type_Tablet 2500 non-null uint8
33 Age Bins 2500 non-null category
dtypes: category(1), datetime64[ns](2), int64(7), object(5), uint8(19)
memory usage: 273.6+ KB
[198]: # Compare subscriber counts per subscription type, overall and by country
# Bar order is pinned explicitly (order of first appearance in the data) so
# the text labels below are guaranteed to line up with their bars.
subscription_order = list(data["Subscription Type"].unique())
subscription_counts = data["Subscription Type"].value_counts()

fig, (ax_total, ax_country) = plt.subplots(1, 2, figsize=(22, 20))

# Left panel: number of users per subscription type.
sns.countplot(
    x="Subscription Type",
    data=data,
    order=subscription_order,
    color="green",
    hatch="/",
    ax=ax_total,
)
ax_total.set_title("Subscriptions Count")

# Label each bar just above its top. The height comes from the actual count
# rather than a hard-coded y value, so labels stay attached to their bars if
# the data changes.
for position, plan in enumerate(subscription_order):
    ax_total.text(
        position,
        subscription_counts[plan],
        plan,
        fontsize=10,
        fontweight="bold",
        color="red",
        ha="center",
        va="bottom",
    )

# Right panel: the same counts broken down by country.
sns.countplot(
    data=data,
    x="Country",
    hue="Subscription Type",
    palette="muted",
    ax=ax_country,
)
ax_country.set_title("Subscription Type by Country")
ax_country.set_xlabel("Country")
ax_country.set_ylabel("count")
ax_country.legend(title="subscription type")
plt.show()
7
[199]: #gender distribution
plt.figure(figsize=(8, 6))
gender = data.Gender.value_counts()

# Take the wedge labels from the counted index so each wedge is named after
# the category it measures. value_counts() orders by frequency, so a fixed
# ["Male", "Female"] list mislabels both wedges whenever Female is the larger
# group.
index = gender.index.tolist()
values = gender.values.tolist()

# Colours and explode offsets are positional (largest group first); the
# two-element lists assume exactly two gender categories.
colors = ["m", "c"]
exp = [0.01, 0.1]

plt.pie(
    values,
    labels=index,
    autopct="%.2f%%",
    colors=colors,
    explode=exp,
    shadow=True,
    startangle=80,
)
plt.title('Gender Distribution')
plt.show()
[200]: #age distribution
# Histogram of user ages (bar height = number of users per bin).
fig, ax = plt.subplots(figsize=(12, 10))
hist_style = dict(stat="count", color="b", edgecolor="cyan", lw=2)
sns.histplot(data=data, x="Age", ax=ax, **hist_style)
ax.set_title("Age Distribution")
plt.show()
9
# Cell [201]: user counts per subscription type, split by age bin.
fig, ax = plt.subplots(figsize=(8, 6))
bar_style = dict(edgecolor="blue", lw=1)
sns.countplot(
    data=data,
    x="Subscription Type",
    hue="Age Bins",
    ax=ax,
    **bar_style,
)
ax.set_title("Age by Subscription type")
ax.legend(loc="upper right")
plt.show()
10
[202]: #monthly revenue by country and device revenue monthly
# Two side-by-side bar charts of "Monthly Revenue": per device (left) and
# per country (right). Bar height is seaborn's default estimate for barplot.
fig, (ax_device, ax_country) = plt.subplots(1, 2, figsize=(25, 23))

# Device by revenue
sns.barplot(
    data=data,
    x="Device",
    y="Monthly Revenue",
    edgecolor="blue",
    ax=ax_device,
)
ax_device.set_title("Monthly revenue each device")

# Country by revenue
sns.barplot(
    data=data,
    x="Country",
    y="Monthly Revenue",
    edgecolor="blue",
    ax=ax_country,
)
ax_country.set_title("Monthly revenue each country")
plt.show()
11
[203]: #People of this age are using this device
# Number of users in each age bin, split by the device they use.
ax = sns.countplot(
    x="Age Bins",
    hue="Device",
    data=data,
    edgecolor="blue",
    lw=1,
)
ax.set_title("Age by device")
# The bar height is a user count; "Device" is what the hue legend encodes,
# so it must not be used as the y-axis label.
ax.set_ylabel("Count")
plt.show()
12
[204]: #monthly revenue by device
# Bar chart of "Monthly Revenue" for each device type.
device_bar_args = dict(x="Device", y="Monthly Revenue", edgecolor="blue")
sns.barplot(data=data, **device_bar_args)
plt.show()
13
[205]: #percentage of subscription
# Share of users on each subscription type, largest group first.
x = data["Subscription Type"].value_counts(ascending=False)
index = x.index.tolist()  # wedge labels in the same order as the counts
var = [0, 0.02, 0.1]      # explode offset of each wedge

fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    x.values,
    labels=index,
    autopct="%.2f%%",
    explode=var,
    shadow=True,
    startangle=90,
)
plt.show()
14
[206]: #Subscription revenue every month
# "Monthly Revenue" per subscription type, with one bar per device.
ax = sns.barplot(
    data=data,
    x="Subscription Type",
    y="Monthly Revenue",
    hue="Device",
    edgecolor="blue",
    lw=1,
)
ax.legend(loc="upper right")
ax.set_title("Subscription Revenue Monthly")
plt.show()
15
[207]: #subscription type by plan duration
# Both panels are count plots: bar height is the number of users, and the
# plan duration is encoded by the hue. The y-axes are therefore labelled
# "Count" rather than with the hue variable's name.
fig, (ax_subscription, ax_age) = plt.subplots(1, 2, figsize=(14, 10))

# Left panel: users per subscription type, split by plan duration.
sns.countplot(
    data=data,
    x="Subscription Type",
    hue="Plan Duration(months)",
    palette="muted",
    ax=ax_subscription,
)
ax_subscription.set_title("Subscription Type by plan Duration")
ax_subscription.set_ylabel("Count")

# Right panel: users per age bin, split by plan duration.
sns.countplot(
    data=data,
    x="Age Bins",
    hue="Plan Duration(months)",
    palette="muted",
    ax=ax_age,
)
# Rebuilding the legend to move it drops seaborn's title, so set it again.
ax_age.legend(loc="upper right", title="Plan Duration(months)")
ax_age.set_title("Plan Duration Age Bins")
ax_age.set_ylabel("Count")
plt.show()
16
[208]: #Monthly revenue
# Mean "Monthly Revenue" plotted against the month in which users joined.
# The x column name "Join Month " carries a trailing space, matching the
# column created during feature engineering.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=data,
    x="Join Month ",
    y="Monthly Revenue",
    estimator="mean",
    color="blue",
    ax=ax,
)
ax.set_title("Monthly Revenue Over Time")
plt.show()
17
2 End!
18