PracticalWeek02
PracticalWeek02
Exercise:
In [1]:
# Line Wrapping in Collaboratory Google results
# put this in the first cell of your notebook
def set_css():
display(HTML('''
<style>
pre {
white-space: pre-wrap;
}
</style>
'''))
get_ipython().events.register('pre_run_cell', set_css)
In [2]:
import io
import requests
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
Mounted at /content/gdrive
Banking_Marketing_df
german_credit_df
In [3]:
# Import dataset
import pandas as pd
DATA_DIR_1 = "/content/gdrive/MyDrive/Colab Notebooks/210412-ITS70304/Banking_Marketing.csv"
Banking_Marketing_df = pd.read_csv (DATA_DIR_1, header=0)
age float64
job object
marital object
education object
default object
housing object
loan object
contact object
month object
day_of_week object
duration float64
campaign int64
pdays int64
previous int64
poutcome object
emp_var_rate float64
cons_price_idx float64
cons_conf_idx float64
euribor3m float64
nr_employed float64
y int64
dtype: object
In [ ]:
print("Find missing value of each column using isna()")
print (Banking_Marketing_df.isna().sum())
In [ ]:
print("\nRemove all rows with missing data by using dropna()")
data = Banking_Marketing_df.dropna ()
print(data.isna().sum())
In [ ]:
print(Banking_Marketing_df.isna().sum())
age 2
job 0
marital 0
education 0
default 0
housing 0
loan 0
contact 6
month 0
day_of_week 0
duration 7
campaign 0
pdays 0
previous 0
poutcome 0
emp_var_rate 0
cons_price_idx 0
cons_conf_idx 0
euribor3m 0
nr_employed 0
y 0
dtype: int64
1.2 - Imputation
Dataset: Banking_Marketing.csv
In [ ]:
# Computation of the Mean value by using mean ()
mean_age = Banking_Marketing_df.age.mean ()
print()
print ("Mean age: %.2f" % mean_age)
In [ ]:
# Computation of Median value by using median ()
# used median because the 'duration' variable is too diverse
median_duration = Banking_Marketing_df.duration.median()
print ("\nMedian duration: %.2f" % median_duration)
In [ ]:
# Computation of the Mean value by using mean ()
mean_age = Banking_Marketing_df.age.mean ()
print()
print ("Mean age: %.2f" % mean_age)
# impute using fillna. Used mode to find the most popular contact
Banking_Marketing_df.contact.fillna (mode_contact, inplace = True)
print("\nImpute missing data with mode (most popular contact):")
print (Banking_Marketing_df.isna().sum())
In [4]:
DATA_DIR_2 = "/content/gdrive/MyDrive/Colab Notebooks/210412-ITS70304/german_credit_data.csv"
german_credit_df = pd.read_csv (DATA_DIR_2, header=0)
In [6]:
german_credit_df.shape
In [5]:
# Display a BoxPlot
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sbn
sbn.boxplot(german_credit_df['Age'])
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argum
ent will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
FutureWarning
IQR: 15.00
Lower_Fence: 4.50
Upper_Fence: 64.50
Display Outliers
Unnamed: 0 Age Sex ... Credit amount Duration Purpose
0 0 67 male ... 1169 6 radio/TV
75 75 66 male ... 1526 12 car
137 137 66 male ... 766 12 radio/TV
163 163 70 male ... 7308 10 car
179 179 65 male ... 571 21 car
186 186 74 female ... 5129 9 car
187 187 68 male ... 1175 16 car
213 213 66 male ... 1908 30 business
330 330 75 male ... 6615 24 car
430 430 74 male ... 3448 5 business
438 438 65 male ... 3394 42 repairs
536 536 75 female ... 1374 6 car
554 554 67 female ... 1199 9 education
606 606 74 male ... 4526 24 business
624 624 65 male ... 2600 18 radio/TV
723 723 66 female ... 790 9 radio/TV
756 756 74 male ... 1299 6 car
774 774 66 male ... 1480 12 car
779 779 67 female ... 3872 18 repairs
807 807 65 male ... 930 12 radio/TV
846 846 68 male ... 6761 18 car
883 883 65 female ... 1098 18 radio/TV
917 917 68 male ... 14896 6 car
Why Seaborn Boxplot still showing outliers, after removing the outliers?
Seaborn uses inter-quartile range to detect the outliers. When you remove outliers, the number of data changes thus its quantile changes
Let's investigate by computing a new quantile range after remove the outliers.
IQR: 15.00
Lower_Fence: 4.50
Upper_Fence: 64.50
IQRb: 14.00
Lower_Fence_b: 6.00
Upper_Fence_b: 63.00
The new upper fence now is at 63, if you check the condition based on the new upper and lower fence, you will see there a 5 rows with outliers (german_credit_remOutliers["Age"] < Lower_Fence_b) |
(german_credit_remOutliers["Age"] > Upper_Fence_b)
But if you check the condition against the firstly calculated upper and lower fence, you will get an empty array print (german_credit_remOutliers[((german_credit_remOutliers["Age"] < Lower_Fence) |
(german_credit_remOutliers["Age"] > Upper_Fence))])
In [26]:
german_credit_remOutliers = (german_credit_df[~((german_credit_df["Age"] < Lower_Fence) | (german_credit_df["Age"] > Upper_Fence))])
german_credit_remOutliers.shape
In [28]:
# Compute new quantile range after remove outliers
# Compute the Interquartile Range (IQR)
Q1b = german_credit_remOutliers['Age'].quantile(0.25)
Q3b = german_credit_remOutliers['Age'].quantile(0.75)
IQRb = Q3b - Q1b
print ("IQRb: %.2f" %IQRb)
IQRb: 14.00
Lower_Fence: 6.00
Upper_Fence: 63.00
In [31]:
sbn.boxplot(german_credit_remOutliers['Age'])
# Use showfliers=False if you want to disable outliers from boxplot
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argum
ent will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
FutureWarning
Out[31]: <matplotlib.axes._subplots.AxesSubplot at 0x7fd10cc710d0>
In [33]:
# Check condition based on the firstly calculated IQR → results return empty df
# Display Outliers and Filtering Out the Outliers
print("\nDisplay Outliers")
print (german_credit_remOutliers[((german_credit_remOutliers["Age"] < Lower_Fence) | (german_credit_remOutliers["Age"] > Upper_Fence))])
Display Outliers
Empty DataFrame
Columns: [Unnamed: 0, Age, Sex, Job, Housing, Saving accounts, Checking account, Credit amount, Duration, Purpose]
Index: []
In [35]:
# Check condition based on the newly calculated IQR → results return 5 rows with outliers
# Note that the age 64 > new upper fence 63
# Display Outliers and Filtering Out the Outliers
print("\nDisplay Outliers")
print (german_credit_remOutliers[((german_credit_remOutliers["Age"] < Lower_Fence_b) | (german_credit_remOutliers["Age"] > Upper_Fence_b))])
Display Outliers
Unnamed: 0 Age Sex ... Credit amount Duration Purpose
219 219 64 female ... 1364 10 car
629 629 64 male ... 3832 9 education
678 678 64 male ... 2384 24 radio/TV
976 976 64 female ... 753 6 radio/TV
987 987 64 female ... 1409 13 radio/TV
[5 rows x 10 columns]
2. Data Integration
Dataset:
1. student.csv
2. marks.csv
In [ ]:
# Import dataset
import pandas as pd
DATA_DIR_3 = "/content/gdrive/MyDrive/Colab Notebooks/210412-ITS70304/student.csv"
DATA_DIR_4 = "/content/gdrive/MyDrive/Colab Notebooks/210412-ITS70304/marks.csv"
student_df = pd.read_csv (DATA_DIR_3, header=0)
marks_df = pd.read_csv (DATA_DIR_4, header=0)
In [ ]:
#Checking of Data
print (student_df.head())
print (marks_df.head())
3. Data Transformation
- Replacement of Categorical Data with Numbers (student.csv)
- Label encoding (Banking_Marketing.csv)
- Transforming Data of Different Scale (Wholesale customers data.csv)
Numerical Data
Categorical Data
Dataset:
1. student.csv
2. Banking_Marketing.csv
3. Wholesale customers data.csv
In [ ]:
import numpy as np
In [ ]:
print(df_categorical['Grade'].unique())
In [ ]:
print(df_categorical.Grade.value_counts())
2nd Class 80
3rd Class 80
1st Class 72
Name: Grade, dtype: int64
In [ ]:
print(df_categorical.Gender.value_counts())
Male 136
Female 96
Name: Gender, dtype: int64
In [ ]:
print(df_categorical.Employed.value_counts())
no 133
yes 99
Name: Employed, dtype: int64
Warning:
In [ ]:
df_categorical.Grade.replace({"1st Class": 1, "2nd Class": 2, "3rd Class": 3 }, inplace=True)
/usr/local/lib/python3.7/dist-packages/pandas/core/series.py:4582: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
In [ ]:
df_categorical.Gender.replace({"Male": 0, "Female": 1}, inplace=True)
/usr/local/lib/python3.7/dist-packages/pandas/core/series.py:4582: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
In [ ]:
df_categorical.Employed.replace({"yes": 1, "no": 2}, inplace=True)
print (df_categorical.head())
Dataset:
Banking_Marketing.csv
(this dataset already imported previously and used as 'Banking_Marketing_df')
In [ ]:
# Read Dataset and import LabelEncoder from sklearn.preprocessing package
from sklearn.preprocessing import LabelEncoder
print (Banking_Marketing_df.head())
[5 rows x 21 columns]
In [ ]:
# Remove Missing Data
Banking_Marketing_df = Banking_Marketing_df.dropna()
In [ ]:
# Select Non-Numerical Columns
data_column_category = Banking_Marketing_df.select_dtypes (exclude=[np.number]).columns
print (data_column_category)
print (Banking_Marketing_df[data_column_category].head())
[5 rows x 10 columns]
In [ ]:
# Iterate through column to convert to numeric data using LabelEncoder ()
label_encoder = LabelEncoder()
for i in data_column_category:
Banking_Marketing_df[i] = label_encoder.fit_transform (Banking_Marketing_df[i])
In [ ]:
print("Label Encoder Data:")
print(Banking_Marketing_df.head())
[5 rows x 21 columns]
In [ ]:
DATA_DIR_5 = "/content/gdrive/MyDrive/Colab Notebooks/210412-ITS70304/Wholesale customers data.csv"
In [ ]:
# Read Dataset
from sklearn import preprocessing
WholesaleData_df = pd.read_csv (DATA_DIR_5, header=0)
print (WholesaleData_df.head())
In [ ]:
null_ = WholesaleData_df.isna().any()
In [ ]:
dtypes = WholesaleData_df.dtypes
In [ ]:
# Check for Missing Data
null_ = WholesaleData_df.isna().any()
dtypes = WholesaleData_df.dtypes
info = pd.concat ([null_,dtypes], axis = 1, keys = ['Null', 'type'])
print(info) # This is different way of viewing data
Null type
Channel False int64
Region False int64
Fresh False int64
Milk False int64
Grocery False int64
Frozen False int64
Detergents_Paper False int64
Delicassen False int64
In [ ]:
# Perform Standard Scaling and Implement fit_transform () method
std_scale = preprocessing.StandardScaler().fit_transform (WholesaleData_df)
scaled_frame = pd.DataFrame (std_scale, columns = WholesaleData_df.columns)
print (scaled_frame.head(25))
In [ ]:
# Using MinMax Scaler Method
minmax_scale = preprocessing.MinMaxScaler().fit_transform (WholesaleData_df)
scaled_frame = pd.DataFrame (minmax_scale, columns = WholesaleData_df.columns)
print (scaled_frame.head())
[5 rows x 8 columns]
4. Data Discretization
A process of converting continuous data into discrete buckets by grouping it.
Dataset: Student_bucketing.csv
In [ ]:
DATA_DIR_6 = "/content/gdrive/MyDrive/Colab Notebooks/210412-ITS70304/Student_bucketing.csv"
In [ ]:
StudentBucketing_df = pd.read_csv (DATA_DIR_6, header=0)
print (StudentBucketing_df.head())
In [ ]:
# Perform Bucketing using pd.cut ()
StudentBucketing_df['bucket']=pd.cut(StudentBucketing_df['marks'], 5, labels = ['Poor', 'Below_average', 'Average', 'Above_Average','Excellent'])
In [ ]:
print (StudentBucketing_df.head(10))
In [ ]:
# Perform Bucketing using pd.cut ()
StudentBucketing_df['bucket']=pd.cut(StudentBucketing_df['marks'], 3, labels = ['Poor', 'Average', 'Excellent'])
print (StudentBucketing_df.head(10))