----------------------------------Coding
Library---------------------------------------------
##Import popular library for EDA/Visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#Be careful when suppressing warnings: 'ignore' also hides deprecation notices and real problems
import warnings
warnings.filterwarnings('ignore')
##Read file/get the data (csv/excel)
-----------------------------------------------------------------------------------
-----------
##Method 1: Read csv file (path is relative to the current working directory)
df = pd.read_csv('titanic_train.csv')
##Actual file path (absolute); backslashes are doubled so they are not
##interpreted as escape sequences inside the string literal
df = pd.read_csv('D:\\For Dan\\Learning\\Udemy\\Python\\P4-Demographic-Data.csv')
##Method 2: Change Working Directory, then read using only the file name
import os
print(os.getcwd())
# -->C:\Users\wooju\Desktop\Python Programing   (example output of the print above)
os.chdir('D:\\For Dan\\Learning\\Udemy\\Python')
df = pd.read_csv('P4-Demographic-Data.csv')
##Show the column labels of the loaded DataFrame
df.columns
##Column rename: assign a full list of new labels, one per column, in order
stats.columns = [
    'CountryName',
    'CountryCode',
    'BirthRate',
    'InternetUsers',
    'IncomeGroup',
]
##Select a column to get the unique items within that column
df['IncomeGroup'].unique()
##Structure summary, then descriptive statistics
df.info()
df.describe()
##Transposed variant: df.describe().transpose()
##Filtering on more than one condition: combine boolean masks with & (and) or | (or)
high_birth_rate = df['BirthRate'] >= 40
low_internet_use = df['InternetUsers'] < 2
df[high_birth_rate & low_internet_use]
##Filtering on a single condition
df[df['CountryName'] == 'Malta']
##Convert a column to the pandas 'category' dtype
movies['Genre'] = movies['Genre'].astype('category')
----------------------------------Visualization
---------------------------------------------
import seaborn as sns
##Theme choice: the two calls are alternatives; run in sequence, the second one is applied last
sns.set_style('darkgrid')
sns.set_style('whitegrid')
##Figure size: first via rcParams, then for one explicitly created figure
plt.rcParams['figure.figsize'] = (8, 4)
plt.figure(figsize=(8, 4))
##Histogram/Distribution
sns.set()
vis1 = sns.distplot(
    stats['InternetUsers'],
    bins=20,
    hist_kws={"edgecolor": "Black"},
)
plt.show()
##Plain matplotlib histogram of a single column
plt.hist(movies.AudienceRatings, bins=15)
#With filter: keep only the rows whose Genre is 'Drama' before plotting
drama_rows = movies[movies.Genre == 'Drama']
h1 = plt.hist(drama_rows.BudgetMillion)
##Stacked column chart: one stacked histogram layer per Genre category
# Legend labels are the category names; listgen holds the matching budget
# series in the same order, so label i describes layer i.
listlabel = list(movies.Genre.cat.categories)
listgen = [movies[movies.Genre == gen].BudgetMillion for gen in listlabel]
sns.set_style('darkgrid')
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
# Pass the list of per-genre series so that stacked=True layers them
h2 = plt.hist(listgen, bins=20, stacked=True, rwidth=1, label=listlabel)
plt.title('Movie Budget Distribution', fontsize=30)
plt.ylabel('Number of Movies', fontsize=15)
plt.xlabel('Budget', fontsize=15)
plt.yticks(fontsize=15)
plt.xticks(fontsize=15)
plt.legend(frameon=True, fancybox=True, shadow=True, fontsize=15)
plt.show()
##Subplot: one row, two columns, with both axes shared between the panels
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6),
                       sharex=True, sharey=True)
##Left panel: budget vs audience rating
k3 = sns.kdeplot(movies.BudgetMillion, movies.AudienceRatings,
                 ax=axes[0], cmap='Greens')
##Right panel: budget vs critic rating
k4 = sns.kdeplot(movies.BudgetMillion, movies.CriticRatings,
                 ax=axes[1])
##Custom x-axis range, set on the left panel (sharex=True was requested above)
k3.set(xlim=(-20, 160))
plt.show()
##violin plot: CriticRatings distribution per Genre
w = sns.violinplot(x='Genre', y='CriticRatings', data=movies)
##Boxplot: BirthRate distribution per IncomeGroup
sns.set()
vis2 = sns.boxplot(x='IncomeGroup', y='BirthRate', data=stats)
##Linear Model plot with the regression fit switched off (fit_reg=False),
##points coloured by IncomeGroup
vis3 = sns.lmplot(
    x='InternetUsers',
    y='BirthRate',
    data=stats,
    hue='IncomeGroup',
    fit_reg=False,
    size=10,
    aspect=1,
)
##Jointplot: default kind first, then the 'kde' kind for the same pair of columns
j = sns.jointplot(x='CriticRatings', y='AudienceRatings', data=movies)
j = sns.jointplot(x='CriticRatings', y='AudienceRatings', data=movies,
                  kind='kde')
##FacetGrid: one scatter panel per (Genre row, YearRelease column) pair
# Controlling Axes and Adding Diagonals
g = sns.FacetGrid(movies, row='Genre', col='YearRelease', hue='Genre')
# Marker styling forwarded to plt.scatter through FacetGrid.map
kws = dict(s=50, edgecolor='black', linewidth=0.5)
g = g.map(plt.scatter, 'CriticRatings', 'AudienceRatings', **kws)
# Same axis limits on every panel
g.set(xlim=(0, 100), ylim=(20, 100))
# Dashed grey reference line from the lower-left to the upper-right corner
# of the limits set above, drawn on every panel
for ax in g.axes.flat:
    ax.plot((0, 100), (20, 100), c='grey', ls='--')
g.add_legend()
plt.show()
-----------------------------------------------------------------------------------
-----------
#sns.set_style('darkgrid') #white, whitegrid, dark, darkgrid
##Dashboard: 2x2 grid of plots drawn on a black axes background
sns.set_style('dark', {'axes.facecolor':'Black'})
f, axes = plt.subplots(2, 2, figsize=(15, 15))
##Top-left: budget vs audience rating - shaded KDE with a second, unshaded KDE on top
k1 = sns.kdeplot(movies.BudgetMillion, movies.AudienceRatings,
                 shade=True, shade_lowest=True, cmap='inferno',
                 ax=axes[0, 0])
k1b = sns.kdeplot(movies.BudgetMillion, movies.AudienceRatings, cmap='PuBu',
                  ax=axes[0, 0])
##Top-right: budget vs critic rating - same two-layer approach
k2 = sns.kdeplot(movies.BudgetMillion, movies.CriticRatings,
                 shade=True, shade_lowest=True, cmap='inferno',
                 ax=axes[0, 1])
k2b = sns.kdeplot(movies.BudgetMillion, movies.CriticRatings, cmap='cool',
                  ax=axes[0, 1])
##Bottom-left: budget distribution per release year
v = sns.violinplot(data=movies, x='YearRelease', y='BudgetMillion',
                   palette='YlOrRd',
                   ax=axes[1, 0])
##Bottom-right: critic vs audience rating; the lowest level is left unshaded here
k4 = sns.kdeplot(movies.CriticRatings, movies.AudienceRatings,
                 shade=True, shade_lowest=False, cmap='Blues_r',
                 ax=axes[1, 1])
k4b = sns.kdeplot(movies.CriticRatings, movies.AudienceRatings,
                  cmap='gist_gray_r',
                  ax=axes[1, 1])
##Same custom x-axis range for both budget panels
k1.set(xlim=(-20, 200))
k2.set(xlim=(-20, 200))
plt.show()
def myplot(data, playerlist = Players):
    """Plot one dashed, marked line per player and show the figure.

    Parameters
    ----------
    data
        Indexable object; ``data[Pdict[name]]`` is plotted for each player
        (presumably one value per season - confirm against the caller).
    playerlist
        Iterable of player names. Each name must be a key of ``Pdict`` and
        of the colour/marker tables below. Defaults to ``Players``.
    """
    # Line colour per player
    Col = {"KobeBryant":'Black',"JoeJohnson":'green',"LeBronJames":'red',
           "CarmeloAnthony":'y',"DwightHoward":'k',"ChrisBosh":'m',
           "ChrisPaul":'b',"KevinDurant":'k',"DerrickRose":'c',"DwayneWade":'m'}
    # Marker symbol per player
    # NOTE(review): "ChrisBosh" maps to an empty string, i.e. no marker is
    # requested for that line - confirm this is intended.
    Mkers = {"KobeBryant":"o","JoeJohnson":"D","LeBronJames":"^",
             "CarmeloAnthony":"*","DwightHoward":"v","ChrisBosh":'',
             "ChrisPaul":"p","KevinDurant":"D","DerrickRose":"H","DwayneWade":"^"}
    for name in playerlist:
        # Keyword names are case-sensitive: it must be `marker`, not `Marker`
        plt.plot(data[Pdict[name]], c=Col[name], ls='--',
                 marker=Mkers[name], ms=8, label=name)
    # Anchor the legend's upper-left corner at axes coordinates (1, 1)
    plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
    # Label tick positions 0..9 with the entries of Seasons
    plt.xticks(list(range(0, 10)), Seasons, rotation='horizontal')
    plt.show()
------------------------------------------Machine
Learning-------------------------------------
----------------LinearRegression
----------------LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
# Two alternative estimators shown side by side
lm = LinearRegression()
logmodel = LogisticRegression()
lm.fit(X_train, y_train)
logmodel.fit(X_train, y_train)
# `predictions` is assigned twice, so the logistic-regression output is the
# one used by the metrics below
predictions = lm.predict(X_test)
predictions = logmodel.predict(X_test)
from sklearn.metrics import confusion_matrix
# Despite its name, `accuracy` holds the confusion matrix; the accuracy
# score itself is `acscore`
accuracy = confusion_matrix(y_test, predictions)
from sklearn.metrics import accuracy_score
acscore = accuracy_score(y_test, predictions)
#F1-Score??
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))
---------------------KNN
from sklearn.preprocessing import StandardScaler
##Standardize every feature column (everything except the target)
features = df.drop('TARGET CLASS', axis=1)
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)
# Reuse the feature frame's own column labels so the result is correct
# wherever 'TARGET CLASS' sits in df
df_feat = pd.DataFrame(scaled_features, columns=features.columns)
from sklearn.model_selection import train_test_split
X = df_feat
y = df['TARGET CLASS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=101)
from sklearn.neighbors import KNeighborsClassifier
##Baseline model with a single neighbour
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
##Find the K-value with the lowest error rate
error_rate = []
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    predictions = knn.predict(X_test)
    # Share of test rows whose prediction differs from the true label
    error_rate.append(np.mean(predictions != y_test))
plt.figure(figsize=(10, 6))
plt.plot(range(1, 40), error_rate, color='blue', linestyle='-', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs K-value')
plt.xlabel('K')
# The y values are the error rates collected above
plt.ylabel('Error Rate')
##Refit with K=17 (presumably the value read off the error-rate plot) and report the metrics
knn = KNeighborsClassifier(n_neighbors=17).fit(X_train, y_train)
predictions = knn.predict(X_test)
print(confusion_matrix(y_test, predictions))
print('\n')
print(classification_report(y_test, predictions))
--------------------------------------Decision
Tree-----------------------------------------
##Target is the 'not.fully.paid' column; every other column is a feature
target_col = 'not.fully.paid'
y = final_data[target_col]
X = final_data.drop(target_col, axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101)
##Single decision tree
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier().fit(X_train, y_train)
predictions = dtree.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, predictions))
print('\n')
print(classification_report(y_test, predictions))
##Random forest with 200 trees, evaluated on the same split
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print(confusion_matrix(y_test, rfc_pred))
print('\n')
print(classification_report(y_test, rfc_pred))
------------------------------Standardisation vs Max-Min
Normalization----------------------------------------------------
##Feature scaling: puts columns with different units/magnitudes on a comparable scale
##Standardisation
#Import library
from sklearn.preprocessing import StandardScaler
#Fit on df and scale it in one step; sc_X now holds the scaled values, not the scaler
sc_X = StandardScaler().fit_transform(df)
#Convert to table format - StandardScaler
sc_X = pd.DataFrame(
    data=sc_X,
    columns=[
        "Age",
        "Salary",
        "Purchased",
        "Country_France",
        "Country_Germany",
        "Country_spain",
    ],
)
sc_X
##Max-Min Normalization
from sklearn.preprocessing import MinMaxScaler
#Fit the scaler on df, then transform the same data with it
scaler = MinMaxScaler().fit(df)
scaled_features = scaler.transform(df)
#Convert to table format - MinMaxScaler
df_MinMax = pd.DataFrame(
    data=scaled_features,
    columns=[
        "Age",
        "Salary",
        "Purchased",
        "Country_France",
        "Country_Germany",
        "Country_spain",
    ],
)