Jashan ML
Jashan ML
of
MACHINE LEARNING
Submitted by:
Submitted to:
Dr. Brahmaleen K. Sidhu (Assistant Professor)
1. CSV file:
import pandas as pd

# Path of the CSV file to load.
csv_file_path = 'C:/Users/bakin/OneDrive/Documents/Downloads/fifa.csv'

# Load the data into a data frame and display it.
df = pd.read_csv(csv_file_path)
print(df)
2. Excel file:
import pandas as pd

# Path of the Excel workbook to load.
excel_file_path = 'C:/path/to/your/file.xlsx'

# Load the named worksheet into a data frame and display it.
df = pd.read_excel(excel_file_path, sheet_name='sheet_name')
print(df)
3. SQL database:
import pandas as pd
import sqlalchemy

# Database connection string.
db_con_string = 'postgresql://user:password@localhost:5432/mydatabase'

# Create the database connection object that pandas will query through.
eng = sqlalchemy.create_engine(db_con_string)

# Write the SQL query, load its result into a data frame and display it.
query = 'SELECT * FROM mytable'
df = pd.read_sql_query(query, con=eng)
print(df)
4. SPSS file:
import pandas as pd
import spss  # NOTE(review): not referenced anywhere in this snippet — confirm it is needed

# Path of the SPSS (.sav) file to load.
spss_file_path = 'C:/path/to/your/file.sav'

# Load the data into a data frame and display it.
df = pd.read_spss(spss_file_path)
print(df)
# pyplot must be imported before its first use; this snippet had no import of its own.
import matplotlib.pyplot as plt

# Paired sample values: x goes on the horizontal axis, y on the vertical axis.
x = [5, 7, 8, 7, 2, 17, 2, 9, 4, 11, 12, 9, 6]
y = [99, 86, 87, 88, 111, 86, 103, 87, 94, 78, 77, 85, 86]

# Draw one marker per (x, y) pair and display the figure.
plt.scatter(x, y)
plt.show()
O/P:
2. LINE CHART:
import matplotlib.pyplot as plt
import numpy as np

# Data values: four X-axis points, and twice each of them for the Y axis.
x = np.array([1, 2, 3, 4])
y = 2 * x

# Plot the chart and display it.
plt.plot(x, y)
plt.show()
O/P:
3. Bar Chart:
import pandas as pd
import matplotlib.pyplot as plt

# Sample records: one entry per person with an age and a height value.
data = {
    'name': ['john', 'tom', 'joe'],
    'age': [32, 45, 56],
    'height': [150, 180, 160],
}
df = pd.DataFrame(data)

# Grouped bar chart: one group per name, with one bar each for age and height.
axes = df.plot(x='name', y=['age', 'height'], kind='bar')
axes.set_title('bar chart:age and height')
axes.set_xlabel('name')
axes.set_ylabel('values')
plt.show()
O/P:
PRACTICAL 3. Illustrate various methods of slicing a pandas dataframe.
1. Selecting columns:
import pandas as pd

# Build a small data frame with one row per person.
people = {
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 32, 37],
    'city': ['NYC', 'LA', 'Chicago'],
}
df = pd.DataFrame(people)

# The 'name' column can be selected with bracket indexing ...
name = df['name']
# ... or with attribute access; both give the same column.
name = df.name
print(name)
O/P:
0 Alice
1 Bob
2 Charlie
Name: name, dtype: object
2. Selecting Rows.
import pandas as pd

# create a data frame
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 32, 37],
    'city': ['NYC', 'LA', 'Chicago'],
})

# Select the row with index 1 as a one-row data frame, using a positional slice.
# NOTE(review): the original expression on this line was garbled; the slice is a
# reconstruction — confirm. Its result is replaced on the next line before printing.
row_1 = df[1:2]
# or, with a boolean mask on the index
row_1 = df[df.index == 1]
print(row_1)

# Select the row with index 1, using the .loc method (returns a Series).
row_1 = df.loc[1]
print(row_1)

# Select the row with index 1, using the .iloc method (returns a Series).
row_1 = df.iloc[1]
print(row_1)
O/P:
name age city
1 Bob 32 LA
name Bob
age 32
city LA
Name: 1, dtype: object
name Bob
age 32
city LA
Name: 1, dtype: object
3. Selecting cells.
import pandas as pd

# create a data frame
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 32, 37],
    'city': ['NYC', 'LA', 'Chicago'],
})

# Select the 'age' of the second row by index label and column name.
age = df.loc[1, 'age']
print(age)

# Select the 'age' of the second row, using the .iloc method to pick the row
# by position and then the column by name.
age = df.iloc[1]['age']
print(age)
O/P: 32
32
PRACTICAL 4. Illustrate the following data cleaning tasks on a dataset stored in a pandas
dataframe.
1. Handling empty cells
import pandas as pd

df = pd.read_csv('C:/Users/bakin/OneDrive/Documents/Downloads/fifa.csv')

# Option 1: drop every row that contains an empty cell. dropna() returns a new
# data frame here, so df itself is left unchanged.
new_df = df.dropna()

# Option 2: replace the empty cells of the "Age" column with 130.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the selected column is a chained operation that is not guaranteed to update df.
df["Age"] = df["Age"].fillna(130)
print(df)
O/P:
Unnamed: 0 ID Name Age \
0 0 158023 L. Messi 31
1 1 20801 Cristiano Ronaldo 33
2 2 190871 Neymar Jr 26
3 3 193080 De Gea 27
4 4 192985 K. De Bruyne 27
... ... ... ... ...
18202 18202 238813 J. Lundstram 19
18203 18203 243165 N. Christoffersson 19
18204 18204 241638 B. Worman 16
18205 18205 246268 D. Walker-Rice 17
18206 18206 246269 G. Nugent 16
2. Binary Encoding:
import pandas as pd

# create a sample dataframe with a categorical variable
df = pd.DataFrame({'animal': ['cat', 'dog', 'bird', 'cat', 'elephant']})
print(f"Before Encoding the Data:\n\n{df}\n")

# perform binary encoding on the 'animal' column:
# first map every category to an integer code ...
animal_map = {'cat': 0, 'dog': 1, 'bird': 2, 'elephant': 3}
df['animal'] = df['animal'].map(animal_map)
# ... then rewrite each code as its base-2 digit string
df['animal'] = df['animal'].apply(lambda x: format(x, 'b'))

# print the resulting dataframe
print(f"After Encoding the Data:\n\n{df}\n")
O/P:
Before Encoding the Data:
animal
0 cat
1 dog
2 bird
3 cat
4 elephant
3. Label Encoding:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Create a sample dataframe with categorical data
df = pd.DataFrame({'color': ['red', 'green', 'blue', 'red', 'green']})
print(f"Before Encoding the Data:\n\n{df}\n")

# Create a LabelEncoder object
le = LabelEncoder()

# Fit and transform the categorical data, storing the codes in a new column
df['color_label'] = le.fit_transform(df['color'])
print(df)
O/P:
Before Encoding the Data:
color
0 red
1 green
2 blue
3 red
4 green
color color_label
0 red 2
1 green 1
2 blue 0
3 red 2
4 green 1
4. Ordinal Encoding:
import pandas as pd

df = pd.DataFrame({'quality': ['low', 'medium', 'high', 'medium']})
print(f"Before Encoding the Data:\n\n{df}\n")

# Categories listed in their intended order; each category's position in the
# list becomes its integer code.
quality_map = {
    level: rank for rank, level in enumerate(['low', 'medium', 'high'])
}

# Encode the 'quality' column into a new 'quality_map' column.
encoded = df['quality'].map(quality_map)
df['quality_map'] = encoded
print(df)
O/P:
Before Encoding the Data:
quality
0 low
1 medium
2 high
3 medium
quality quality_map
0 low 0
1 medium 1
2 high 2
3 medium 1
Practical 6. Perform data Normalization on an appropriate dataset.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler ,MinMaxScaler

# Load the dataset, using the first CSV column as the row index.
df=pd.read_csv('C:/Users/bakin/OneDrive/Documents/Downloads/fifa.csv',index_col=0)
print(f"there are {df.shape[0]} row and {df.shape[1]} columns in the dataframe.")

# Histogram of the raw 'Acceleration' values.
df['Acceleration'].hist(bins=20)
plt.title('Acceleration before Transformation')
plt.show()

# Min-max scale the column. The double brackets pass a one-column data frame
# (2-D input) to the scaler rather than a 1-D Series.
minmax_scaler=MinMaxScaler()
df["Acceleration_minmax"]=minmax_scaler.fit_transform(df[['Acceleration']])

# Histogram of the scaled values.
df['Acceleration_minmax'].hist(bins=20)
plt.title('Acceleration after Normalisation')
plt.show()
O/P:
Practical 7. Perform data Standardization on an appropriate dataset.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler ,MinMaxScaler

# Load the dataset, using the first CSV column as the row index.
df=pd.read_csv('C:/Users/bakin/OneDrive/Documents/Downloads/fifa.csv',index_col=0)
print(f"there are {df.shape[0]} row and {df.shape[1]} columns in the dataframe.")

# Histogram of the raw 'Acceleration' values.
df['Acceleration'].hist(bins=20)
plt.title('Acceleration before Transformation')
plt.show()

# Standardize the column. The double brackets pass a one-column data frame
# (2-D input) to the scaler rather than a 1-D Series.
std_scaler=StandardScaler()
df['Acceleration_z_std']=std_scaler.fit_transform(df[['Acceleration']])

# Histogram of the standardized values.
df['Acceleration_z_std'].hist(bins=20)
plt.title('Acceleration after standardization')
plt.show()
O/P:
Practical 8. Illustrate the task of predicting using simple linear regression on an
appropriate dataset. Evaluate the performance using train-test split and appropriate
metrics.
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets,linear_model
from sklearn.metrics import mean_squared_error,r2_score

# Load the diabetes data and keep only feature column 2, as a single-column
# 2-D array.
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)
diabetes_X = diabetes_X[:, np.newaxis, 2]

# Split the data and the targets into training / testing sets: the last 20
# samples are held out for testing, everything before them is used to train.
diabetes_X_train, diabetes_X_test = diabetes_X[:-20], diabetes_X[-20:]
diabetes_y_train, diabetes_y_test = diabetes_y[:-20], diabetes_y[-20:]

# Create the linear regression object and train it on the training sets.
regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set.
diabetes_y_pred = regr.predict(diabetes_X_test)

# Report the coefficients, the mean squared error and the coefficient of
# determination (1 is perfect prediction).
print('Coeffients:\n', regr.coef_)
mse = mean_squared_error(diabetes_y_test, diabetes_y_pred)
print("Mean squared error:%2f"% mse)
r2 = r2_score(diabetes_y_test, diabetes_y_pred)
print("coefficient of determination:%2f" % r2)

# Plot output: test points in black, fitted line in blue, no axis ticks.
plt.scatter(diabetes_X_test, diabetes_y_test, color='black')
plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()
O/P:
Mean squared error:2548.072399
coefficient of determination:0.472575
Practical 9. Illustrate the task of prediction using multiple linear regression on an
appropriate dataset. Evaluate the performance using k-fold cross validation and
appropriate metrics.
import pandas as pd
from sklearn import linear_model
import statsmodels.api as sm

# Monthly observations for 2017 and 2016, listed from December back to January.
Stock_Market = {
    'Year': [
        2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
        2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016,
    ],
    'Month': [
        12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
        12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
    ],
    'Interest_Rate': [
        2.75, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.25, 2.25, 2.25, 2, 2,
        2, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75,
    ],
    'Unemployment_Rate': [
        5.3, 5.3, 5.3, 5.3, 5.4, 5.6, 5.5, 5.5, 5.5, 5.6, 5.7, 5.9,
        6, 5.9, 5.8, 6.1, 6.2, 6.1, 6.1, 6.1, 5.9, 6.2, 6.2, 6.1,
    ],
    'Stock_Index_Price': [
        1464, 1394, 1357, 1293, 1256, 1254, 1234, 1195, 1159, 1167, 1130, 1075,
        1047, 965, 943, 958, 971, 949, 884, 866, 876, 822, 704, 719,
    ],
}

df = pd.DataFrame(
    Stock_Market,
    columns=['Year', 'Month', 'Interest_Rate', 'Unemployment_Rate',
             'Stock_Index_Price'],
)

# Two predictors (X) and the value to predict (Y).
X = df[['Interest_Rate', 'Unemployment_Rate']]
Y = df['Stock_Index_Price']

# fit the model
regr = linear_model.LinearRegression()
regr.fit(X, Y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# make prediction
New_Interest_Rate = 2.75
New_Unemployment_Rate = 5.3
# The model was fitted on a data frame, so the new sample is passed as a data
# frame with the same column names as X.
new_sample = pd.DataFrame(
    [[New_Interest_Rate, New_Unemployment_Rate]],
    columns=X.columns,
)
print('Predicted Stock Index Price: \n', regr.predict(new_sample))
O/P:
Intercept:
1798.4039776258544
Coefficients:
[ 345.54008701 -250.14657137]
Predicted Stock Index Price:
[1422.86238865]
O/P:
Practical 12. Generate and test decision tree for the sklearn Iris plants dataset.
from sklearn.datasets import load_iris
from sklearn import tree
import matplotlib.pyplot as plt

# Load the Iris dataset as pandas objects and show what it contains.
iris = load_iris(as_frame=True)
print(iris.DESCR)
print(iris.frame)
print("Feature names= ", iris.feature_names)
print("Class names= ", iris.target_names)

# Build a decision tree that uses entropy as its split criterion and fit it
# on the full dataset.
clf = tree.DecisionTreeClassifier(criterion = "entropy")
clf.fit(iris.data, iris.target)

# Classify one hand-written sample of four feature values and print the name
# of the predicted class.
sample = [[5, 3,1,.2]]
predicted = clf.predict(sample)
print(iris.target_names[predicted])

# Draw the fitted tree with feature and class names, then display it.
tree.plot_tree(
    clf,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
)
plt.show()
O/P:
Iris plants dataset
:Summary Statistics:
============== ==== ==== ======= ===== ====================
Min Max Mean SD Class Correlation
============== ==== ==== ======= ===== ====================
sepal length: 4.3 7.9 5.84 0.83 0.7826
sepal width: 2.0 4.4 3.05 0.43 -0.4194
petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)
petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)
============== ==== ==== ======= ===== ====================
:Missing Attribute Values: None
:Class Distribution: 33.3% for each of 3 classes.
:Creator: R.A. Fisher
:Donor: Michael Marshall (MARSHALL%[email protected])
:Date: July, 1988
=============== ======== ======= ======= ====== ============
|details-end|
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
.. ... ... ... ... ...
145 6.7 3.0 5.2 2.3 2
146 6.3 2.5 5.0 1.9 2
147 6.5 3.0 5.2 2.0 2
148 6.2 3.4 5.4 2.3 2
149 5.9 3.0 5.1 1.8 2
[150 rows x 5 columns]