In [2]: # DATA VISUALIZATION LAB: LINEAR REGRESSION USING NUMPY
# TANAYA YADAV - 15BCE0461
# LAB SLOT L53+L54
In [2]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [3]: mlb11 = pd.read_csv('/Users/tanaya/Semester 7/Data Visualization/mlb11.csv')  # NOTE(review): absolute, machine-specific path; the notebook only runs where this exact file location exists
In [4]: # QUESTION 1
# What type of plot would you use to display the relationship between runs and one of the other
# numerical variables?
# Plot this relationship using the variable at bats as the predictor. Does the relationship look
# linear?
# If you knew a team’s at bats, would you be comfortable using a linear model to predict the number
# of runs?
In [5]: # A scatter plot presents two numerical variables simultaneously, so the relationship
# between the variables can be examined with ease.
# We examine whether there is a linear relationship between runs scored in a season and a number
# of other player statistics.
# If the relationship looks linear, we can quantify the strength of the relationship with the
# correlation coefficient.
In [6]: dataframe1 = mlb11.loc[:, ['runs', 'at_bats']]
# Display the two-column selection (runs and at_bats for every team).
dataframe1
Out[6]:
runs at_bats
0 855 5659
1 875 5710
2 787 5563
3 730 5672
4 762 5532
5 718 5600
6 867 5518
7 721 5447
8 735 5544
9 615 5598
10 708 5585
11 644 5436
12 654 5549
13 735 5612
14 667 5513
15 713 5579
16 654 5502
17 704 5509
18 731 5421
19 743 5559
20 619 5487
21 625 5508
22 610 5421
23 645 5452
24 707 5436
25 641 5528
26 624 5441
27 570 5486
28 593 5417
29 556 5421
In [7]: dataframe1.columns  # display the column labels of the two-column selection
Out[7]: Index(['runs', 'at_bats'], dtype='object')
In [8]: # Scatter Plot (X= Runs, Y= At Bats)
# The relationship looks moderately linear but not strong enough to be able to comfortably use a lin
ear model to predict the number of runs.
plot1=dataframe1.plot.scatter(x='runs', y='at_bats', c='pink')
In [9]: # Since the relationship is linear we can quanitfy the strength of the relationship with the correla
tion coefficient.
dataframe1.corr(method='pearson', min_periods=1)
Out[9]:
runs at_bats
runs 1.000000 0.610627
at_bats 0.610627 1.000000
In [46]: at_bats = np.array([5659,5710,5563,5672,5532,5600,5518,5447,5544,5598,5585,5436,5549,5612,5513,5579,
                    5502,5509,
                    5421,5559,5487,5508,5421,5452,5436,5528,5441,5486,5417,5421])
# `runs` is defined in this cell too, so the fit below does not rely on a cell further down the
# notebook having been executed first. The values are the same as the `runs` array in the
# Question 2 cell and the `runs` column displayed above.
runs = np.array([855,875,787,730,762,718,867,721,735,615,708,644,654,735,667,713,654,704,731,743,619,
                 625,610,645,
                 707,641,624,570,593,556])
# Linear Model: least-squares degree-1 fit with runs as x and at_bats as y.
# np.polyfit returns the coefficients highest power first, i.e. [slope, intercept].
# NOTE(review): Question 1 asks for at bats as the predictor of runs, whereas this fit models
# at_bats from runs. The argument order is kept so the recorded output stays valid; confirm intent.
linear_model1 = np.polyfit(runs, at_bats, 1)
linear_model1
Out[46]: array([5.91333589e-01, 5.11335102e+03])
In [48]: dataframe_atbats = mlb11.loc[:, ['runs', 'at_bats']]
# Scatter of at_bats (y) against runs (x), drawn in grey.
plot_atbats = dataframe_atbats.plot(kind='scatter', x='runs', y='at_bats', c='grey')
In [52]: p = np.poly1d(linear_model1)  # straight line fitted to (runs, at_bats)
# Degree-10 polynomial through the same points, for visual comparison with the line
# (despite the name p30, the degree used is 10).
p30 = np.poly1d(np.polyfit(runs, at_bats, 10))
# Evaluate both curves only across the observed runs; outside that range the
# degree-10 curve is pure extrapolation.
xp = np.linspace(runs.min(), runs.max(), 100)
# Plot the at_bats data this section is about, together with both fitted curves.
plot3 = plt.plot(runs, at_bats, '.', xp, p(xp), '-', xp, p30(xp), '--')
plt.xlabel('runs')
plt.ylabel('at_bats')
# Limits bracket the observed at_bats values (5417 to 5710).
plt.ylim(5400, 5750)
plt.show()
In [ ]: # If the team’s at bats were known, using a linear model to predict the number of runs would be
# suitable.
# The correlation coefficient of X=RUNS; Y=AT_BATS is 0.610627.
In [10]: # QUESTION 2
# Choose another traditional variable from mlb11.csv that you think might be a good predictor of
# runs.
# Produce a scatterplot of the two variables and fit a linear model.
# At a glance, does there seem to be a linear relationship?
In [11]: runs = np.array([
    855, 875, 787, 730, 762, 718, 867, 721, 735, 615,
    708, 644, 654, 735, 667, 713, 654, 704, 731, 743,
    619, 625, 610, 645, 707, 641, 624, 570, 593, 556,
])
wins = np.array([
    96, 90, 95, 71, 90, 77, 97, 96, 73, 56,
    69, 82, 71, 79, 86, 102, 79, 80, 94, 81,
    63, 72, 72, 74, 91, 89, 80, 86, 71, 67,
])
# Linear Model: degree-1 least-squares fit with runs as x and wins as y.
# The result holds the coefficients highest power first: [slope, intercept].
linear_model = np.polyfit(runs, wins, deg=1)
linear_model
Out[11]: array([ 0.08315339, 23.29147734])
In [12]: # Taking 'Wins' as the traditional variable
dataframe2=mlb11[['runs', 'wins']]
plot2=dataframe2.plot.scatter(x='runs', y='wins', c='orange')
In [13]: # Correlation Coefficient between Runs and Wins
dataframe2.corr(method='pearson', min_periods=1)
Out[13]:
runs wins
runs 1.000000 0.600809
wins 0.600809 1.000000
In [14]: p = np.poly1d(linear_model)  # straight line fitted to (runs, wins)
# Degree-10 polynomial through the same points, for visual comparison with the line
# (despite the name p30, the degree used is 10).
p30 = np.poly1d(np.polyfit(runs, wins, 10))
# Evaluate both curves only across the observed runs; outside that range the
# degree-10 curve is pure extrapolation.
xp = np.linspace(runs.min(), runs.max(), 100)
plot3 = plt.plot(runs, wins, '.', xp, p(xp), '-', xp, p30(xp), '--')
plt.xlabel('runs')
plt.ylabel('wins')
# Lower limit is 50 rather than 60 so the team with 56 wins stays visible.
plt.ylim(50, 110)
plt.show()
In [15]: # Yes, the relationship between X= Runs and Y= Wins seems to be LINEAR.
In [16]: # QUESTION 3
# Now that you can summarize the linear relationship between two variables, investigate the
# relationships between runs and each of the other five traditional variables.
# Which variable best predicts runs?
# Support your conclusion using the graphical and numerical methods we’ve discussed.
In [17]: # Variable 1 - HITS
hits= np.array([1599,1600,1540,1560,1513,1477,1452,1422,1429,1442,1434,1395,1423,1438,1394,1409,1387
,1380,1357,
1384,1357,1358,1325,1330,1324,1345,1319,1327,1284,1263])
# Linear Model
linear_model_2= np.polyfit(runs, hits,1)
linear_model_2
Out[17]: array([ 0.84592348, 822.16747161])
In [18]: dataframe3 = mlb11.loc[:, ['runs', 'hits']]
# Scatter of hits (y) against runs (x), drawn in blue.
plot3 = dataframe3.plot(kind='scatter', x='runs', y='hits', c='blue')
In [19]: # Numerical Prediction
# Correlation Coefficient between Runs and Hits
dataframe3.corr(method='pearson', min_periods=1)
Out[19]:
runs hits
runs 1.000000 0.801211
hits 0.801211 1.000000
In [20]: p = np.poly1d(linear_model_2)  # straight line fitted to (runs, hits); previously the wins model was used
# Degree-10 polynomial through the same points, for visual comparison with the line
# (despite the name p30, the degree used is 10).
p30 = np.poly1d(np.polyfit(runs, hits, 10))
# Evaluate both curves only across the observed runs, on a grid fine enough
# for the degree-10 curve to be drawn smoothly.
xp = np.linspace(runs.min(), runs.max(), 100)
plot4 = plt.plot(runs, hits, '.', xp, p(xp), '-', xp, p30(xp), '--')
plt.xlabel('runs')
plt.ylabel('hits')
# Limits bracket the observed hits (1263 to 1600); an upper limit of 1500 would
# cut off the five teams with more than 1500 hits.
plt.ylim(1250, 1650)
plt.show()
In [ ]: # Yes, the relationship between X= Runs and Y= HITS seems to be LINEAR.
In [21]: # Variable 2 - BAT_AVG
batting_average = np.array([0.283,0.280,0.277,0.275,0.273,0.264,0.263,0.261,0.258,0.258,0.257,0.257,
0.256,0.256,
0.253,0.253,0.252,0.25,0.25,0.249,0.247,0.247,0.244,0.244,0.244,0.243,0.
242,0.242,
0.237,0.233])
runs= np.array([855,875,787,730,762,718,867,721,735,615,708,644,654,735,667,713,654,704,731,743,619,
625,610,645,
707,641,624,570,593,556])
# Linear Model
linear_model_3= np.polyfit(runs, batting_average,1)
linear_model_3
Out[21]: array([1.25152321e-04, 1.68127684e-01])
In [22]: # Variable 3 - STRIKEOUTS
strikeouts = np.array([930,1108,1143,1006,978,1085,1138,1083,1201,1164,1120,1087,1202,1250,1086,1024
,989,1269,
1249,1184,1048,1244,1308,1094,1193,1260,1323,1122,1320,1280])
# Linear Model
linear_model_4= np.polyfit(runs, batting_average,1)
linear_model_4
Out[22]: array([1.25152321e-04, 1.68127684e-01])
In [24]: dataframe4 = mlb11.loc[:, ['runs', 'strikeouts']]
# Scatter of strikeouts (y) against runs (x), drawn in orange.
plot4 = dataframe4.plot(kind='scatter', x='runs', y='strikeouts', c='orange')
In [25]: # Numerical Prediction
# Correlation Coefficient between Runs and Strikeouts
dataframe4.corr(method='pearson', min_periods=1)
Out[25]:
runs strikeouts
runs 1.000000 -0.411531
strikeouts -0.411531 1.000000
In [26]: p = np.poly1d(np.polyfit(runs, strikeouts, 1))  # straight line fitted to (runs, strikeouts)
# The line is fitted directly from the two arrays plotted below, so this cell
# cannot pick up a model that was fitted to a different variable.
# Degree-10 polynomial through the same points, for visual comparison with the line
# (despite the name p30, the degree used is 10).
p30 = np.poly1d(np.polyfit(runs, strikeouts, 10))
# Evaluate both curves only across the observed runs, on a grid fine enough
# for the degree-10 curve to be drawn smoothly.
xp = np.linspace(runs.min(), runs.max(), 100)
# Plot strikeouts (not hits) so the points match the curves fitted above.
plot4 = plt.plot(runs, strikeouts, '.', xp, p(xp), '-', xp, p30(xp), '--')
plt.xlabel('runs')
plt.ylabel('strikeouts')
# Limits bracket the observed strikeouts (930 to 1323).
plt.ylim(900, 1350)
plt.show()
In [ ]: # No, the relationship between X= Runs and Y= Strikeouts does not seem to be LINEAR.
In [27]: # Variable 4 - STOLEN BASES
stolen_bases = np.array([143,102,49,153,57,130,147,94,118,118,81,126,69,97,135,96,81,89,133,131,92,9
5,108,117,
155,77,106,85,170,125])
# Linear Model
linear_model_5= np.polyfit(runs, stolen_bases,1)
linear_model_5
Out[27]: array([1.95487456e-02, 9.57409900e+01])
In [28]: dataframe5 = mlb11.loc[:, ['runs', 'stolen_bases']]
# Scatter of stolen_bases (y) against runs (x), drawn in magenta.
plot5 = dataframe5.plot(kind='scatter', x='runs', y='stolen_bases', c='magenta')
In [29]: # Numerical Prediction
# Correlation Coefficient between Runs and Stolen Bases
dataframe5.corr(method='pearson', min_periods=1)
Out[29]:
runs stolen_bases
runs 1.000000 0.053981
stolen_bases 0.053981 1.000000
In [30]: p = np.poly1d(linear_model_5)  # straight line fitted to (runs, stolen_bases); previously the wins model was used
# Degree-10 polynomial through the same points, for visual comparison with the line
# (despite the name p30, the degree used is 10).
p30 = np.poly1d(np.polyfit(runs, stolen_bases, 10))
# Evaluate both curves only across the observed runs, on a grid fine enough
# for the degree-10 curve to be drawn smoothly.
xp = np.linspace(runs.min(), runs.max(), 100)
# Plot stolen_bases (not hits) so the points match the curves fitted above.
plot5 = plt.plot(runs, stolen_bases, '.', xp, p(xp), '-', xp, p30(xp), '--')
plt.xlabel('runs')
plt.ylabel('stolen_bases')
# Limits bracket the observed stolen bases (49 to 170).
plt.ylim(40, 180)
plt.show()
In [ ]: # The relationship between X= Runs and Y= Stolen Bases is only VERY WEAKLY LINEAR, if at all (correlation coefficient 0.053981).
In [32]: # Variable 5 - NEW ON BASE
new_onbase = np.array([0.34,0.349,0.34,0.329,0.341,0.335,0.343,0.325,0.329,0.311,0.316,0.322,0.314,
0.326,
0.313,0.323,0.319,0.317,0.322,0.317,0.306,0.318,0.309,0.311,0.322,0.308,0.309
,
0.303,0.305,0.292])
# Linear Model
linear_model_6= np.polyfit(runs, new_onbase,1)
linear_model_6
Out[32]: array([1.50169403e-04, 2.16309169e-01])
In [33]: dataframe6 = mlb11.loc[:, ['runs', 'new_onbase']]
# Scatter of new_onbase (y) against runs (x), drawn in purple.
plot6 = dataframe6.plot(kind='scatter', x='runs', y='new_onbase', c='purple')
In [34]: # Numerical Prediction
# Correlation Coefficient between Runs and New on Base
dataframe6.corr(method='pearson', min_periods=1)
Out[34]:
runs new_onbase
runs 1.000000 0.921469
new_onbase 0.921469 1.000000
In [53]: p = np.poly1d(linear_model_6)  # straight line fitted to (runs, new_onbase); previously the wins model was used
# Degree-10 polynomial through the same points, for visual comparison with the line
# (despite the name p30, the degree used is 10).
p30 = np.poly1d(np.polyfit(runs, new_onbase, 10))
# Evaluate both curves only across the observed runs; outside that range the
# degree-10 curve is pure extrapolation.
xp = np.linspace(runs.min(), runs.max(), 100)
# Plot new_onbase (not wins) so the points match the curves fitted above.
plot3 = plt.plot(runs, new_onbase, '.', xp, p(xp), '-', xp, p30(xp), '--')
plt.xlabel('runs')
plt.ylabel('new_onbase')
# Limits bracket the observed new_onbase values (0.292 to 0.349).
plt.ylim(0.28, 0.36)
plt.show()