Introduction To Regression With Statsmodels in Python
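The per-variable means below were presumably produced with pandas:

print(swedish_motor_insurance.mean())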
n_claims 22.904762
total_payment_sek 98.187302
dtype: float64
print(swedish_motor_insurance['n_claims'].corr(swedish_motor_insurance['total_payment_sek']))
0.9128782350234068
Logistic regression
The response variable is logical.
import seaborn as sns
import matplotlib.pyplot as plt

sns.scatterplot(x="n_claims",
                y="total_payment_sek",
                data=swedish_motor_insurance)
plt.show()
Chapter 2
Making predictions from linear regression models and understanding model coefficients.
Chapter 3
Assessing the quality of the linear regression model.
Chapter 4
Same again, but with logistic regression models
scikit-learn
Optimized for prediction (focus in other DataCamp courses)
Slope
The amount the y value increases if you increase x by one.
Equation
y = intercept + slope ∗ x
from statsmodels.formula.api import ols

mdl_payment_vs_claims = ols("total_payment_sek ~ n_claims",
                            data=swedish_motor_insurance).fit()
print(mdl_payment_vs_claims.params)
Intercept 19.994486
n_claims 3.413824
dtype: float64
Equation
total_payment_sek = 19.99 + 3.41 ∗ n_claims
sns.displot(data=fish,
x="mass_g",
col="species",
col_wrap=2,
bins=9)
plt.show()
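The per-species means below were presumably computed with a pandas groupby:

print(fish.groupby("species")["mass_g"].mean())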
species
Bream 617.828571
Perch 382.239286
Pike 718.705882
Roach 152.050000
Name: mass_g, dtype: float64
Intercept 617.828571
species[T.Perch] -235.589286
species[T.Pike] 100.877311
species[T.Roach] -465.778571
The coefficients are relative to the intercept: 617.83 − 235.59 = 382.24. In the case of a single categorical explanatory variable, the intercept and coefficients combine to give the category means.
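In statsmodels formulas, appending "+ 0" drops the intercept so each coefficient is the corresponding category mean directly; a sketch of both parameterizations:

mdl_mass_vs_species = ols("mass_g ~ species", data=fish).fit()
print(mdl_mass_vs_species.params)

# dropping the intercept gives one coefficient per category: the means
mdl_mass_vs_species = ols("mass_g ~ species + 0", data=fish).fit()
print(mdl_mass_vs_species.params)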
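A sketch of the fit behind this section's results, assuming the bream DataFrame with length_cm and mass_g columns:

import numpy as np
import pandas as pd
from statsmodels.formula.api import ols

# fit bream mass as a function of length
mdl_mass_vs_length = ols("mass_g ~ length_cm", data=bream).fit()
print(mdl_mass_vs_length.params)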
Intercept -1035.347565
length_cm 54.549981
dtype: float64
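To make predictions, build a DataFrame of explanatory values; 20 to 40 cm gives the 21 rows shown below:

explanatory_data = pd.DataFrame({"length_cm": np.arange(20, 41)})
print(explanatory_data)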
length_cm
0 20
1 21
2 22
3 23
4 24
5 25
...
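The predictions below then come from calling predict on the fitted model:

print(mdl_mass_vs_length.predict(explanatory_data))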
0 55.652054
1 110.202035
2 164.752015
3 219.301996
4 273.851977
...
16 928.451749
17 983.001730
18 1037.551710
19 1092.101691
20 1146.651672
Length: 21, dtype: float64
little_bream = pd.DataFrame({"length_cm": [10]})

pred_little_bream = little_bream.assign(
    mass_g=mdl_mass_vs_length.predict(little_bream))
print(pred_little_bream)
length_cm mass_g
0 10 -489.847756
A mass of −490 g is impossible: 10 cm lies outside the observed lengths, and extrapolating far from the data can produce nonsense predictions.
print(mdl_mass_vs_length.params)
Intercept   -1035.347565
length_cm      54.549981
dtype: float64

Fitted values: predictions on the original dataset.

print(mdl_mass_vs_length.fittedvalues)

or equivalently

print(mdl_mass_vs_length.predict(explanatory_data))

1      273.851977
2      268.396979
3      399.316934
4      410.226930
...
30     873.901768
31     873.901768
34    1037.551710
Length: 35, dtype: float64
print(bream["mass_g"] - mdl_mass_vs_length.fittedvalues)
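The same residuals are available directly on the fitted results, so the line above is equivalent to:

print(mdl_mass_vs_length.resid)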
Regression to the mean means that extreme cases tend not to persist over time.
sns.scatterplot(x="father_height_cm",
y="son_height_cm",
data=father_son)
plt.axline(xy1=(150, 150),
slope=1,
linewidth=2,
color="green")
plt.axis("equal")
plt.show()
sns.regplot(x="father_height_cm",
y="son_height_cm",
data=father_son,
ci = None,
line_kws={"color": "black"})
plt.axis("equal")
plt.show()
Intercept 86.071975
father_height_cm 0.514093
dtype: float64
mdl_son_vs_father.predict(really_tall_father)   # 183.7
mdl_son_vs_father.predict(really_short_father)  # 163.2
In both cases the predicted son's height is less extreme than the father's: regression to the mean.
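The next plot regresses mass on the cube of length; a sketch of the transformed column and the fit, assuming the perch DataFrame:

perch["length_cm_cubed"] = perch["length_cm"] ** 3
mdl_perch = ols("mass_g ~ length_cm_cubed", data=perch).fit()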
sns.regplot(x="length_cm_cubed",
y="mass_g",
data=perch,
ci=None)
plt.show()
Intercept -0.117478
length_cm_cubed 0.016796
dtype: float64
prediction_data = explanatory_data.assign(
mass_g=mdl_perch.predict(explanatory_data))
print(prediction_data)
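For the prediction frame, include both the transformed column and the original lengths; the exact grid here is an assumption:

explanatory_data = pd.DataFrame(
    {"length_cm_cubed": np.arange(10, 41, 5) ** 3,
     "length_cm": np.arange(10, 41, 5)})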
# square-root transform both variables (spent_usd column name assumed)
ad_conversion["sqrt_spent_usd"] = np.sqrt(ad_conversion["spent_usd"])
ad_conversion["sqrt_n_impressions"] = np.sqrt(
    ad_conversion["n_impressions"])
sns.regplot(x="sqrt_spent_usd",
y="sqrt_n_impressions",
data=ad_conversion,
ci=None)
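mdl_ad below is presumably fitted on the two transformed columns:

mdl_ad = ols("sqrt_n_impressions ~ sqrt_spent_usd",
             data=ad_conversion).fit()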
prediction_data = explanatory_data.assign(
    sqrt_n_impressions=mdl_ad.predict(explanatory_data),
    n_impressions=mdl_ad.predict(explanatory_data) ** 2)
print(prediction_data)
Squaring undoes the square root, back-transforming the predictions onto the original impressions scale.
The coefficient of determination (r-squared) is the proportion of the variance in the response variable that is predictable from the explanatory variable. 1 means a perfect fit; 0 means the model explains none of the variance.
print(mdl_bream.summary())
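The two matching values below are presumably the rsquared attribute of the fitted results and the squared correlation computed by hand:

print(mdl_bream.rsquared)

coeff_determination = bream["length_cm"].corr(bream["mass_g"]) ** 2
print(coeff_determination)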
0.8780627095147174
0.8780627095147173
MSE = RSE²
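The mse value below is presumably the residual mean squared error of the fitted model:

mse = mdl_bream.mse_resid
print("mse:", mse)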
mse: 5498.555084973521
rse = np.sqrt(mse)
print("rse: ", rse)
rse: 74.15224261594197
residuals_sq = mdl_bream.resid ** 2
resid_sum_of_sq = sum(residuals_sq)
deg_freedom = len(bream.index) - 2
rse = np.sqrt(resid_sum_of_sq / deg_freedom)
The difference between predicted bream masses and observed bream masses is typically about 74 g.
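The extreme_l and extreme_m columns used as hue and style below are presumably boolean flags marking unusual lengths and masses; a hypothetical construction (thresholds are assumptions):

# hypothetical flags for unusually short/long and implausibly light fish
roach["extreme_l"] = (roach["length_cm"] < 15) | (roach["length_cm"] > 26)
roach["extreme_m"] = roach["mass_g"] < 1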
fig = plt.figure()
sns.regplot(x="length_cm",
y="mass_g",
data=roach,
ci=None)
sns.scatterplot(x="length_cm",
y="mass_g",
hue="extreme_l",
data=roach)
fig = plt.figure()
sns.regplot(x="length_cm",
y="mass_g",
data=roach,
ci=None)
sns.scatterplot(x="length_cm",
y="mass_g",
hue="extreme_l",
style="extreme_m",
data=roach)
Influence measures how much the model would change if you left the observation out of the dataset when modeling.
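A minimal sketch of how these influence measures are typically extracted from statsmodels, assuming mdl_roach is the fitted roach model:

mdl_roach = ols("mass_g ~ length_cm", data=roach).fit()
summary_roach = mdl_roach.get_influence().summary_frame()
roach["leverage"] = summary_roach["hat_diag"]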
print(roach.head())
roach["cooks_dist"] = summary_roach["cooks_d"]
print(roach.head())
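roach_not_short, used in the comparison below, is presumably the roach data with the highly influential short fish removed; a hypothetical construction (the 12.9 cm cutoff is an assumption):

roach_not_short = roach[roach["length_cm"] != 12.9]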
sns.regplot(x="length_cm",
y="mass_g",
data=roach,
ci=None,
line_kws={"color": "green"})
sns.regplot(x="length_cm",
y="mass_g",
data=roach_not_short,
ci=None,
line_kws={"color": "red"})
1 https://www.rdocumentation.org/packages/bayesQR/topics/Churn
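As a first attempt, an ordinary linear regression is fitted to the churn data; a sketch, assuming a churn DataFrame with these columns:

mdl_churn_vs_recency_lm = ols("has_churned ~ time_since_last_purchase",
                              data=churn).fit()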
print(mdl_churn_vs_recency_lm.params)
Intercept 0.490780
time_since_last_purchase 0.063783
dtype: float64
intercept, slope = mdl_churn_vs_recency_lm.params

plt.axline(xy1=(0, intercept),
           slope=slope)
plt.show()
plt.axline(xy1=(0,intercept),
slope=slope)
plt.xlim(-10, 10)
plt.ylim(-0.2, 1.2)
plt.show()
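The parameters below come from a logistic regression instead; a minimal sketch of the fit on the same data:

from statsmodels.formula.api import logit

mdl_recency = logit("has_churned ~ time_since_last_purchase",
                    data=churn).fit()
print(mdl_recency.params)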
Intercept -0.035019
time_since_last_purchase 0.269215
dtype: float64
plt.axline(xy1=(0,intercept),
slope=slope,
color="black")
plt.show()
explanatory_data = pd.DataFrame(
{"time_since_last_purchase": np.arange(-1, 6.25, 0.25)})
prediction_data = explanatory_data.assign(
has_churned = mdl_recency.predict(explanatory_data))
sns.scatterplot(x="time_since_last_purchase",
y="has_churned",
data=prediction_data,
color="red")
plt.show()
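The most_likely_outcome column plotted below is presumably the predicted probability rounded to the nearest integer (zero or one):

prediction_data["most_likely_outcome"] = np.round(
    prediction_data["has_churned"])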
sns.scatterplot(x="time_since_last_purchase",
y="most_likely_outcome",
data=prediction_data,
color="red")
plt.show()
odds_ratio = probability / (1 − probability)

For example, a probability of 0.25 gives

odds_ratio = 0.25 / (1 − 0.25) = 1/3
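As a sketch, the odds-ratio curve plotted below can be derived from the predicted probabilities:

prediction_data["odds_ratio"] = prediction_data["has_churned"] / (
    1 - prediction_data["has_churned"])

sns.lineplot(x="time_since_last_purchase",
             y="odds_ratio",
             data=prediction_data)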
plt.axhline(y=1,
linestyle="dotted")
plt.show()
plt.axhline(y=1,
linestyle="dotted")
plt.yscale("log")
plt.show()
predicted_response = np.round(mdl_recency.predict())
outcomes = pd.DataFrame({"actual_response": churn["has_churned"],
                         "predicted_response": predicted_response})
print(outcomes.value_counts(sort=False))
actual_response predicted_response
0 0.0 141
1.0 59
1 0.0 111
1.0 89
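conf_matrix below is presumably the model's confusion matrix, which statsmodels can produce directly:

conf_matrix = mdl_recency.pred_table()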
print(conf_matrix)
[[141. 59.]
[111. 89.]]
from statsmodels.graphics.mosaicplot import mosaic

mosaic(conf_matrix)
accuracy = (TN + TP) / (TN + FN + FP + TP)

Here that is (141 + 89) / 400 = 0.575.
[[141., 59.],
 [111., 89.]]

TN = conf_matrix[0, 0]
TP = conf_matrix[1, 1]
FN = conf_matrix[1, 0]
FP = conf_matrix[0, 1]
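Putting those together confirms the accuracy quoted above:

acc = (TN + TP) / (TN + FN + FP + TP)
print(acc)  # 0.575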