230103-ECON209 S2025 Lab 2.ipynb-Colab
230103-ECON209 S2025 Lab 2.ipynb-Colab
ipynb - Colab
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
Writing LaTeX in Colab
It's the same as what we do on Overleaf, but here is the guide by Colab for your convenience.
# Print each column's non-null count and dtype; columns with fewer non-null
# entries than the index length contain missing values.
dfAd.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 5000 non-null int64
1 id 5000 non-null int64
2 urban 5000 non-null int64
3 female 5000 non-null int64
4 age 4994 non-null float64
5 time_in_commune_or_ward 5000 non-null float64
6 time_in_province 5000 non-null int64
7 lv_educ 4997 non-null float64
8 no_family_members 5000 non-null int64
9 party_member 5000 non-null int64
10 income 4563 non-null float64
dtypes: float64(4), int64(7)
memory usage: 429.8 KB
https://colab.research.google.com/drive/1wdiU4z6X7O8LtE1Khs6hwkfgz2s2I-lf#scrollTo=EV2ke4YlNOp1 1/10
2/15/25, 10:23 PM [230103] ECON209_S2025__Lab_2.ipynb - Colab
# Keep only complete observations: a row is removed as soon as any one of
# its columns holds a missing value.
dfAd = dfAd.dropna(axis=0, how='any')
Unnamed:
id urban female age time_in_commune_or_ward time_in_province lv_e
0
keyboard_arrow_down Age
# Descriptive statistics for 'age', computed one by one and echoed below.
age_col = dfAd['age']
age_mean = age_col.mean()
age_var = age_col.var()
age_std = age_col.std()
age_median = age_col.median()
q1, q3 = age_col.quantile([.25, .75]).values  # lower and upper quartile
age_min, age_max = age_col.min(), age_col.max()

# One "label: value" line per statistic, in the usual reporting order.
for label, value in [
    ('mean:', age_mean),
    ('var:', age_var),
    ('stdev:', age_std),
    ('median:', age_median),
    ('q1:', q1),
    ('q3:', q3),
    ('min:', age_min),
    ('max:', age_max),
]:
    print(label, value)
mean: 48.914197937239415
var: 134.16405869039997
stdev: 11.582920991287127
median: 50.0
q1: 40.0
q3: 58.0
min: 18.0
max: 95.0
keyboard_arrow_down time-in-province
# Descriptive statistics for 'time_in_province'.
# NOTE(review): the recorded output of this cell shows max = 888 against a
# median of 43, which looks like a placeholder code rather than a genuine
# value -- confirm against the codebook before interpreting mean/var/stdev.
province_col = dfAd['time_in_province']
time_in_province_mean = province_col.mean()
time_in_province_var = province_col.var()
time_in_province_std = province_col.std()
time_in_province_median = province_col.median()
q1, q3 = province_col.quantile([.25, .75]).values  # lower and upper quartile
time_in_province_min, time_in_province_max = province_col.min(), province_col.max()

# One "label: value" line per statistic.
for label, value in [
    ('mean:', time_in_province_mean),
    ('var:', time_in_province_var),
    ('stdev:', time_in_province_std),
    ('median:', time_in_province_median),
    ('q1:', q1),
    ('q3:', q3),
    ('min:', time_in_province_min),
    ('max:', time_in_province_max),
]:
    print(label, value)
mean: 45.69387755102041
var: 2545.0320366952697
stdev: 50.448310543518396
median: 43.0
q1: 33.0
q3: 54.0
min: 2
max: 888
keyboard_arrow_down no-family-members
# Descriptive statistics for 'no_family_members'.
family_col = dfAd['no_family_members']
no_family_members_mean = family_col.mean()
no_family_members_var = family_col.var()
no_family_members_std = family_col.std()
no_family_members_median = family_col.median()
q1, q3 = family_col.quantile([.25, .75]).values  # lower and upper quartile
no_family_members_min, no_family_members_max = family_col.min(), family_col.max()

# One "label: value" line per statistic.
for label, value in [
    ('mean:', no_family_members_mean),
    ('var:', no_family_members_var),
    ('stdev:', no_family_members_std),
    ('median:', no_family_members_median),
    ('q1:', q1),
    ('q3:', q3),
    ('min:', no_family_members_min),
    ('max:', no_family_members_max),
]:
    print(label, value)
mean: 4.393899495281984
var: 3.0232551373942944
stdev: 1.7387510280066822
median: 4.0
q1: 3.0
q3: 5.0
min: 1
max: 20
keyboard_arrow_down income
# Descriptive statistics for 'income'.
income_col = dfAd['income']
income_mean = income_col.mean()
income_var = income_col.var()
income_std = income_col.std()
income_median = income_col.median()
q1, q3 = income_col.quantile([.25, .75]).values  # lower and upper quartile
income_min, income_max = income_col.min(), income_col.max()

# One "label: value" line per statistic.
for label, value in [
    ('mean:', income_mean),
    ('var:', income_var),
    ('stdev:', income_std),
    ('median:', income_median),
    ('q1:', q1),
    ('q3:', q3),
    ('min:', income_min),
    ('max:', income_max),
]:
    print(label, value)
mean: 10212420.452051789
var: 131753814091836.88
stdev: 11478406.426496532
median: 7000000.0
q1: 5000000.0
q3: 13000000.0
min: 1000000.0
max: 200000000.0
keyboard_arrow_down lv-educ
https://colab.research.google.com/drive/1wdiU4z6X7O8LtE1Khs6hwkfgz2s2I-lf#scrollTo=EV2ke4YlNOp1 3/10
2/15/25, 10:23 PM [230103] ECON209_S2025__Lab_2.ipynb - Colab
# Echo the descriptive statistics for 'lv_educ'.
# NOTE(review): the assignments to lv_educ_mean ... lv_educ_max and to q1/q3
# for this column are not visible in this export (they appear to have been
# lost at the page break) -- restore them before running this cell.
print('mean:', lv_educ_mean)
print('var:', lv_educ_var)
print('stdev:', lv_educ_std)
print('median:', lv_educ_median)
print('q1:', q1)
print('q3:', q3)
print('min:', lv_educ_min)
print('max:', lv_educ_max)
mean: 4.47816545973228
var: 5.019990663574289
stdev: 2.2405335667144755
median: 4.0
q1: 3.0
q3: 6.0
min: 0.0
max: 9.0
keyboard_arrow_down urban
# Descriptive statistics for the 'urban' indicator column.
urban_col = dfAd['urban']
urban_mean = urban_col.mean()
urban_var = urban_col.var()
urban_std = urban_col.std()
urban_median = urban_col.median()
q1, q3 = urban_col.quantile([.25, .75]).values  # lower and upper quartile
urban_min, urban_max = urban_col.min(), urban_col.max()

# One "label: value" line per statistic.
for label, value in [
    ('mean:', urban_mean),
    ('var:', urban_var),
    ('stdev:', urban_std),
    ('median:', urban_median),
    ('q1:', q1),
    ('q3:', q3),
    ('min:', urban_min),
    ('max:', urban_max),
]:
    print(label, value)
mean: 0.6001755540926048
var: 0.24001752843651028
stdev: 0.48991583811559947
median: 1.0
q1: 0.0
q3: 1.0
min: 0
max: 1
keyboard_arrow_down party-member
# Descriptive statistics for the 'party_member' indicator column.
party_col = dfAd['party_member']
party_member_mean = party_col.mean()
party_member_var = party_col.var()
party_member_std = party_col.std()
party_member_median = party_col.median()
q1, q3 = party_col.quantile([.25, .75]).values  # lower and upper quartile
party_member_min, party_member_max = party_col.min(), party_col.max()

# One "label: value" line per statistic.
for label, value in [
    ('mean:', party_member_mean),
    ('var:', party_member_var),
    ('stdev:', party_member_std),
    ('median:', party_member_median),
    ('q1:', q1),
    ('q3:', q3),
    ('min:', party_member_min),
    ('max:', party_member_max),
]:
    print(label, value)
count
age
60.0 175
55.0 169
58.0 166
50.0 156
56.0 153
... ...
19.0 3
78.0 2
77.0 1
80.0 1
95.0 1
62 rows × 1 columns
dtype: int64
age time_in_province
0 56.0 20
1 37.0 37
2 34.0 34
3 36.0 36
4 61.0 61
5 40.0 40
7 47.0 47
8 63.0 63
9 55.0 55
10 41.0 41
keyboard_arrow_down Histograms-Age
# Univariate look at 'age': two panels side by side on one figure, so both
# views can be compared at a glance.
figure, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
age_values = dfAd['age']
# Left panel: count histogram using discrete bins.
sns.histplot(data=age_values, discrete=True, ax=axes[0])
# Right panel: same data as probabilities, drawn as a step outline.
sns.histplot(data=age_values, stat='probability', element='step', ax=axes[1])
https://colab.research.google.com/drive/1wdiU4z6X7O8LtE1Khs6hwkfgz2s2I-lf#scrollTo=EV2ke4YlNOp1 5/10
2/15/25, 10:23 PM [230103] ECON209_S2025__Lab_2.ipynb - Colab
<Axes: xlabel='age', ylabel='Probability'>
#In case you are using the free version of Colab with RAM of around 12GB, it might not be able to handle the seaborn/sns plots.
#Matplotlib/plt is an alternative in these cases, although the plots might not look as good.
keyboard_arrow_down Histograms-Income
# Histogram of 'income' with 20 bins, drawn with plain matplotlib.
income_values = dfAd['income']
plt.hist(income_values, bins=20)
plt.show()
https://colab.research.google.com/drive/1wdiU4z6X7O8LtE1Khs6hwkfgz2s2I-lf#scrollTo=EV2ke4YlNOp1 6/10
2/15/25, 10:23 PM [230103] ECON209_S2025__Lab_2.ipynb - Colab
keyboard_arrow_down lv-educ
# Histogram of 'lv_educ' with 20 bins, drawn with plain matplotlib.
educ_values = dfAd['lv_educ']
plt.hist(educ_values, bins=20)
plt.show()
# Compare the 'income' distribution across 'party_member' groups,
# one box per group.
plt.figure(figsize=(8, 5))
sns.boxplot(data=dfAd, x='party_member', y='income')
plt.show()
https://colab.research.google.com/drive/1wdiU4z6X7O8LtE1Khs6hwkfgz2s2I-lf#scrollTo=EV2ke4YlNOp1 7/10
2/15/25, 10:23 PM [230103] ECON209_S2025__Lab_2.ipynb - Colab
https://colab.research.google.com/drive/1wdiU4z6X7O8LtE1Khs6hwkfgz2s2I-lf#scrollTo=EV2ke4YlNOp1 8/10
2/15/25, 10:23 PM [230103] ECON209_S2025__Lab_2.ipynb - Colab
Unnamed:
id urban female age time_in_commune_or_ward time_in_province lv_educ no_family_members part
0
... ... ... ... ... ... ... ... ... ...
# 95% confidence interval taken from the test result `t_res`.
# NOTE(review): `t_res` is created in a cell not visible in this export;
# presumably a scipy t-test result -- confirm.
ci = t_res.confidence_interval(confidence_level=0.95)
ci  # bare expression so the notebook displays the interval
ConfidenceInterval(low=8447318.963773292, high=11672681.036226708)
import numpy as np
import scipy.stats as stats

# Inputs for a two-sided t-based calculation on the sampled 'income' column.
# NOTE(review): the remainder of this cell is not visible in this export.
alpha = 0.05
income_sample = dfAd_sample["income"]
n = len(dfAd_sample)
sample_mean = income_sample.mean()
sample_std = income_sample.std(ddof=1)  # sample standard deviation
# Upper-tail critical value of Student's t with n - 1 degrees of freedom.
t_critical = stats.t.ppf(1 - alpha / 2, df=n - 1)
https://colab.research.google.com/drive/1wdiU4z6X7O8LtE1Khs6hwkfgz2s2I-lf#scrollTo=EV2ke4YlNOp1 9/10
2/15/25, 10:23 PM [230103] ECON209_S2025__Lab_2.ipynb - Colab
# Report the test statistic stored on the t-test result `t_res`.
# The identifier must be written `t_res`; with a space inside the braces the
# f-string is a syntax error.
print(f"T-statistic: {t_res.statistic}")
https://colab.research.google.com/drive/1wdiU4z6X7O8LtE1Khs6hwkfgz2s2I-lf#scrollTo=EV2ke4YlNOp1 10/10