0% found this document useful (0 votes)
17 views5 pages

Project Information-Gain

Uploaded by

ayush verma
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
17 views5 pages

Project Information-Gain

Uploaded by

ayush verma
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 5

5/10/24, 9:02 PM project_information-gain

Information Gain for Classification


In [1]: import pandas as pd

In [2]: df=pd.read_csv(r"C:\Users\Administrator\Downloads\IndianWeatherRepository - Copy.cs

In [3]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94269 entries, 0 to 94268
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 latitude 94269 non-null float64
1 longitude 94269 non-null float64
2 wind_kph 94269 non-null float64
3 wind_degree 94269 non-null int64
4 pressure_mb 94269 non-null int64
5 precip_mm 94269 non-null float64
6 humidity 94269 non-null int64
7 cloud 94269 non-null int64
8 feels_like_celsius 94269 non-null float64
9 visibility_km 94269 non-null float64
10 uv_index 94269 non-null int64
11 gust_kph 94269 non-null float64
12 air_quality_Carbon_Monoxide 94269 non-null float64
13 air_quality_Ozone 94269 non-null float64
14 air_quality_Nitrogen_dioxide 94269 non-null float64
15 air_quality_Sulphur_dioxide 94269 non-null float64
16 air_quality_PM2.5 94269 non-null float64
17 air_quality_PM10 94269 non-null float64
18 air_quality_us_epa_index 94269 non-null int64
19 air_quality_gb_defra_index 94269 non-null int64
20 temperature_celsius 94269 non-null int64
dtypes: float64(13), int64(8)
memory usage: 15.1 MB

In [4]: ### Train test split to avoid overfitting


from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df.drop(labels=['temperature_celsius
df['temperature_celsius'],
test_size=0.3,
random_state=0)

In [5]: df.head()

localhost:8888/nbconvert/html/OneDrive/Desktop/project_information-gain.ipynb?download=false 1/5
5/10/24, 9:02 PM project_information-gain

Out[5]: latitude longitude wind_kph wind_degree pressure_mb precip_mm humidity cloud feels_li

0 24.57 77.72 20.5 281 1008 0.0 67 26

1 23.33 77.80 15.5 287 1008 0.0 70 19

2 22.07 78.93 18.4 317 1009 0.0 70 51

3 21.86 77.93 16.9 297 1009 0.0 76 65

4 22.75 77.72 16.2 274 1009 0.0 74 82

5 rows × 21 columns

In [6]: X_train.head()

Out[6]: latitude longitude wind_kph wind_degree pressure_mb precip_mm humidity cloud fe

76777 25.78 87.47 12.6 280 1016 0.0 57 0

70400 21.05 76.53 10.1 120 1014 0.0 45 10

21995 14.68 77.60 8.3 46 1009 0.0 51 26

11190 20.71 81.55 6.1 236 1004 0.0 92 24

67631 13.34 77.10 11.5 127 1014 0.0 74 64

In [7]: from sklearn.feature_selection import mutual_info_classif


# determine the mutual information
mutual_info = mutual_info_classif(X_train, y_train)
mutual_info

array([0.49773925, 0.42892146, 0.05164602, 0.09305195, 0.50874063,


Out[7]:
0.04991147, 0.1540715 , 0.12335889, 2.44681311, 0.08808289,
0.04700619, 0.18377026, 0.10372144, 0.04728575, 0.07277518,
0.03308988, 0.20030805, 0.1956061 , 0.11965724, 0.07815 ])

In [8]: mutual_info = pd.Series(mutual_info)


mutual_info.index = X_train.columns
mutual_info.sort_values(ascending=False)

localhost:8888/nbconvert/html/OneDrive/Desktop/project_information-gain.ipynb?download=false 2/5
5/10/24, 9:02 PM project_information-gain
feels_like_celsius 2.446813
Out[8]:
pressure_mb 0.508741
latitude 0.497739
longitude 0.428921
air_quality_PM2.5 0.200308
air_quality_PM10 0.195606
gust_kph 0.183770
humidity 0.154072
cloud 0.123359
air_quality_us_epa_index 0.119657
air_quality_Carbon_Monoxide 0.103721
wind_degree 0.093052
visibility_km 0.088083
air_quality_gb_defra_index 0.078150
air_quality_Nitrogen_dioxide 0.072775
wind_kph 0.051646
precip_mm 0.049911
air_quality_Ozone 0.047286
uv_index 0.047006
air_quality_Sulphur_dioxide 0.033090
dtype: float64

In [9]: #let's plot the ordered mutual_info values per feature


mutual_info.sort_values(ascending=False).plot.bar(figsize=(20, 8))

<AxesSubplot:>
Out[9]:

In [10]: from sklearn.feature_selection import SelectKBest


#Now we Will select the top 5 important features
sel_five_cols = SelectKBest(mutual_info_classif, k=5)
sel_five_cols.fit(X_train, y_train)
X_train.columns[sel_five_cols.get_support()]

Index(['latitude', 'longitude', 'pressure_mb', 'feels_like_celsius',


Out[10]:
'air_quality_PM2.5'],
dtype='object')

Information Gain for Regression


In [11]: import pandas as pd

localhost:8888/nbconvert/html/OneDrive/Desktop/project_information-gain.ipynb?download=false 3/5
5/10/24, 9:02 PM project_information-gain

In [12]: df=pd.read_csv(r"C:\Users\Administrator\Downloads\IndianWeatherRepository - Copy.cs

In [16]: X_train.head()

Out[16]: latitude longitude wind_kph wind_degree pressure_mb precip_mm humidity cloud fe

76777 25.78 87.47 12.6 280 1016 0.0 57 0

70400 21.05 76.53 10.1 120 1014 0.0 45 10

21995 14.68 77.60 8.3 46 1009 0.0 51 26

11190 20.71 81.55 6.1 236 1004 0.0 92 24

67631 13.34 77.10 11.5 127 1014 0.0 74 64

In [19]: from sklearn.feature_selection import mutual_info_regression


# determine the mutual information
mutual_info = mutual_info_regression(X_train, y_train)
mutual_info

array([0.49507254, 0.42753681, 0.05096077, 0.09034756, 0.5081671 ,


Out[19]:
0.05317568, 0.15921893, 0.11976072, 2.44502172, 0.08630915,
0.04547204, 0.18488056, 0.09762834, 0.05107666, 0.07157416,
0.02716428, 0.19711506, 0.19254089, 0.1176653 , 0.08096175])

In [20]: mutual_info = pd.Series(mutual_info)


mutual_info.index = X_train.columns
mutual_info.sort_values(ascending=False)

feels_like_celsius 2.445022
Out[20]:
pressure_mb 0.508167
latitude 0.495073
longitude 0.427537
air_quality_PM2.5 0.197115
air_quality_PM10 0.192541
gust_kph 0.184881
humidity 0.159219
cloud 0.119761
air_quality_us_epa_index 0.117665
air_quality_Carbon_Monoxide 0.097628
wind_degree 0.090348
visibility_km 0.086309
air_quality_gb_defra_index 0.080962
air_quality_Nitrogen_dioxide 0.071574
precip_mm 0.053176
air_quality_Ozone 0.051077
wind_kph 0.050961
uv_index 0.045472
air_quality_Sulphur_dioxide 0.027164
dtype: float64

In [21]: mutual_info.sort_values(ascending=False).plot.bar(figsize=(15,5))

<AxesSubplot:>
Out[21]:

localhost:8888/nbconvert/html/OneDrive/Desktop/project_information-gain.ipynb?download=false 4/5
5/10/24, 9:02 PM project_information-gain

In [23]: from sklearn.feature_selection import SelectPercentile


## Selecting the top 20 percentile
selected_top_columns = SelectPercentile(mutual_info_regression, percentile=20)
selected_top_columns.fit(X_train, y_train)

SelectPercentile(percentile=20,
Out[23]:
score_func=<function mutual_info_regression at 0x000001BFAEBB9CA0
>)

In [25]: X_train.columns[selected_top_columns.get_support()]

Index(['latitude', 'longitude', 'pressure_mb', 'feels_like_celsius'], dtype='objec


Out[25]:
t')

In [ ]:

localhost:8888/nbconvert/html/OneDrive/Desktop/project_information-gain.ipynb?download=false 5/5

You might also like