Week 12 Assignment

Part 1: Mutual Information Classification
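Before running the selector, it helps to recall what it scores. The mutual information between a feature X and the target Y measures how much knowing X reduces uncertainty about Y; for discrete variables,

    I(X; Y) = \sum_{x,y} p(x, y) \log \frac{p(x, y)}{p(x)\, p(y)}

A score of 0 means the feature is independent of the target, and larger scores indicate a stronger (possibly non-linear) relationship. scikit-learn's mutual_info_classif estimates this quantity from the data (using a nearest-neighbour estimator when features are continuous), and SelectKBest keeps the K highest-scoring features.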

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Load the loan dataset
df = pd.read_csv("C:\\Users\\bharg\\Downloads\\loan.csv")

# Display the first few rows of the dataset
print(df.head())

# Separate features (X) and the target variable (y)
target_column = 'Loan_Status'
X = df.drop(columns=[target_column])
y = df[target_column]

# Apply label encoding to handle categorical variables
label_encoder = LabelEncoder()
X_encoded = X.apply(label_encoder.fit_transform)

# Split the dataset into training and testing sets (80:20 ratio)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Perform Mutual Information Classification-based feature selection
k_features = 5  # Choose an appropriate value of K (number of features to select)
selector = SelectKBest(score_func=mutual_info_classif, k=k_features)
X_selected = selector.fit_transform(X_train, y_train)

# Get the indices of the selected features
selected_feature_indices = selector.get_support(indices=True)

# Print the names or indices of the selected features
selected_feature_names = X_train.columns[selected_feature_indices]
print(f'Selected features: {selected_feature_names}')

# Bonus Task: Visualize mutual information scores for each feature
mi_scores = pd.Series(selector.scores_, index=X_train.columns)
mi_scores = mi_scores.sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=mi_scores.values, y=mi_scores.index, palette='viridis')
plt.title('Mutual Information Scores for Features')
plt.xlabel('Mutual Information Score')
plt.ylabel('Features')
plt.show()

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No
1  LP001003   Male     Yes          1      Graduate            No
2  LP001005   Male     Yes          0      Graduate           Yes
3  LP001006   Male     Yes          0  Not Graduate            No
4  LP001008   Male      No          0      Graduate            No

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0
1             4583             1508.0       128.0             360.0
2             3000                0.0        66.0             360.0
3             2583             2358.0       120.0             360.0
4             6000                0.0       141.0             360.0

   Credit_History Property_Area Loan_Status
0             1.0         Urban           Y
1             1.0         Rural           N
2             1.0         Urban           Y
3             1.0         Urban           Y
4             1.0         Urban           Y

Selected features: Index(['Dependents', 'ApplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')
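As a quick follow-up (not part of the original output), the fitted selector can be applied to the held-out split and the five selected features fed to a simple classifier. This is a minimal sketch that assumes selector, X_selected, X_test, y_train, and y_test from the code above are still in scope; LogisticRegression is just one reasonable model choice:

# Sketch: evaluate the selected features on the held-out split
# (selector, X_selected, X_test, y_train, y_test come from the code above)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_test_selected = selector.transform(X_test)   # same 5 columns as X_selected
clf = LogisticRegression(max_iter=1000).fit(X_selected, y_train)
print('Accuracy on selected features:', accuracy_score(y_test, clf.predict(X_test_selected)))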
Part 2: Mutual Information Regression
pip install pandas scikit-learn

Requirement already satisfied: pandas in c:\users\bharg\anaconda3\lib\site-packages (2.0.3)
Requirement already satisfied: scikit-learn in c:\users\bharg\anaconda3\lib\site-packages (1.3.0)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\bharg\anaconda3\lib\site-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\bharg\anaconda3\lib\site-packages (from pandas) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\bharg\anaconda3\lib\site-packages (from pandas) (2023.3)
Requirement already satisfied: numpy>=1.21.0 in c:\users\bharg\anaconda3\lib\site-packages (from pandas) (1.24.3)
Requirement already satisfied: scipy>=1.5.0 in c:\users\bharg\anaconda3\lib\site-packages (from scikit-learn) (1.11.1)
Requirement already satisfied: joblib>=1.1.1 in c:\users\bharg\anaconda3\lib\site-packages (from scikit-learn) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\bharg\anaconda3\lib\site-packages (from scikit-learn) (2.2.0)
Requirement already satisfied: six>=1.5 in c:\users\bharg\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Note: you may need to restart the kernel to use updated packages.

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Housing dataset
df = pd.read_csv('C:\\Users\\bharg\\Downloads\\housing.csv')

# Separate features (X) and the target variable (y)
target_column = 'SalePrice'
X = df.drop(columns=[target_column])
y = df[target_column]

# Apply label encoding to handle categorical variables
label_encoder = LabelEncoder()
X_encoded = X.apply(label_encoder.fit_transform)

# Split the dataset into training and testing sets (80:20 ratio)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Perform Mutual Information Regression-based feature selection
k_features = 3  # Choose an appropriate value of K (number of features to select)
selector = SelectKBest(score_func=mutual_info_regression, k=k_features)
X_selected = selector.fit_transform(X_train, y_train)

# Get the indices of the selected features
selected_feature_indices = selector.get_support(indices=True)

# Print the names or indices of the selected features
selected_feature_names = X_train.columns[selected_feature_indices]
print(f'Selected features: {selected_feature_names}')

# Bonus Task: Visualize mutual information scores for each feature
mi_scores = pd.Series(selector.scores_, index=X_train.columns)
mi_scores = mi_scores.sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=mi_scores.values, y=mi_scores.index, palette='viridis')
plt.title('Mutual Information Scores for Features')
plt.xlabel('Mutual Information Score')
plt.ylabel('Features')
plt.show()

Selected features: Index(['OverallQual', 'GrLivArea', 'GarageCars'], dtype='object')
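The same check works on the regression side: transform the test split with the fitted selector and see how much of SalePrice the three chosen features explain on their own. A minimal sketch, assuming selector, X_selected, X_test, y_train, and y_test from the Part 2 code are still in scope:

# Sketch: R^2 from just the 3 selected features on the held-out split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

X_test_selected = selector.transform(X_test)   # OverallQual, GrLivArea, GarageCars
reg = LinearRegression().fit(X_selected, y_train)
print('R^2 with 3 features:', r2_score(y_test, reg.predict(X_test_selected)))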
Part 3: Linear Regression on the Housing Dataset
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Load the Housing dataset
df = pd.read_csv('C:\\Users\\bharg\\Downloads\\housing.csv')

# Separate features (X) and the target variable (y)
target_column = 'SalePrice'
X = df.drop(columns=[target_column])
y = df[target_column]

# Identify numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Create preprocessing pipelines for numerical and categorical features
numerical_pipeline = Pipeline([('imputer', SimpleImputer(strategy='mean')),
                               ('num', 'passthrough')])

categorical_pipeline = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                                 ('cat', OneHotEncoder(handle_unknown='ignore'))])

# Combine the preprocessing pipelines using ColumnTransformer
preprocessor = ColumnTransformer(transformers=[('num', numerical_pipeline, numerical_features),
                                               ('cat', categorical_pipeline, categorical_features)])

# Apply preprocessing on the entire dataset
X_preprocessed = preprocessor.fit_transform(X)

# Split the dataset into training and testing sets (80:20 ratio)
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
linear_reg_model = LinearRegression()

# Create the final pipeline with feature preprocessing and model training
pipeline = Pipeline([('regressor', linear_reg_model)])

# Fit the model to the training data
pipeline.fit(X_train, y_train)

# Predict house prices for the testing data
y_pred = pipeline.predict(X_test)

# Evaluate the performance of the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the MSE and R^2 values
print(f'Mean Squared Error (MSE): {mse}')
print(f'R-squared (R^2): {r2}')

# Plot a scatter plot between predicted and actual house prices
plt.scatter(y_test, y_pred)
plt.xlabel('Actual House Prices')
plt.ylabel('Predicted House Prices')
plt.title('Scatter Plot of Actual vs Predicted House Prices')
plt.show()

Mean Squared Error (MSE): 980359454.4892789
R-squared (R^2): 0.8721880363353154
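An MSE near 9.8e8 is hard to read on its own; its square root, roughly 31,300, says a typical prediction is off by about that many units of SalePrice. One caveat worth flagging: the preprocessing above is fit on the full dataset before the split, so the imputer and encoder see the test rows. Below is a minimal sketch of the leak-free variant, reusing X, y, preprocessor, and the imports already defined in this part (the split variable names X_tr/X_te/y_tr/y_te are new, chosen for illustration):

# Sketch: keep preprocessing inside the pipeline so it is fit on training data only
# (X, y, preprocessor, and the imports come from the cell above;
#  X_tr/X_te/y_tr/y_te are illustrative names, not from the original notebook)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

full_pipeline = Pipeline([('preprocess', preprocessor),
                          ('regressor', LinearRegression())])
full_pipeline.fit(X_tr, y_tr)                  # imputer/encoder fit on X_tr only
print('R^2 without leakage:', r2_score(y_te, full_pipeline.predict(X_te)))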
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.preprocessing import LabelEncoder

# Load the dataset (assuming it's a CSV file)
loan_data = pd.read_csv("C:\\Users\\bharg\\Downloads\\loan.csv")

# Display the first few rows of the dataset
print("Original Dataset:")
print(loan_data.head())

# Separate features (X) and target variable (y)
X = loan_data.drop("Loan_Status", axis=1)
y = loan_data["Loan_Status"]

# Apply label encoding for categorical variables
label_encoder = LabelEncoder()
for column in X.select_dtypes(include=['object']).columns:
    X[column] = label_encoder.fit_transform(X[column])

# Handle null values (you may need to customize this based on your dataset)
X.fillna(0, inplace=True)  # Filling null values with 0 for simplicity

# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use SelectKBest with Mutual Information Classification
k_best = 5  # Choose an appropriate value of K
selector = SelectKBest(mutual_info_classif, k=k_best)
X_train_selected = selector.fit_transform(X_train, y_train)

# Get the selected feature indices
selected_indices = selector.get_support(indices=True)

# Get the selected feature names
selected_features = X.columns[selected_indices]

# Print the selected features
print(f"\nSelected Features (Top {k_best}):")
print(selected_features)
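Filling nulls with 0 keeps the code short, but for a column like LoanAmount a zero is not a plausible value and can pull the statistics down. A small alternative sketch using only pandas, applied to the encoded X from the block above (X_median is a new illustrative name):

# Sketch: median imputation instead of fillna(0)
# (X is the encoded feature frame from the block above; X_median is illustrative)
X_median = X.fillna(X.median(numeric_only=True))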
