Q 1
Q 1
In [22]: #QUESTION 1
#Predicting Housing Prices (Regression & Classification)
import zipfile
import os
['.anaconda', '.conda', '.condarc', '.continuum', '.ipynb_checkpoints', '.ipython', '.jupyter', '.matplotlib', '.spyder-py3', 'anaconda3', 'AppData', 'Application Data', 'Contacts', 'Cookies', 'Documents',
'Downloads', 'extracted_data', 'Favorites', 'house-prices-advanced-regression-techniques.zip', 'Links', 'Local Settings', 'Music', 'My Documents', 'NetHood', 'NTUSER.DAT', 'ntuser.dat.LOG1', 'ntuser.dat.LOG
2', 'NTUSER.DAT{a2332f18-cdbf-11ec-8680-002248483d79}.TM.blf', 'NTUSER.DAT{a2332f18-cdbf-11ec-8680-002248483d79}.TMContainer00000000000000000001.regtrans-ms', 'NTUSER.DAT{a2332f18-cdbf-11ec-8680-002248483d7
9}.TMContainer00000000000000000002.regtrans-ms', 'ntuser.ini', 'OneDrive', 'PrintHood', 'q1.ipynb', 'Recent', 'Saved Games', 'Searches', 'SendTo', 'Start Menu', 'Templates', 'Videos']
[5 rows x 81 columns]
In [9]: # Fill missing values for numerical columns with the median
# Impute each float64/int64 column with that column's own median.
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
for name in numeric_columns:
    series = df[name]
    df[name] = series.fillna(series.median())
# Hold out 20% of the samples for testing; the fixed random_state makes
# the shuffle (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
Model Comparison:
Model MAE MSE R²
Linear Regression 25953.42903568955 1657205608.2597325 0.7839458761601398
Random Forest 19991.286801206785 908571173.644018 0.8815472576912461
SalePrice PriceCategory
0 208500 Luxury
1 181500 Luxury
2 223500 Luxury
3 140000 Affordable
4 250000 Luxury
Accuracy: 0.8732876712328768
Classification Report:
precision recall f1-score support
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
Accuracy: 0.9315068493150684
Classification Report:
precision recall f1-score support
# Print the per-model accuracy table (one row per classifier).
print(model_comparison)
Model Accuracy
0 Logistic Regression 0.873288
1 Random Forest 0.931507
In [ ]: #QUESTION 2
#2. Spam Email Detection (Classification & SVM)
text spam
0 Subject: naturally irresistible your corporate... 1
1 Subject: the stock trading gunslinger fanny i... 1
2 Subject: unbelievable new homes made easy im ... 1
3 Subject: 4 color printing special request add... 1
4 Subject: do not have money , get software cds ... 1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 text 5728 non-null object
1 spam 5728 non-null int64
dtypes: int64(1), object(1)
memory usage: 89.6+ KB
None
text 0
spam 0
dtype: int64
In [26]: import re
def clean_text(text):
    """Normalise a raw e-mail string for vectorisation.

    Every character that is not a letter (punctuation, symbols, digits,
    underscores, whitespace) is replaced by a single space, then the
    result is lower-cased. Replacement is one-for-one, so the output has
    the same length as the input and runs of spaces are not collapsed.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        Lower-cased text containing only letters and spaces.
    """
    # `\W` alone keeps digits and underscores because both count as "word"
    # characters; add them explicitly so numbers are really removed.
    text = re.sub(r'[\W\d_]', ' ', text)
    # Convert to lowercase
    return text.lower()
# Labels to predict: the 'spam' column of the e-mail frame
y = df['spam']
# Reserve 20% of the samples for testing; random_state fixes the shuffle
# so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
In [ ]: #QUESTION 3
#3. Customer Churn Prediction (Classification & Tree-Based Models)
[5 rows x 21 columns]
# Convert the 'Churn' column to a binary numeric variable (1 for Yes, 0 for No).
# The mapping also accepts the already-encoded values 1 and 0, so re-running
# this cell is a no-op instead of turning every entry into NaN (Series.map
# yields NaN for any value missing from the dict).
churn_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
df['Churn'] = df['Churn'].map(churn_codes)
customerID 0
gender 0
SeniorCitizen 0
Partner 0
Dependents 0
tenure 0
PhoneService 0
MultipleLines 0
InternetService 0
OnlineSecurity 0
OnlineBackup 0
DeviceProtection 0
TechSupport 0
StreamingTV 0
StreamingMovies 0
Contract 0
PaperlessBilling 0
PaymentMethod 0
MonthlyCharges 0
TotalCharges 0
Churn 0
dtype: int64
Features shape: (7032, 29)
Target shape: (7032,)
# Initialize classifiers
logistic_model = LogisticRegression(max_iter=1000)
# Decision trees permute features randomly at every split, so an unseeded
# tree can give different scores on each run. Seed it with the same value
# used for the train/test split to keep the comparison reproducible.
decision_tree_model = DecisionTreeClassifier(random_state=42)
# Compare performance: score every fitted classifier on the held-out test set
fitted_models = {
    'Logistic Regression': logistic_model,
    'Decision Tree': decision_tree_model,
    'Random Forest': rf_model,
}
results = {
    'Model': list(fitted_models),
    'Accuracy': [clf.score(X_test, y_test) for clf in fitted_models.values()],
}
In [ ]: #QUESTION 4
# Image Dimensionality Reduction (Dimensionality Reduction & Visualization)
label 1x1 1x2 1x3 1x4 1x5 1x6 1x7 1x8 1x9 ... 28x19 28x20 \
0 5 0 0 0 0 0 0 0 0 0 ... 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0
2 4 0 0 0 0 0 0 0 0 0 ... 0 0
3 1 0 0 0 0 0 0 0 0 0 ... 0 0
4 9 0 0 0 0 0 0 0 0 0 ... 0 0
# Standardise the feature matrix: learn the scaling statistics from X,
# then apply them to X. The fitted scaler stays available as `scaler`.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)