Recsify Technologies Assignment
Recsify Technologies Assignment
This document explains each part of the code, following the steps of the machine learning
workflow: data exploration insights, data visualizations, model performance, and the main
deciding factors associated with risk.
# Data Exploration
# NOTE(review): `data` must already hold the loaded dataset; the loading step
# is not shown in this document.
# Each print emits the heading and the table on separate lines.
print("First five rows of the dataset:", data.head(), sep="\n")
print("\nStatistical summary:", data.describe(), sep="\n")
2. Data Visualization
• Target Variable Distribution: A count plot is created to visualize the distribution of the
Risk_Flag variable.
• Feature Distribution: Histograms for all numerical features are plotted to understand their
distributions.
# Data Visualization
# Count plot of the Risk_Flag column, written to disk and then displayed.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Risk_Flag', data=data, ax=ax)
ax.set_title('Distribution of Risk Flag')
fig.savefig('risk_flag_distribution.png')
plt.show()
3. Data Preprocessing
• Encoding Categorical Variables: Categorical features are converted to numeric using
LabelEncoder.
4. Correlation Heatmap
• Heatmap: A correlation heatmap is plotted to show the correlations between features. This
helps identify multicollinearity and the relationship between features and the target variable.
# Correlation Heatmap
# Correlation is only defined for numeric columns. Select them explicitly so
# the call does not depend on the installed pandas version's default handling
# of non-numeric columns (pandas >= 2.0 raises on string columns).
numeric_data = data.select_dtypes(include='number')
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_data.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.savefig('correlation_heatmap.png')
plt.show()
# Feature Engineering
# The 'Id' column and the label are excluded from the feature matrix.
target_column = 'Risk_Flag'
y = data[target_column]
X = data.drop(columns=['Id', target_column])

# Splitting the data into training and testing sets
# 30% of the rows are held out; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# Model Building
# NOTE(review): the hyper-parameter grid used for the original search is not
# shown in this document; the values below are placeholders - confirm.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
}
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=3,
)
grid_search.fit(X_train, y_train)

# With the default refit=True, best_estimator_ has already been refitted on
# all of X_train using the winning parameters, so no second fit() is needed.
best_model = grid_search.best_estimator_
7. Model Evaluation
• Predictions: The model makes predictions on the test set.
• Evaluation Metrics: The classification report, confusion matrix, and accuracy score are
printed to evaluate the model's performance.
# Predictions and Evaluation
# Hard class labels feed the printed metrics; the second predict_proba column
# (the probability of best_model.classes_[1]) is stored in y_pred_prob.
y_pred = best_model.predict(X_test)
y_pred_prob = best_model.predict_proba(X_test)[:, 1]

# Each entry pairs a heading with the metric printed beneath it.
evaluation_sections = (
    ("\nClassification Report:", classification_report(y_test, y_pred)),
    ("\nConfusion Matrix:", confusion_matrix(y_test, y_pred)),
    ("\nAccuracy Score:", accuracy_score(y_test, y_pred)),
)
for heading, result in evaluation_sections:
    print(heading)
    print(result)
8. ROC Curve
• ROC Curve: The ROC curve is plotted, with the AUC score shown in the legend, to evaluate
the model's performance in distinguishing between the classes.
# ROC Curve
# The thresholds returned by roc_curve are not needed for the plot.
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
# Dashed diagonal from (0, 0) to (1, 1) for visual comparison.
ax.plot([0, 1], [0, 1], linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
fig.savefig('roc_curve.png')
plt.show()
9. Feature Importance
• Feature Importance: The importance of each feature in the random forest model is
plotted to understand which features are the main deciding factors associated with risk.
# Feature Importance
# Pair every feature column with the fitted model's importance score and
# order the table from most to least important.
features = X.columns
feature_importances = best_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, ax=ax)
ax.set_title('Feature Importance')
fig.savefig('feature_importance.png')
plt.show()
Complete Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    roc_curve,
    roc_auc_score,
)
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
# Data Exploration
# NOTE(review): `data` must already hold the loaded dataset; the loading step
# is not shown in this document.
# Each print emits the heading and the table on separate lines.
print("First five rows of the dataset:", data.head(), sep="\n")
print("\nStatistical summary:", data.describe(), sep="\n")
# Data Visualization
# Count plot of the Risk_Flag column, written to disk and then displayed.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Risk_Flag', data=data, ax=ax)
ax.set_title('Distribution of Risk Flag')
fig.savefig('risk_flag_distribution.png')
plt.show()
# Correlation Heatmap
# Correlation is only defined for numeric columns. Select them explicitly so
# the call does not depend on the installed pandas version's default handling
# of non-numeric columns (pandas >= 2.0 raises on string columns).
numeric_data = data.select_dtypes(include='number')
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_data.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.savefig('correlation_heatmap.png')
plt.show()
# Feature Engineering
X = data.drop(columns=['Id', 'Risk_Flag'])
y = data['Risk_Flag']

# Convert any remaining non-numeric feature columns to integer codes with
# LabelEncoder. This is a no-op when every feature column is already numeric.
for column in X.select_dtypes(exclude='number').columns:
    X[column] = LabelEncoder().fit_transform(X[column])

# Splitting the data into training and testing sets
# 30% of the rows are held out; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Model Building
# NOTE(review): the hyper-parameter grid used for the original search is not
# shown in this document; the values below are placeholders - confirm.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
}
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=3,
)
grid_search.fit(X_train, y_train)

# With the default refit=True, best_estimator_ has already been refitted on
# all of X_train using the winning parameters, so no second fit() is needed.
best_model = grid_search.best_estimator_

# Predictions and Evaluation
# Hard class labels feed the printed metrics; the second predict_proba column
# (the probability of best_model.classes_[1]) is stored in y_pred_prob.
y_pred = best_model.predict(X_test)
y_pred_prob = best_model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))
# ROC Curve
# The thresholds returned by roc_curve are not needed for the plot.
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
# Dashed diagonal from (0, 0) to (1, 1) for visual comparison.
ax.plot([0, 1], [0, 1], linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
fig.savefig('roc_curve.png')
plt.show()
# Feature Importance
# Pair every feature column with the fitted model's importance score and
# order the table from most to least important.
features = X.columns
feature_importances = best_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, ax=ax)
ax.set_title('Feature Importance')
fig.savefig('feature_importance.png')
plt.show()
Output: