Step-by-step data processing for an ML project (credit-risk modeling)
python
Copy
Download
import pandas as pd
import numpy as np

# Load the LendingClub accepted-loans extract (gzipped CSV).
LOAN_DATA_PATH = 'accepted_2007_to_2018Q4.csv.gz'
df = pd.read_csv(LOAN_DATA_PATH, compression='gzip', low_memory=True)

# First look at the raw table: column names and dtype distribution.
print("Columns:\n", df.columns.tolist())
print("Data types:\n", df.dtypes.value_counts())
python
Copy
Download
python
Copy
Download
features = [
'inq_last_6mths', 'mths_since_last_delinq',
'mths_since_last_record',
cat_cols = df.select_dtypes(include=['object']).columns
python
Copy
Download
# Prediction targets: binary default flag and, for defaulted loans,
# the dollar amount lost.
y_class = df['default']
y_reg = df['default_amount']
# Train the loss regressor only on loans that actually defaulted —
# the loss amount is only meaningful for defaults.
# NOTE(review): X_train / y_class_train come from a train-test split
# performed elsewhere in this file — confirm it runs before this.
X_reg_train = X_train[y_class_train == 1]
y_reg_train = y_reg[y_class_train == 1]
python
Copy
Download
# Fit the default classifier; balanced class weights compensate for
# the rarity of defaults in the data.
clf = RandomForestClassifier(n_estimators=100,
                             random_state=42, class_weight='balanced')
clf.fit(X_train, y_class_train)

# Predicted probability of default (positive class).
y_class_pred_proba = clf.predict_proba(X_test)[:, 1]

# Evaluate at the conventional 0.5 threshold. Hoisting the hard
# predictions into a local avoids the multiline f-string expression,
# which was a syntax error before Python 3.12.
y_class_pred = (y_class_pred_proba > 0.5).astype(int)
print("Classification Metrics:")
print(f"Accuracy: {accuracy_score(y_class_test, y_class_pred):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_class_test, y_class_pred_proba):.4f}")
python
Copy
Download
# Fit the loss-amount regressor on defaulted loans only.
reg = RandomForestRegressor(n_estimators=100,
                            random_state=42)
reg.fit(X_reg_train, y_reg_train)
# Predict losses for every test row; downstream code uses the full vector.
y_reg_pred = reg.predict(X_test)
default_mask = y_class_test == 1
if sum(default_mask) > 0:
    # Score only on rows that actually defaulted. Select actual losses
    # by label — indexing y_reg with a boolean Series built on the test
    # subset is unalignable and raises in modern pandas.
    actual_losses = y_reg.loc[y_class_test.index[default_mask]]
    print(f"RMSE: {np.sqrt(mean_squared_error(actual_losses, y_reg_pred[default_mask])):.2f}")
else:
    # The pasted original had a dangling `else:` — give it a body.
    print("No defaulted loans in the test set — regression metrics skipped.")
python
Copy
Download
# Assemble a per-loan summary table on the test set.
# NOTE(review): expected_loss is assumed to be computed earlier as
# P(default) * predicted loss — confirm against the surrounding code.
summary_cols = {
    'P(Default)': y_class_pred_proba,
    'Predicted_Loss_Amount': y_reg_pred,
    'Expected_Loss': expected_loss,
    'Actual_Default': y_class_test,
    'Actual_Loss_Amount': y_reg[y_class_test.index],
}
results = X_test.copy()
for col_name, values in summary_cols.items():
    results[col_name] = values
print(results[list(summary_cols)].head(10))
python
Copy
Download
# Hyper-parameter search for the default classifier.
# NOTE(review): the original grid contents and GridSearchCV keyword
# arguments were lost in the paste — these are representative values
# that restore valid syntax; adjust to the intended search space.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
}
grid_search = GridSearchCV(
    RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_search.fit(X_train, y_class_train)
best_clf = grid_search.best_estimator_
python
Copy
Download
# Rank features by the tuned classifier's impurity-based importances.
# NOTE(review): `X` is assumed to be the feature frame the model was
# trained on — confirm its columns align with feature_importances_.
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': best_clf.feature_importances_}
)
feature_importance = feature_importance.sort_values('Importance',
                                                    ascending=False)

# Plot the 20 most important features.
top_features = feature_importance.head(20)
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=top_features)
plt.show()
python
Copy
Download
# NOTE(review): the contents of this first figure were lost in the
# paste — as written it opens and shows an empty canvas.
plt.figure(figsize=(10, 6))
plt.show()

# Predicted vs. actual loss, restricted to loans that actually defaulted.
if sum(default_mask) > 0:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=results.loc[default_mask, 'Predicted_Loss_Amount'],
                    y=results.loc[default_mask, 'Actual_Loss_Amount'])
    # 45-degree reference line (perfect prediction). The original
    # plt.plot call was left unclosed by the paste; this reconstructs
    # the conventional identity line.
    max_loss = max(results.loc[default_mask, 'Actual_Loss_Amount'])
    plt.plot([0, max_loss], [0, max_loss], 'r--')
    plt.show()
2. Alternative models:
3. Economic analysis:
Key Considerations
o Oversampling/undersampling techniques
Use the LendingClub Loan Data (or similar credit risk dataset).
Download:
python
Copy
Download
import pandas as pd
# Load the LendingClub accepted-loans extract (gzipped CSV).
# low_memory=True parses in chunks and can produce mixed-dtype columns.
df = pd.read_csv('accepted_2007_to_2018Q4.csv.gz', compression='gzip',
low_memory=True)
# Quick sanity check on what was loaded.
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
python
Copy
Download
df['default_amount'] = df.apply(
axis=1
python
Copy
Download
features = [
python
Copy
Download
# Drop columns that are more than 30% missing, then drop any remaining
# rows containing missing values.
# NOTE(review): dropping every row with any NaN can discard a lot of
# data on this dataset — confirm the loss is acceptable.
min_non_null = int(len(df) * 0.7)  # dropna's thresh expects an integer count
df = df.dropna(thresh=min_non_null, axis=1)
df = df.dropna()
python
Copy
Download
# Object-dtype columns remaining after cleanup; candidates for encoding.
categorical_cols = df.select_dtypes(include=['object']).columns
python
Copy
Download
# Standardize numeric features to zero mean / unit variance.
# NOTE(review): the scaler is fitted on the full dataset before the
# train-test split (Step 3 below), which leaks test-set statistics
# into training — consider fitting on the training split only.
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
Step 3: Train-Test Split
python
Copy
Download
python
Copy
Download
# Restrict the regression training data to loans that defaulted — the
# loss amount is only meaningful for defaults.
X_reg_train = X_train[y_class_train == 1]
y_reg_train = y_reg[y_class_train == 1]
python
Copy
Download
from sklearn.ensemble import RandomForestClassifier

# The pasted original called clf.fit without ever constructing clf;
# reconstructed to match the classifier configured earlier in this file.
clf = RandomForestClassifier(n_estimators=100, random_state=42,
                             class_weight='balanced')
clf.fit(X_train, y_class_train)
# Predicted probability of default (positive class).
y_class_pred_proba = clf.predict_proba(X_test)[:, 1]
# Evaluate
print("Classification Metrics:")
python
Copy
Download
# NOTE(review): `reg` must be a regressor constructed earlier (e.g. a
# RandomForestRegressor) — its definition is missing from this fragment.
reg.fit(X_reg_train, y_reg_train)
default_mask = y_class_test == 1
if sum(default_mask) > 0:
    # Evaluate loss predictions only on loans that actually defaulted.
    # (The paste had lost this block's indentation, leaving the `if`
    # body and `else:` dangling — a syntax error.)
    y_reg_pred = reg.predict(X_test[default_mask])
    print("\nRegression Metrics:")
else:
    print("No defaulted loans in the test set — regression metrics skipped.")
python
Copy
Download
import joblib

# Persist the fitted estimators and the scaler so they can be reused
# without retraining.
for artifact, path in (
    (clf, 'default_classifier.pkl'),
    (reg, 'loss_regressor.pkl'),
    (scaler, 'feature_scaler.pkl'),
):
    joblib.dump(artifact, path)
# To restore later:
# clf = joblib.load('default_classifier.pkl')
# reg = joblib.load('loss_regressor.pkl')
# scaler = joblib.load('feature_scaler.pkl')
python
Copy
Download
# Score the held-out set: probability of default and predicted loss.
# NOTE(review): the pasted original used p_default and expected_loss
# without assigning them — the defining lines were lost; reconstructed
# here from the identical pattern used earlier in this file.
p_default = clf.predict_proba(X_test)[:, 1]
loss_amount = reg.predict(X_test)
# Expected loss = P(default) x predicted loss given default.
expected_loss = p_default * loss_amount
results = X_test.copy()
results['P(Default)'] = p_default.round(4)
results['Predicted_Loss'] = loss_amount.round(2)
results['Expected_Loss'] = expected_loss.round(2)
print(results.head())