Slip Regression Classification

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import r2_score, mean_squared_error

# Linear Regression
X = sm.add_constant(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
reg = sm.OLS(y_train, X_train).fit()  #reg.summary()
y_pred = reg.predict(X_test)
params = reg.params
p_val = reg.pvalues[1]
print(f"Coefficients: b0: {params['const']}, b1: {params['Salary']}")
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
residuals = reg.resid
std_residuals = reg.get_influence().resid_studentized_internal

influence = reg.get_influence()
cook_distance = influence.cooks_distance[0]  #plt.stem()
leverage = influence.hat_matrix_diag
leverage_threshold = 3 * (len(X_train.columns) + 1) / len(X_train)
#cooks_threshold = 1
#cook_outliers = np.where(cook_distance > cooks_threshold)[0]
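# A minimal sketch of how these diagnostics could be inspected; assumes matplotlib is
# available, and the thresholds are only the rule-of-thumb values from the comments above.
import matplotlib.pyplot as plt
plt.stem(np.arange(len(cook_distance)), cook_distance)
plt.xlabel("Observation index")
plt.ylabel("Cook's distance")
plt.show()
cook_outliers = np.where(cook_distance > 1)[0]
leverage_outliers = np.where(leverage > leverage_threshold)[0]
print("High Cook's distance points :", cook_outliers)
print("High leverage points :", leverage_outliers)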
#MLR
from statsmodels.stats.outliers_influence import variance_inflation_factor
def get_vif_factors(input_df):
    vif = pd.DataFrame()
    vif["Features"] = input_df.columns
    vif["VIF"] = [variance_inflation_factor(input_df.values, i)
                  for i in range(input_df.shape[1])]
    return vif
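# Possible usage of the helper above: compute VIFs for the training predictors (dropping the
# constant column added earlier) and flag columns above 4, a common rule-of-thumb cutoff.
vif_factors = get_vif_factors(X_train.drop(columns=['const']))
print(vif_factors[vif_factors.VIF > 4])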
#Residual plot between standardized model.fittedvalues and standardized model.resid
#get_standardized_values = lambda x : (x - np.mean(x))/(np.std(x))
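# One way the residual plot described above might be drawn for the fitted model reg,
# assuming matplotlib is available; it uses the standardisation lambda from the comment.
import matplotlib.pyplot as plt
get_standardized_values = lambda x: (x - np.mean(x)) / (np.std(x))
plt.scatter(get_standardized_values(reg.fittedvalues),
            get_standardized_values(reg.resid))
plt.xlabel("Standardized fitted values")
plt.ylabel("Standardized residuals")
plt.show()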
#Logistic Regression
X = pd.get_dummies(X_features, drop_first=True)
X = sm.add_constant(X)
model_1 = sm.Logit(train_Y, train_X).fit()
significant_features = model_1.pvalues[model_1.pvalues < 0.05].index
#make model_2 with the significant features
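# A minimal sketch of how model_2 (used below) might be refit on the significant columns.
# Keeping 'const' and subsetting test_X the same way are assumptions, made so that the
# model_2.predict(test_X) call in the cutoff loop below still lines up.
cols = [c for c in train_X.columns if c in significant_features or c == 'const']
train_X = train_X[cols]
test_X = test_X[cols]
model_2 = sm.Logit(train_Y, train_X).fit()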
from sklearn import metrics
cutoff = np.arange(0.1, 0.91, 0.01)
youdens_index = []
cutoff_index = []
for i in cutoff:
    predicted_values = model_2.predict(test_X)
    predicted_values = (predicted_values > i).astype(int)
    confusion_matrix = metrics.confusion_matrix(test_Y, predicted_values)
    sensitivity = confusion_matrix[1][1] / (confusion_matrix[1][1] + confusion_matrix[1][0])
    specificity = confusion_matrix[0][0] / (confusion_matrix[0][0] + confusion_matrix[0][1])
    youden_index = sensitivity + specificity - 1
    youdens_index.append(youden_index)
    cutoff_index.append(i)

print("Youden index : ", youdens_index)


max_youden = -40
optimal_cutoff = -1
for i in range(0, len(youdens_index)):
    if youdens_index[i] > max_youden:
        max_youden = youdens_index[i]
        optimal_cutoff = cutoff_index[i]

#print( metrics.classification_report( test_Y, predicted_values ) )


'''For a cost-based cutoff: compute cost = 5*confusion_matrix[0][1] + confusion_matrix[1][0]
for each cutoff and then take the minimum, instead of the maximum used for the Youden index.'''
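# A sketch of that cost-based selection, reusing the cutoff grid above; the 5:1 weighting of
# false positives to false negatives is the one given in the note.
costs = []
for i in cutoff:
    predicted_values = (model_2.predict(test_X) > i).astype(int)
    cm = metrics.confusion_matrix(test_Y, predicted_values)
    costs.append(5 * cm[0][1] + cm[1][0])
min_cost_cutoff = cutoff[int(np.argmin(costs))]
print("Minimum-cost cutoff : ", min_cost_cutoff)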

#Classification #DecisionTree
from sklearn.tree import DecisionTreeClassifier
data = pd.get_dummies(data, columns=['famhist'])  #no drop first
model1 = DecisionTreeClassifier(criterion='gini', max_depth=6, random_state=42)  #gini
model3 = DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=42)  #info gain
model1.fit(X_train, y_train)

from sklearn.tree import plot_tree
plot_tree(model1, feature_names=list(X_features.columns),
          class_names=["No CHD", "CHD"], filled=True, rounded=True)

#KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample, shuffle
upsampled_not_joined = resample(not_joined, replace=True, n_samples=4000)
dfs = [joined, upsampled_not_joined]
new_df = pd.concat(dfs)
new_df = shuffle(new_df)

X = pd.get_dummies(X_features, drop_first=True)
Y = new_df.Status.map(lambda x: int(x == 'Joined'))
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, train_size=0.8, random_state=42)
knn_clf = KNeighborsClassifier()
knn_clf.fit(train_X, train_Y)

from sklearn.model_selection import GridSearchCV
tuned_parameters = [{'n_neighbors': range(5, 10),
                     'metric': ['canberra', 'euclidean', 'minkowski']}]
clf = GridSearchCV(KNeighborsClassifier(), tuned_parameters, cv=10, scoring='roc_auc')
clf.fit(train_X, train_Y)
print("Best score for KNN is : ", clf.best_score_)
print("Best parameter for KNN is : ", clf.best_params_)

#ensemble
from sklearn.ensemble import RandomForestClassifier
radm_clf = RandomForestClassifier(max_depth=10, n_estimators=10)
radm_clf.fit(train_X, train_Y)

from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
logreg_clf = LogisticRegression()
ada_clf = AdaBoostClassifier(logreg_clf, n_estimators=50)
ada_clf.fit(train_X, train_Y)

#SVM
from sklearn.svm import SVC
svm_clf = SVC(kernel='linear', C=1.0, probability=True)
svm_clf_poly = SVC(kernel='poly', degree=3, probability=True)
svm_clf_rbf = SVC(kernel='rbf', gamma=0.1, probability=True)
svm_clf_sigmoid = SVC(kernel='sigmoid', gamma='scale', coef0=0.0, probability=True)
svm_clf.fit(train_X, train_Y)
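# One way the classifiers fitted on train_X/train_Y above might be compared on the held-out
# split; reporting accuracy plus ROC AUC is an assumption, mirroring the grid-search scoring.
from sklearn.metrics import accuracy_score, roc_auc_score
for name, fitted in [("KNN", knn_clf), ("Random forest", radm_clf),
                     ("AdaBoost", ada_clf), ("SVM linear", svm_clf)]:
    pred = fitted.predict(test_X)
    proba = fitted.predict_proba(test_X)[:, 1]
    print(name, "accuracy:", accuracy_score(test_Y, pred),
          "roc_auc:", roc_auc_score(test_Y, proba))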