Group 2 TH
Group 2 TH
[390]: # This Python 3 environment comes with many helpful analytics libraries installed
import os

# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/city-u-10-f-fun-ai-final-project/10F_train.csv
/kaggle/input/city-u-10-f-fun-ai-final-project/10F_sample_submission.csv
/kaggle/input/city-u-10-f-fun-ai-final-project/10F_test.csv
# Read the training and test data.
# Each path is kept as one unbroken string literal: a quoted string that is
# split across physical lines is a syntax error.
train = pd.read_csv('/kaggle/input/city-u-10-f-fun-ai-final-project/10F_train.csv')
test = pd.read_csv('/kaggle/input/city-u-10-f-fun-ai-final-project/10F_test.csv')
[392]: train.head()
1
[392]: id CustomerId Surname CreditScore Geography Gender Age Tenure \
0 1 15749177 Okwudiliolisa 627 France Male 33.0 1
1 2 15694510 Hsueh 678 France Male 40.0 10
2 3 15741417 Kao 581 France Male 34.0 2
3 4 15766172 Chiemenam 716 Spain Male 33.0 5
4 5 15771669 Genovese 588 Germany Male 36.0 4
Exited
0 0
1 0
2 0
3 0
4 1
[393]: train.describe()
2
min 0.000000 11.580000 0.00000
25% 0.000000 74835.650000 0.00000
50% 0.000000 118024.100000 0.00000
75% 1.000000 155616.750000 0.00000
max 1.000000 199992.480000 1.00000
[394]: print(train.shape)
(132027, 14)
[395]: test.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132027 entries, 0 to 132026
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 132027 non-null int64
1 CustomerId 132027 non-null int64
2 Surname 132027 non-null object
3 CreditScore 132027 non-null int64
4 Geography 132027 non-null object
5 Gender 132027 non-null object
6 Age 132027 non-null float64
7 Tenure 132027 non-null int64
8 Balance 132027 non-null float64
9 NumOfProducts 132027 non-null int64
10 HasCrCard 132027 non-null int64
11 IsActiveMember 132027 non-null int64
12 EstimatedSalary 132027 non-null float64
13 Exited 132027 non-null int64
3
dtypes: float64(3), int64(8), object(3)
memory usage: 14.1+ MB
None
[397]: print(test.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33007 entries, 0 to 33006
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 33007 non-null int64
1 CustomerId 33007 non-null int64
2 Surname 33007 non-null object
3 CreditScore 33007 non-null int64
4 Geography 33007 non-null object
5 Gender 33007 non-null object
6 Age 33007 non-null int64
7 Tenure 33007 non-null int64
8 Balance 33007 non-null float64
9 NumOfProducts 33007 non-null int64
10 HasCrCard 33007 non-null int64
11 IsActiveMember 33007 non-null int64
12 EstimatedSalary 33007 non-null float64
dtypes: float64(2), int64(8), object(3)
memory usage: 3.3+ MB
None
id 0
CustomerId 0
Surname 0
CreditScore 0
Geography 0
Gender 0
Age 0
Tenure 0
Balance 0
NumOfProducts 0
HasCrCard 0
IsActiveMember 0
EstimatedSalary 0
Exited 0
dtype: int64
[399]: print(test.isnull().sum())
4
id 0
CustomerId 0
Surname 0
CreditScore 0
Geography 0
Gender 0
Age 0
Tenure 0
Balance 0
NumOfProducts 0
HasCrCard 0
IsActiveMember 0
EstimatedSalary 0
dtype: int64
Exited
0 104061
1 27966
Name: count, dtype: int64
5
[403]: cols = ['Gender','Geography','HasCrCard','IsActiveMember']  # column names that exist in the train frame
n_rows = 2  # presumably the subplot grid height; the plotting code of this cell is not visible - TODO confirm
n_cols = 3  # presumably the subplot grid width; the plotting code of this cell is not visible - TODO confirm
6
[404]: sns.histplot(data=train, x='Age', hue= 'Exited', bins = 40, kde=True)
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1119: FutureWarning:
use_inf_as_na option is deprecated and will be removed in a future version.
Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1075: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple
to get_group in a future version of pandas. Pass `(name,)` instead of `name` to
silence this warning.
data_subset = grouped_data.get_group(pd_key)
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1075: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple
to get_group in a future version of pandas. Pass `(name,)` instead of `name` to
silence this warning.
data_subset = grouped_data.get_group(pd_key)
7
[405]: sns.histplot (data=train, x='CreditScore', hue = 'Exited', bins = 40)
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1119: FutureWarning:
use_inf_as_na option is deprecated and will be removed in a future version.
Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1075: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple
to get_group in a future version of pandas. Pass `(name,)` instead of `name` to
silence this warning.
data_subset = grouped_data.get_group(pd_key)
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1075: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple
to get_group in a future version of pandas. Pass `(name,)` instead of `name` to
silence this warning.
data_subset = grouped_data.get_group(pd_key)
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1075: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple
to get_group in a future version of pandas. Pass `(name,)` instead of `name` to
silence this warning.
data_subset = grouped_data.get_group(pd_key)
8
[405]: <Axes: xlabel='CreditScore', ylabel='Count'>
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1119: FutureWarning:
use_inf_as_na option is deprecated and will be removed in a future version.
Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1075: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple
to get_group in a future version of pandas. Pass `(name,)` instead of `name` to
silence this warning.
data_subset = grouped_data.get_group(pd_key)
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1075: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple
to get_group in a future version of pandas. Pass `(name,)` instead of `name` to
silence this warning.
data_subset = grouped_data.get_group(pd_key)
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1075: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple
to get_group in a future version of pandas. Pass `(name,)` instead of `name` to
silence this warning.
9
data_subset = grouped_data.get_group(pd_key)
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1119: FutureWarning:
use_inf_as_na option is deprecated and will be removed in a future version.
Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1075: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple
to get_group in a future version of pandas. Pass `(name,)` instead of `name` to
silence this warning.
data_subset = grouped_data.get_group(pd_key)
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1075: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple
to get_group in a future version of pandas. Pass `(name,)` instead of `name` to
silence this warning.
data_subset = grouped_data.get_group(pd_key)
10
[408]: sns.histplot (data=train, x='EstimatedSalary', hue = 'Exited', bins = 40)
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1119: FutureWarning:
use_inf_as_na option is deprecated and will be removed in a future version.
Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1075: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple
to get_group in a future version of pandas. Pass `(name,)` instead of `name` to
silence this warning.
data_subset = grouped_data.get_group(pd_key)
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1075: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple
to get_group in a future version of pandas. Pass `(name,)` instead of `name` to
silence this warning.
data_subset = grouped_data.get_group(pd_key)
11
[409]: sns.histplot (data=train, x='NumOfProducts', hue = 'Exited', bins = 40)
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1119: FutureWarning:
use_inf_as_na option is deprecated and will be removed in a future version.
Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1075: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple
to get_group in a future version of pandas. Pass `(name,)` instead of `name` to
silence this warning.
data_subset = grouped_data.get_group(pd_key)
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1075: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple
to get_group in a future version of pandas. Pass `(name,)` instead of `name` to
silence this warning.
data_subset = grouped_data.get_group(pd_key)
/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1075: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple
to get_group in a future version of pandas. Pass `(name,)` instead of `name` to
silence this warning.
data_subset = grouped_data.get_group(pd_key)
12
[409]: <Axes: xlabel='NumOfProducts', ylabel='Count'>
13
# Pipeline for the categorical variables: a single one-hot encoding step.
# handle_unknown='ignore' lets transform accept categories that were not
# seen during fit instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)
14
Confusion Matrix:
[[19366 1447]
[ 2551 3042]]
Classification Report:
precision recall f1-score support
# Refit the model on the full training data (X_train, y_train) and predict
# on the validation set (X_val).
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_val)
15
accuracy 0.85 26406
macro avg 0.78 0.74 0.75 26406
weighted avg 0.84 0.85 0.84 26406
16
# Refit the model on the full training data (X_train, y_train) and predict
# on the validation set (X_val).
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_val)
17
Logistic Regression Accuracy: 0.8328410209800803
Confusion Matrix:
[[19887 926]
[ 3488 2105]]
Classification Report:
precision recall f1-score support
# Refit the model on the full training data (X_train, y_train) and predict
# on the validation set (X_val).
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_val)
18
accuracy 0.83 26406
macro avg 0.77 0.67 0.69 26406
weighted avg 0.82 0.83 0.81 26406
19
# Refit the model on the full training data (X_train, y_train) and predict
# on the validation set (X_val).
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_val)
20
# Print the confusion matrix and the per-class classification report for the
# y_pred_rf predictions, compared against the validation labels y_val.
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred_rf))
print("Classification Report:\n", classification_report(y_val, y_pred_rf))
# Refit rf_model on the full training data (X_train, y_train) and predict on
# the validation set (X_val).
rf_model.fit(X_train, y_train)
# Store the predictions under their own name: the reports and the model
# comparison read `y_pred_rf`, and assigning to `y_pred_nb` here would
# silently overwrite the Naive Bayes predictions.
y_pred_rf = rf_model.predict(X_val)
21
0 0.88 0.96 0.92 20813
1 0.76 0.52 0.62 5593
22
[425]: #XGBClassifier test with cross validation
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy on the training data. Score xgb_model so the
# numbers describe the estimator that is fitted below, not another model's
# pipeline. NOTE(review): assumes xgb_model is already defined at this point
# (its definition is not visible in this excerpt) - confirm.
cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='accuracy')

# Refit xgb_model on the full training data (X_train, y_train) and predict on
# the validation set (X_val).
xgb_model.fit(X_train, y_train)
# Store the predictions under their own name: the model comparison reads
# `y_pred_xgb`, and assigning to `y_pred_nb` would overwrite the Naive Bayes
# predictions.
y_pred_xgb = xgb_model.predict(X_val)
23
[ ]: #LGBMClassifier test
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Refit lgbm_model on the full training data (X_train, y_train) and predict on
# the validation set (X_val).
lgbm_model.fit(X_train, y_train)
y_pred_lgbm = lgbm_model.predict(X_val)
[ ]: models = {
"KNN": y_pred_knn,
"Naive Bayes": y_pred_nb,
"Logistic Regression": y_pred_lr,
24
"Decision Tree": y_pred_dt,
"Random Forest": y_pred_rf,
"XGBClassifier": y_pred_xgb,
"LGBMClassifier": y_pred_lgbm
}
# Save the results to a CSV file (make sure the path is correct for the Kaggle environment).
# index=False keeps the DataFrame index out of the written file.
submission.to_csv('submission.csv', index=False)
25