Test 2
Test 2
In [ ]:
#Import all necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
In [ ]:
#load the dataset
df = pd.read_csv('../input/metaverse/metaverse.csv')
# Generate a random_state
random_state = np.random.randint(0, 10000)
# Show random_state
print(f"Random state used for train_test_split: {random_state}")
sending_address \
0 0x9d32d0bf2c00f41ce7ca01b66e174cc4dcb0c1da
1 0xd6e251c23cbf52dbd472f079147873e655d8096f
2 0x2e0925b922fed01f6a85d213ae2718f54b8ca305
3 0x93efefc25fcaf31d7695f28018d7a11ece55457f
4 0xad3b8de45d63f5cce28aef9a82cf30c397c6ceb9
5 0xa99b9a7f5c5dd37429771efd3b93c6fbe1ab2936
timestamp 0
hour_of_day 0
sending_address 0
receiving_address 0
amount 0
transaction_type 0
location_region 0
ip_prefix 0
login_frequency 0
session_duration 0
purchase_pattern 0
age_group 0
risk_score 0
anomaly 0
dtype: int64
In [ ]:
#Feature engineering
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['DayOfWeek'] = df['timestamp'].dt.dayofweek
df['DayOfMonth'] = df['timestamp'].dt.day
df['Month'] = df['timestamp'].dt.month
df['Quarter'] = df['timestamp'].dt.quarter
df['HourOfDay'] = df['timestamp'].dt.hour
Data Preprocessing
In [ ]:
# Total amount sent and received
amount_sent = df.groupby('sending_address')['amount'].sum().rename('TotalAmou
amount_received = df.groupby('receiving_address')['amount'].sum().rename('Tot
# Number of transactions
transactions_sent = df.groupby('sending_address').size().rename('NumTransacti
transactions_received = df.groupby('receiving_address').size().rename('NumTra
file:///Users/benjamin/Downloads/test_2.html 2/11
2024/6/23 20:26 test_2
In [ ]:
#normalization
numerical_features = ['amount', 'session_duration', 'TotalAmountSent', 'Total
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])
In [ ]:
# Example: Normalizing session duration
df['NormalizedSessionDuration'] = (df['session_duration'] - df['session_durat
Feature Engineering
In [ ]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
In [ ]:
#drop unnecessary columns
df = df.drop(columns=['timestamp', 'sending_address', 'receiving_address'])
In [ ]:
#Ensure all columns are numerical
print(df.dtypes)
hour_of_day int64
amount float64
login_frequency int64
session_duration float64
risk_score float64
anomaly int64
DayOfWeek int32
DayOfMonth int32
Month int32
Quarter int32
HourOfDay int32
TotalAmountSent float64
TotalAmountReceived float64
NumTransactionsSent float64
NumTransactionsReceived float64
NormalizedSessionDuration float64
transaction_type_phishing bool
transaction_type_purchase bool
transaction_type_sale bool
transaction_type_scam bool
transaction_type_transfer bool
location_region_Africa bool
location_region_Asia bool
location_region_Europe bool
location_region_North America bool
location_region_South America bool
ip_prefix_10.0 bool
ip_prefix_172.0 bool
ip_prefix_172.16 bool
ip_prefix_192.0 bool
ip_prefix_192.168 bool
purchase_pattern_focused bool
purchase_pattern_high_value bool
purchase_pattern_random bool
file:///Users/benjamin/Downloads/test_2.html 3/11
2024/6/23 20:26 test_2
age_group_established bool
age_group_new bool
age_group_veteran bool
dtype: object
In [ ]:
from sklearn.model_selection import train_test_split
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, rand
Feature importance
In [ ]:
# Feature importance
feature_importances = rf_model.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_impo
importance_df = importance_df.sort_values(by='Importance', ascending=False)
file:///Users/benjamin/Downloads/test_2.html 4/11
2024/6/23 20:26 test_2
Reshape the training and test data into the shape required by the LSTM model, that is, each
sample is regarded as a time step.
In [ ]:
# Build the LSTM model
lstm_model = Sequential()
lstm_model.add(Input(shape=(X_train_reshaped.shape[1], X_train_reshaped.shape
lstm_model.add(LSTM(128, return_sequences=True))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(64, return_sequences=False))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(32, activation='relu'))
lstm_model.add(Dense(len(label_encoder.classes_), activation='softmax'))
Compile the model, use the adam optimizer with the sparse_categorical_crossentropy loss
function, and evaluate the accuracy.
In [ ]:
# Train the model
history = lstm_model.fit(X_train_reshaped, y_train_encoded, epochs=10, batch_
Epoch 1/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 12s 5ms/step - accuracy: 0.9472 - loss: 0.1503
- val_accuracy: 1.0000 - val_loss: 7.3935e-04
Epoch 2/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 7s 5ms/step - accuracy: 0.9970 - loss: 0.0083
- val_accuracy: 1.0000 - val_loss: 1.7374e-04
Epoch 3/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 7s 5ms/step - accuracy: 0.9986 - loss: 0.0043
- val_accuracy: 1.0000 - val_loss: 1.6414e-04
Epoch 4/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 7s 5ms/step - accuracy: 0.9996 - loss: 0.0019
- val_accuracy: 0.9998 - val_loss: 6.6632e-04
Epoch 5/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 7s 5ms/step - accuracy: 0.9989 - loss: 0.0036
- val_accuracy: 1.0000 - val_loss: 2.3714e-04
Epoch 6/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 7s 5ms/step - accuracy: 0.9993 - loss: 0.0017
- val_accuracy: 1.0000 - val_loss: 2.5755e-05
Epoch 7/10
file:///Users/benjamin/Downloads/test_2.html 6/11
2024/6/23 20:26 test_2
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 7s 5ms/step - accuracy: 0.9997 - loss: 0.0011
- val_accuracy: 1.0000 - val_loss: 1.8103e-04
Epoch 8/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 8s 5ms/step - accuracy: 0.9994 - loss: 0.0021
- val_accuracy: 1.0000 - val_loss: 6.3186e-06
Epoch 9/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 7s 5ms/step - accuracy: 0.9995 - loss: 0.0017
- val_accuracy: 1.0000 - val_loss: 8.4541e-04
Epoch 10/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 7s 5ms/step - accuracy: 0.9998 - loss: 8.5634e
-04 - val_accuracy: 1.0000 - val_loss: 5.7090e-06
Train the model using the training data and validation data accounting for 20%. Train for 10
epochs with 32 samples per batch.
In [ ]:
# Predict
y_pred_lstm = lstm_model.predict(X_test_reshaped)
y_pred_classes = np.argmax(y_pred_lstm, axis=1)
# Evaluate
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_encoded, y_pred_classes))
print("\nClassification Report:")
print(classification_report(y_test_encoded, y_pred_classes))
Confusion Matrix:
[[ 1316 0 0]
[ 0 12690 0]
[ 0 0 1714]]
Classification Report:
precision recall f1-score support
file:///Users/benjamin/Downloads/test_2.html 7/11
2024/6/23 20:26 test_2
precision recall f1-score support
Decision Trees
In [ ]:
from sklearn.tree import DecisionTreeClassifier
XGBoost Model
In [ ]:
import xgboost as xgb
# Predictions
y_pred_xgb = best_xgb_model.predict(X_test)
# Evaluate
print("\nXGBoost Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))
print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))
In [ ]:
# Define the parameter grid for hyperparameter tuning
param_grid_rf = {
'n_estimators': [100, 200, 300],
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
param_grid_svm = {
'C': [0.1, 1, 10],
'gamma': [1, 0.1, 0.01],
'kernel': ['rbf', 'linear']
}
param_grid_dt = {
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
param_grid_xgb = {
'n_estimators': [100, 200, 300],
'max_depth': [3, 6, 9],
'learning_rate': [0.01, 0.1, 0.2],
'subsample': [0.7, 0.8, 0.9],
'colsample_bytree': [0.7, 0.8, 0.9]
}
best_model = grid_search.best_estimator_
score = best_model.score(X_test, y_test)
scores.append(score)
In [ ]:
# Random Forest
rf_model = RandomForestClassifier(random_state=random_state)
file:///Users/benjamin/Downloads/test_2.html 9/11
2024/6/23 20:26 test_2
rf_mean_score, rf_std_score = cross_val_evaluate_model_with_progress(rf_model
print(f"Random Forest CV Mean Accuracy: {rf_mean_score:.4f} ± {rf_std_score:.
In [ ]:
# Decision Tree
dt_model = DecisionTreeClassifier(random_state=random_state)
dt_mean_score, dt_std_score = cross_val_evaluate_model_with_progress(dt_model
print(f"Decision Tree CV Mean Accuracy: {dt_mean_score:.4f} ± {dt_std_score:.
In [ ]:
# Support Vector Machine
svm_model = SVC(probability=True, random_state=random_state)
# svm_model = SVC(random_state=random_state)
svm_mean_score, svm_std_score = cross_val_evaluate_model_with_progress(svm_mo
print(f"SVM CV Mean Accuracy: {svm_mean_score:.4f} ± {svm_std_score:.4f}")
In [ ]:
# XGBoost Model
xgb_model = XGBClassifier(random_state=random_state)
xgb_mean_score, xgb_std_score = cross_val_evaluate_model_with_progress(xgb_mo
print(f"XGBoost CV Mean Accuracy: {xgb_mean_score:.4f} ± {xgb_std_score:.4f}"
In [ ]:
# LSTM Model
def create_lstm_model():
model = Sequential()
model.add(Input(shape=(1, X_train_reshaped.shape[2]))) # Adjust the inpu
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid')) # Adjust the output layer
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc
return model
# Perform cross-validation
lstm_cv_results = cross_val_score(lstm_keras_model, X_train_reshaped, y_train
In [ ]:
file:///Users/benjamin/Downloads/test_2.html 11/11