0% found this document useful (0 votes)

42 views11 pages

Test 2

Uploaded by

benjaminxin11

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

42 views11 pages

Test 2

Uploaded by

benjaminxin11

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 11

2024/6/23 20:26 test_2

In [ ]:
#Import all necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from xgboost import XGBClassifier
from scikeras.wrappers import KerasClassifier

2024-06-22 22:58:14.363232: E external/local_xla/xla/stream_executor/cuda/cud

a_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register facto
ry for plugin cuDNN when one has already been registered
2024-06-22 22:58:14.363349: E external/local_xla/xla/stream_executor/cuda/cud
a_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor
y for plugin cuFFT when one has already been registered
2024-06-22 22:58:14.493737: E external/local_xla/xla/stream_executor/cuda/cud
a_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register fac
tory for plugin cuBLAS when one has already been registered

In [ ]:
#load the dataset
df = pd.read_csv('../input/metaverse/metaverse.csv')

# Generate a random_state
random_state = np.random.randint(0, 10000)

# Show random_state
print(f"Random state used for train_test_split: {random_state}")

#chek the data output

print(df.head(6))

Random state used for train_test_split: 6828

timestamp hour_of_day \
0 2022-04-11 12:47:27 12
1 2022-06-14 19:12:46 19
2 2022-01-18 16:26:59 16
3 2022-06-15 09:20:04 9
4 2022-02-18 14:35:30 14
5 2022-04-05 19:05:44 19

sending_address \
0 0x9d32d0bf2c00f41ce7ca01b66e174cc4dcb0c1da
1 0xd6e251c23cbf52dbd472f079147873e655d8096f
2 0x2e0925b922fed01f6a85d213ae2718f54b8ca305
3 0x93efefc25fcaf31d7695f28018d7a11ece55457f
4 0xad3b8de45d63f5cce28aef9a82cf30c397c6ceb9
5 0xa99b9a7f5c5dd37429771efd3b93c6fbe1ab2936

receiving_address amount transaction_type \

0 0x39f82e1c09bc6d7baccc1e79e5621ff812f50572 796.949206 transfer
1 0x51e8fbe24f124e0e30a614e14401b9bbfed5384c 0.010000 purchase
2 0x52c7911879f783d590af45bda0c0ef2b8536706f 778.197390 purchase
3 0x8ac3b7bd531b3a833032f07d4e47c7af6ea7bace 300.838358 transfer
4 0x6fdc047c2391615b3facd79b4588c7e9106e49f2 775.569344 sale
file:///Users/benjamin/Downloads/test_2.html 1/11
2024/6/23 20:26 test_2
5 0x5a78c88c5fc1e9b512f6c64e266b46a9db0a7238 590.253982 transfer

location_region ip_prefix login_frequency session_duration \

0 Europe 192.000 3 48
1 South America 172.000 5 61
2 Asia 192.168 3 74
3 South America 172.000 8 111
4 Africa 172.160 6 100
5 Africa 192.168 4 66

purchase_pattern age_group risk_score anomaly

0 focused established 18.75 low_risk
1 focused established 25.00 low_risk
2 focused established 31.25 low_risk
3 high_value veteran 36.75 low_risk
4 high_value veteran 62.50 moderate_risk
5 focused established 15.75 low_risk
Check for missing values
In [ ]:
# Check for missing values
print(df.isnull().sum())

timestamp 0
hour_of_day 0
sending_address 0
receiving_address 0
amount 0
transaction_type 0
location_region 0
ip_prefix 0
login_frequency 0
session_duration 0
purchase_pattern 0
age_group 0
risk_score 0
anomaly 0
dtype: int64

In [ ]:
#Feature engineering
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['DayOfWeek'] = df['timestamp'].dt.dayofweek
df['DayOfMonth'] = df['timestamp'].dt.day
df['Month'] = df['timestamp'].dt.month
df['Quarter'] = df['timestamp'].dt.quarter
df['HourOfDay'] = df['timestamp'].dt.hour

Data Preprocessing
In [ ]:
# Total amount sent and received
amount_sent = df.groupby('sending_address')['amount'].sum().rename('TotalAmou
amount_received = df.groupby('receiving_address')['amount'].sum().rename('Tot

# Number of transactions
transactions_sent = df.groupby('sending_address').size().rename('NumTransacti
transactions_received = df.groupby('receiving_address').size().rename('NumTra

# Merging these features back into the dataframe

df = df.join(amount_sent, on='sending_address')
df = df.join(amount_received, on='receiving_address')
df = df.join(transactions_sent, on='sending_address')
df = df.join(transactions_received, on='receiving_address')

file:///Users/benjamin/Downloads/test_2.html 2/11
2024/6/23 20:26 test_2

In [ ]:
#normalization
numerical_features = ['amount', 'session_duration', 'TotalAmountSent', 'Total
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [ ]:
# Example: Normalizing session duration
df['NormalizedSessionDuration'] = (df['session_duration'] - df['session_durat

Feature Engineering
In [ ]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encode the target variable 'Anomaly'

label_encoder = LabelEncoder()
df['anomaly'] = label_encoder.fit_transform(df['anomaly'])

# One-hot encoding for other categorical features

categorical_features = ['transaction_type', 'location_region', 'ip_prefix', '
df = pd.get_dummies(df, columns=categorical_features)

In [ ]:
#drop unnecessary columns
df = df.drop(columns=['timestamp', 'sending_address', 'receiving_address'])

In [ ]:
#Ensure all columns are numerical
print(df.dtypes)

hour_of_day int64
amount float64
login_frequency int64
session_duration float64
risk_score float64
anomaly int64
DayOfWeek int32
DayOfMonth int32
Month int32
Quarter int32
HourOfDay int32
TotalAmountSent float64
TotalAmountReceived float64
NumTransactionsSent float64
NumTransactionsReceived float64
NormalizedSessionDuration float64
transaction_type_phishing bool
transaction_type_purchase bool
transaction_type_sale bool
transaction_type_scam bool
transaction_type_transfer bool
location_region_Africa bool
location_region_Asia bool
location_region_Europe bool
location_region_North America bool
location_region_South America bool
ip_prefix_10.0 bool
ip_prefix_172.0 bool
ip_prefix_172.16 bool
ip_prefix_192.0 bool
ip_prefix_192.168 bool
purchase_pattern_focused bool
purchase_pattern_high_value bool
purchase_pattern_random bool

file:///Users/benjamin/Downloads/test_2.html 3/11
2024/6/23 20:26 test_2
age_group_established bool
age_group_new bool
age_group_veteran bool
dtype: object

In [ ]:
from sklearn.model_selection import train_test_split

# Features and target variable

X = df.drop(columns=['anomaly'])
y = df['anomaly']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, rand

Function to evaluate model performance using cross-validation

In [ ]:
def cross_val_evaluate_model(model, X, y, cv_folds=5):
kf = KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
return np.mean(scores), np.std(scores)

Random Forest Model

In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Train a RandomForest classifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=random_state
# Train and evaluate on test set
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("\nRandom Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Confusion Matrix:

[[ 1316 0 0]
[ 0 12690 0]
[ 0 0 1714]]

Random Forest Classification Report:

precision recall f1-score support

0 1.00 1.00 1.00 1316

1 1.00 1.00 1.00 12690
2 1.00 1.00 1.00 1714

accuracy 1.00 15720

macro avg 1.00 1.00 1.00 15720
weighted avg 1.00 1.00 1.00 15720

Feature importance
In [ ]:
# Feature importance
feature_importances = rf_model.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_impo
importance_df = importance_df.sort_values(by='Importance', ascending=False)

file:///Users/benjamin/Downloads/test_2.html 4/11
2024/6/23 20:26 test_2

# Plot feature importance

plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importance')
plt.show()
plt.savefig('FeatureImportance_RF.png')

<Figure size 640x480 with 0 Axes>

Feature selection is the machine learning step that determines and selects a subset of the
most essential features from the input variables. This will help us enhance model
performance, interpretability, efficiency, and generalizability. The following are related to the
feature importance techniques applied in this study.
Perhaps the most significant technique for feature importance is determining the
contribution — or relevance — of every feature in a machine-learning model.
For more straightforward interpretation, it helps to elicit the most important, for instance,
those with the most critical effect on the model's prediction. Typically, importance scores for
a given feature are computed as a numerical value by which the feature reduces impurity or
error in the model when selected in the splitting decision. The higher the importance score,
the better that feature influences making an accurate model prediction. The bar plot below
represents the importance scores that vary for the different features used for our dataset.
The importance scores of the features from the below plot show that the risk_score,
transaction_type_scams, transaction_type_sales, and transaction_type_phishing get the
maximum importance scores; very likely, they make the most considerable influence over
the model's predictions. The model's predictions would show the maximum impact from all
the other features, such as regions in Asia or South America, and then some of the features
like ip_prefix, which draw minimal importance scores, imply minimal effect over the model.
LSTM Model
In [ ]:
# Encode labels as integers
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
file:///Users/benjamin/Downloads/test_2.html 5/11
2024/6/23 20:26 test_2

# Convert data to float32

X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train_encoded = y_train_encoded.astype(np.float32)
y_test_encoded = y_test_encoded.astype(np.float32)

# Reshape data for LSTM model: (samples, time steps, features)

X_train_reshaped = np.expand_dims(X_train, axis=1)
X_test_reshaped = np.expand_dims(X_test, axis=1)

Reshape the training and test data into the shape required by the LSTM model, that is, each
sample is regarded as a time step.
In [ ]:
# Build the LSTM model
lstm_model = Sequential()
lstm_model.add(Input(shape=(X_train_reshaped.shape[1], X_train_reshaped.shape
lstm_model.add(LSTM(128, return_sequences=True))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(64, return_sequences=False))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(32, activation='relu'))
lstm_model.add(Dense(len(label_encoder.classes_), activation='softmax'))

Build LSTM model:

The first LSTM layer has 128 units and returns a sequence output. A Dropout layer is added
to reduce overfitting. The second LSTM layer has 64 units and does not return a sequence.
Another Dropout layer is added. The Dense layer has 32 neurons and uses the ReLU
activation function. The output layer uses the softmax activation function according to the
number of labels.
In [ ]:
# Compile the model
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',

Compile the model, use the adam optimizer with the sparse_categorical_crossentropy loss
function, and evaluate the accuracy.
In [ ]:
# Train the model
history = lstm_model.fit(X_train_reshaped, y_train_encoded, epochs=10, batch_

Epoch 1/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 12s 5ms/step - accuracy: 0.9472 - loss: 0.1503
- val_accuracy: 1.0000 - val_loss: 7.3935e-04
Epoch 2/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 7s 5ms/step - accuracy: 0.9970 - loss: 0.0083
- val_accuracy: 1.0000 - val_loss: 1.7374e-04
Epoch 3/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 7s 5ms/step - accuracy: 0.9986 - loss: 0.0043
- val_accuracy: 1.0000 - val_loss: 1.6414e-04
Epoch 4/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 7s 5ms/step - accuracy: 0.9996 - loss: 0.0019
- val_accuracy: 0.9998 - val_loss: 6.6632e-04
Epoch 5/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 7s 5ms/step - accuracy: 0.9989 - loss: 0.0036
- val_accuracy: 1.0000 - val_loss: 2.3714e-04
Epoch 6/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 7s 5ms/step - accuracy: 0.9993 - loss: 0.0017
- val_accuracy: 1.0000 - val_loss: 2.5755e-05
Epoch 7/10
file:///Users/benjamin/Downloads/test_2.html 6/11
2024/6/23 20:26 test_2
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 7s 5ms/step - accuracy: 0.9997 - loss: 0.0011
- val_accuracy: 1.0000 - val_loss: 1.8103e-04
Epoch 8/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 8s 5ms/step - accuracy: 0.9994 - loss: 0.0021
- val_accuracy: 1.0000 - val_loss: 6.3186e-06
Epoch 9/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 7s 5ms/step - accuracy: 0.9995 - loss: 0.0017
- val_accuracy: 1.0000 - val_loss: 8.4541e-04
Epoch 10/10
1572/1572 ━━━━━━━━━━━━━━━━━━━━ 7s 5ms/step - accuracy: 0.9998 - loss: 8.5634e
-04 - val_accuracy: 1.0000 - val_loss: 5.7090e-06
Train the model using the training data and validation data accounting for 20%. Train for 10
epochs with 32 samples per batch.
In [ ]:
# Predict
y_pred_lstm = lstm_model.predict(X_test_reshaped)
y_pred_classes = np.argmax(y_pred_lstm, axis=1)

# Evaluate
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_encoded, y_pred_classes))

print("\nClassification Report:")
print(classification_report(y_test_encoded, y_pred_classes))

492/492 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step

Confusion Matrix:
[[ 1316 0 0]
[ 0 12690 0]
[ 0 0 1714]]

Classification Report:
precision recall f1-score support

0.0 1.00 1.00 1.00 1316

1.0 1.00 1.00 1.00 12690
2.0 1.00 1.00 1.00 1714

accuracy 1.00 15720

macro avg 1.00 1.00 1.00 15720
weighted avg 1.00 1.00 1.00 15720

SVM Support Vector Machine

In [ ]:
from sklearn.svm import SVC
# Train an SVM classifier
svm_model = SVC(probability=True, random_state=random_state)

# Train and evaluate on test set

svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print("\nSVM Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))
print("\nSVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

SVM Confusion Matrix:

[[ 1316 0 0]
[ 0 12690 0]
[ 0 0 1714]]

SVM Classification Report:

file:///Users/benjamin/Downloads/test_2.html 7/11
2024/6/23 20:26 test_2
precision recall f1-score support

0 1.00 1.00 1.00 1316

1 1.00 1.00 1.00 12690
2 1.00 1.00 1.00 1714

accuracy 1.00 15720

macro avg 1.00 1.00 1.00 15720
weighted avg 1.00 1.00 1.00 15720

Decision Trees
In [ ]:
from sklearn.tree import DecisionTreeClassifier

# Train a Decision Tree classifier

dt_model = DecisionTreeClassifier(random_state=random_state)

# Train and evaluate on test set

dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
print("\nDecision Tree Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))
print("\nDecision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))

Decision Tree Confusion Matrix:

[[ 1316 0 0]
[ 0 12690 0]
[ 0 0 1714]]

Decision Tree Classification Report:

precision recall f1-score support

0 1.00 1.00 1.00 1316

1 1.00 1.00 1.00 12690
2 1.00 1.00 1.00 1714

accuracy 1.00 15720

macro avg 1.00 1.00 1.00 15720
weighted avg 1.00 1.00 1.00 15720

XGBoost Model
In [ ]:
import xgboost as xgb

# Train the best XGBoost model and evaluate on test set

best_xgb_model = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0
best_xgb_model.fit(X_train, y_train)

# Predictions
y_pred_xgb = best_xgb_model.predict(X_test)

# Evaluate
print("\nXGBoost Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))
print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))

XGBoost Confusion Matrix:

[[ 1316 0 0]
[ 0 12690 0]
[ 0 0 1714]]

XGBoost Classification Report:

file:///Users/benjamin/Downloads/test_2.html 8/11
2024/6/23 20:26 test_2
precision recall f1-score support

0 1.00 1.00 1.00 1316

1 1.00 1.00 1.00 12690
2 1.00 1.00 1.00 1714

accuracy 1.00 15720

macro avg 1.00 1.00 1.00 15720
weighted avg 1.00 1.00 1.00 15720

In [ ]:
# Define the parameter grid for hyperparameter tuning
param_grid_rf = {
'n_estimators': [100, 200, 300],
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}

param_grid_svm = {
'C': [0.1, 1, 10],
'gamma': [1, 0.1, 0.01],
'kernel': ['rbf', 'linear']
}

param_grid_dt = {
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}

param_grid_xgb = {
'n_estimators': [100, 200, 300],
'max_depth': [3, 6, 9],
'learning_rate': [0.01, 0.1, 0.2],
'subsample': [0.7, 0.8, 0.9],
'colsample_bytree': [0.7, 0.8, 0.9]
}

# Define the cross-validation strategy

outer_cv = KFold(n_splits=5, shuffle=True, random_state=random_state)
inner_cv = KFold(n_splits=3, shuffle=True, random_state=random_state)

# Define the cross-validation evaluation function with progress bar

def cross_val_evaluate_model_with_progress(model, X, y, param_grid):
outer_cv_splits = list(outer_cv.split(X))
scores = []

for train_index, test_index in tqdm(outer_cv_splits, desc="Outer CV Progr

X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
score = best_model.score(X_test, y_test)
scores.append(score)

return np.mean(scores), np.std(scores)

In [ ]:
# Random Forest
rf_model = RandomForestClassifier(random_state=random_state)

file:///Users/benjamin/Downloads/test_2.html 9/11
2024/6/23 20:26 test_2
rf_mean_score, rf_std_score = cross_val_evaluate_model_with_progress(rf_model
print(f"Random Forest CV Mean Accuracy: {rf_mean_score:.4f} ± {rf_std_score:.

Outer CV Progress: 100%|██████████| 5/5 [47:25<00:00, 569.03s/it]

Random Forest CV Mean Accuracy: 1.0000 ± 0.0000

In [ ]:
# Decision Tree
dt_model = DecisionTreeClassifier(random_state=random_state)
dt_mean_score, dt_std_score = cross_val_evaluate_model_with_progress(dt_model
print(f"Decision Tree CV Mean Accuracy: {dt_mean_score:.4f} ± {dt_std_score:.

Outer CV Progress: 100%|██████████| 5/5 [00:16<00:00, 3.22s/it]

Decision Tree CV Mean Accuracy: 1.0000 ± 0.0000

In [ ]:
# Support Vector Machine
svm_model = SVC(probability=True, random_state=random_state)
# svm_model = SVC(random_state=random_state)
svm_mean_score, svm_std_score = cross_val_evaluate_model_with_progress(svm_mo
print(f"SVM CV Mean Accuracy: {svm_mean_score:.4f} ± {svm_std_score:.4f}")

Outer CV Progress: 0%| | 0/5 [00:00<?, ?it/s]/opt/conda/lib/python

In [ ]:
# XGBoost Model
xgb_model = XGBClassifier(random_state=random_state)
xgb_mean_score, xgb_std_score = cross_val_evaluate_model_with_progress(xgb_mo
print(f"XGBoost CV Mean Accuracy: {xgb_mean_score:.4f} ± {xgb_std_score:.4f}"

Outer CV Progress: 0%| | 0/5 [00:00<?, ?it/s]/opt/conda/lib/python

3.10/site-packages/joblib/externals/loky/backend/fork_exec.py:38: RuntimeWarn
ing: os.fork() was called. os.fork() is incompatible with multithreaded code,
and JAX is multithreaded, so this will likely lead to a deadlock.
pid = os.fork()
Outer CV Progress: 100%|██████████| 5/5 [1:46:14<00:00, 1274.83s/it]
XGBoost CV Mean Accuracy: 1.0000 ± 0.0000

In [ ]:
# LSTM Model
def create_lstm_model():
model = Sequential()
model.add(Input(shape=(1, X_train_reshaped.shape[2]))) # Adjust the inpu
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid')) # Adjust the output layer
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc
return model

# Create a KerasClassifier for the LSTM model

lstm_keras_model = KerasClassifier(model=create_lstm_model, epochs=10, batch_

# Perform cross-validation
lstm_cv_results = cross_val_score(lstm_keras_model, X_train_reshaped, y_train

# Calculate mean and standard deviation of cross-validation results

lstm_mean_score = np.mean(lstm_cv_results)
file:///Users/benjamin/Downloads/test_2.html 10/11
2024/6/23 20:26 test_2
lstm_std_score = np.std(lstm_cv_results)
print(f"LSTM CV Mean Accuracy: {lstm_mean_score:.4f} ± {lstm_std_score:.4f}")

In [ ]:

file:///Users/benjamin/Downloads/test_2.html 11/11

FRA Milestone 1 Jupyter Notebook PDF
100% (3)
FRA Milestone 1 Jupyter Notebook PDF
42 pages
Laboratory Quality Control
50% (2)
Laboratory Quality Control
19 pages
DataMiningProjectProblem1 Clustering
100% (4)
DataMiningProjectProblem1 Clustering
20 pages
Machine Learning - Project
80% (10)
Machine Learning - Project
14 pages
Data Mining - Project
100% (2)
Data Mining - Project
11 pages
Observation: As We Can See We Have Threwe Types of Datatypes I.E. (Int, Float, Object) That Means We Have Both Categorical and Numerical Data
No ratings yet
Observation: As We Can See We Have Threwe Types of Datatypes I.E. (Int, Float, Object) That Means We Have Both Categorical and Numerical Data
2 pages
Online Payments Fraud Detection Documentation
No ratings yet
Online Payments Fraud Detection Documentation
40 pages
Credit Card Fraud Detection V29.Ipynb
No ratings yet
Credit Card Fraud Detection V29.Ipynb
976 pages
Phase 2 New
No ratings yet
Phase 2 New
14 pages
IBM Credit Card Fraud Detection
No ratings yet
IBM Credit Card Fraud Detection
12 pages
Inspection Report Shore Quantity Report Ullage Report Time Sheet / Time Log Sample Report Quality Report
No ratings yet
Inspection Report Shore Quantity Report Ullage Report Time Sheet / Time Log Sample Report Quality Report
21 pages
Online Payment Fraud Detection - Ipynb
No ratings yet
Online Payment Fraud Detection - Ipynb
120 pages
Customer Churn Syntax
No ratings yet
Customer Churn Syntax
66 pages
AML Project LearnerNotebook LowCode
No ratings yet
AML Project LearnerNotebook LowCode
74 pages
Credit Card Analysis
No ratings yet
Credit Card Analysis
95 pages
Matplotlib Library in Python
No ratings yet
Matplotlib Library in Python
85 pages
Customer Segmentation in Python
No ratings yet
Customer Segmentation in Python
71 pages
Chapter 2 Exercises
No ratings yet
Chapter 2 Exercises
62 pages
Clustering
No ratings yet
Clustering
53 pages
Price Utilization and Expense Model
No ratings yet
Price Utilization and Expense Model
52 pages
Module 9 Seaborn - Loans MSIS2407 20241113 Filled
No ratings yet
Module 9 Seaborn - Loans MSIS2407 20241113 Filled
38 pages
New Green Field School
No ratings yet
New Green Field School
33 pages
EDA Process For Shopify Sales Data
No ratings yet
EDA Process For Shopify Sales Data
35 pages
Online Bank Portal PROJECT
No ratings yet
Online Bank Portal PROJECT
29 pages
AIML Lab Ex 3-5 - 1
No ratings yet
AIML Lab Ex 3-5 - 1
31 pages
Machine Learning Lab
No ratings yet
Machine Learning Lab
20 pages
Design Report of A Go Kart Vehicle
No ratings yet
Design Report of A Go Kart Vehicle
8 pages
Fraud Transaction Prediction
No ratings yet
Fraud Transaction Prediction
26 pages
Psychology and Other Disciplines
No ratings yet
Psychology and Other Disciplines
5 pages
Project Ip
No ratings yet
Project Ip
20 pages
Data Mining Project - Brahma Chari
No ratings yet
Data Mining Project - Brahma Chari
23 pages
Telecom Dataset Output
No ratings yet
Telecom Dataset Output
34 pages
Credit-Card - Notebooks - Preprocessed-Data - Data - Preprocessing - Ipynb at Main Shubhamdongarjal - Credit-Card
No ratings yet
Credit-Card - Notebooks - Preprocessed-Data - Data - Preprocessing - Ipynb at Main Shubhamdongarjal - Credit-Card
15 pages
Computer Project Final - pdf2
No ratings yet
Computer Project Final - pdf2
22 pages
A Project Report On Bank Management System
No ratings yet
A Project Report On Bank Management System
20 pages
Aiml 1-10
No ratings yet
Aiml 1-10
29 pages
Daily Transactions Problem Statement
No ratings yet
Daily Transactions Problem Statement
27 pages
Danmairo - Analysis - Ipynb - Colaboratory
No ratings yet
Danmairo - Analysis - Ipynb - Colaboratory
18 pages
Customer Churn Prediction
No ratings yet
Customer Churn Prediction
16 pages
Hands-On Activity 3.3 Random Forest Mantaring - Ipynb - Mantaring
No ratings yet
Hands-On Activity 3.3 Random Forest Mantaring - Ipynb - Mantaring
13 pages
Capstone Report: FIRST NAME: Gopalakrishnan LAST NAME: Kalarikovilagam Subramanian M12821535
No ratings yet
Capstone Report: FIRST NAME: Gopalakrishnan LAST NAME: Kalarikovilagam Subramanian M12821535
17 pages
Customer Segmentation PDF
No ratings yet
Customer Segmentation PDF
18 pages
EDA Plots Code
No ratings yet
EDA Plots Code
13 pages
Credit Card Fraud Detection With CNN 99 Accuracy
No ratings yet
Credit Card Fraud Detection With CNN 99 Accuracy
12 pages
Task 1 Vijaya Lakshman PDF
No ratings yet
Task 1 Vijaya Lakshman PDF
10 pages
Eny Eg LH5 PP LWRJQ AJCb 8 S65 HT0 Ty 8 Q
No ratings yet
Eny Eg LH5 PP LWRJQ AJCb 8 S65 HT0 Ty 8 Q
9 pages
MSML Project 1
No ratings yet
MSML Project 1
8 pages
Data Mining Ex1
No ratings yet
Data Mining Ex1
10 pages
Financial Fraud Detection Using Machine Learning
No ratings yet
Financial Fraud Detection Using Machine Learning
9 pages
Sales Data Clustering
No ratings yet
Sales Data Clustering
15 pages
AA
No ratings yet
AA
6 pages
"Normal" "Fraud": #Check For Any Null Values
No ratings yet
"Normal" "Fraud": #Check For Any Null Values
7 pages
Aosdijfpqoiew
No ratings yet
Aosdijfpqoiew
6 pages
Fraud Transaction Detection - Ipynb - Colab - Rameshkumar
No ratings yet
Fraud Transaction Detection - Ipynb - Colab - Rameshkumar
7 pages
Python Applications
No ratings yet
Python Applications
8 pages
Credit Card Fraud Detection
No ratings yet
Credit Card Fraud Detection
8 pages
Task 2 Exploratory Data Analysis
No ratings yet
Task 2 Exploratory Data Analysis
5 pages
Lab 1 ML
No ratings yet
Lab 1 ML
2 pages
Q1 Q2 Merged
No ratings yet
Q1 Q2 Merged
4 pages
Credit Card Fraud Analysis Ashutosh
No ratings yet
Credit Card Fraud Analysis Ashutosh
3 pages
Data Preprocessing 2
No ratings yet
Data Preprocessing 2
5 pages
Assignment 1 PYTHON
No ratings yet
Assignment 1 PYTHON
2 pages
The Three Lines of Defence: Audit Committee Institute
No ratings yet
The Three Lines of Defence: Audit Committee Institute
4 pages
98 - Improving Rutting Resistance Using Geosynthetics
No ratings yet
98 - Improving Rutting Resistance Using Geosynthetics
5 pages
Quality Work Life
No ratings yet
Quality Work Life
12 pages
Date Reference Description Valuedate Deposit Withdrawal Balance
No ratings yet
Date Reference Description Valuedate Deposit Withdrawal Balance
26 pages
1941 - National Building Code of Canada
No ratings yet
1941 - National Building Code of Canada
432 pages
Storage Tank Protection Using VCI 2
No ratings yet
Storage Tank Protection Using VCI 2
9 pages
ESTANERO - April 12 LP Mam Duenas
No ratings yet
ESTANERO - April 12 LP Mam Duenas
6 pages
Science: Quarter 2 - 3 Where D O I C O Mef Rom ?
No ratings yet
Science: Quarter 2 - 3 Where D O I C O Mef Rom ?
23 pages
Final Exam For Handle Mail
No ratings yet
Final Exam For Handle Mail
2 pages
Reg 216 - B520
No ratings yet
Reg 216 - B520
24 pages
MAD111 - Chap 1
No ratings yet
MAD111 - Chap 1
237 pages
Design and Optimization of Spur Gear: Second Review
No ratings yet
Design and Optimization of Spur Gear: Second Review
44 pages
Step FOPDT Lengkap
No ratings yet
Step FOPDT Lengkap
110 pages
Chapter 1 THE PROBLEM AND ITS BACKGROUND
No ratings yet
Chapter 1 THE PROBLEM AND ITS BACKGROUND
10 pages
Essay and Elocution Competition
No ratings yet
Essay and Elocution Competition
1 page
Beige Aesthetic Modern Business Plan A4 Document
No ratings yet
Beige Aesthetic Modern Business Plan A4 Document
22 pages
VOCALOID 6 Reference Manual ENG
No ratings yet
VOCALOID 6 Reference Manual ENG
88 pages
Molo District Health Center: AP (Pre-Natal)
No ratings yet
Molo District Health Center: AP (Pre-Natal)
2 pages
OD328816327605052100
No ratings yet
OD328816327605052100
1 page
Software Verification & Validation
No ratings yet
Software Verification & Validation
18 pages
Lab 12 Eca2 Version Modif
No ratings yet
Lab 12 Eca2 Version Modif
13 pages
28 October 2023 Current Affairs English
No ratings yet
28 October 2023 Current Affairs English
11 pages
Offline Schedule-Siioc2023 Version2
No ratings yet
Offline Schedule-Siioc2023 Version2
5 pages
Daftar Referensi Jurnal Enzim1
No ratings yet
Daftar Referensi Jurnal Enzim1
7 pages
Contourline / Pureline Warming Drawer: 8 Shown Above: Esw 6114
No ratings yet
Contourline / Pureline Warming Drawer: 8 Shown Above: Esw 6114
5 pages
Curriculum Vitae Indra Hermawan
No ratings yet
Curriculum Vitae Indra Hermawan
1 page
Advanced Multiplayer Game Development with Ureal Engine 5: A Comprehensive Guide to C++ Scripting
From Everand
Advanced Multiplayer Game Development with Ureal Engine 5: A Comprehensive Guide to C++ Scripting
Vladimir Kiselev
No ratings yet
ChatGPT for Business: Strategies for Success
From Everand
ChatGPT for Business: Strategies for Success
Matthew C. Smith
No ratings yet

Test 2

Uploaded by

Test 2

Uploaded by

2024/6/23 20:26 test_2

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val

2024-06-22 22:58:14.363232: E external/local_xla/xla/stream_executor/cuda/cud

#chek the data output

Random state used for train_test_split: 6828

receiving_address amount transaction_type \

location_region ip_prefix login_frequency session_duration \

purchase_pattern age_group risk_score anomaly

# Merging these features back into the dataframe

# Encode the target variable 'Anomaly'

# One-hot encoding for other categorical features

# Features and target variable

Function to evaluate model performance using cross-validation

Random Forest Model

# Train a RandomForest classifier

Random Forest Confusion Matrix:

Random Forest Classification Report:

0 1.00 1.00 1.00 1316

accuracy 1.00 15720

# Plot feature importance

<Figure size 640x480 with 0 Axes>

# Convert data to float32

# Reshape data for LSTM model: (samples, time steps, features)

Build LSTM model:

492/492 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step

0.0 1.00 1.00 1.00 1316

accuracy 1.00 15720

SVM Support Vector Machine

# Train and evaluate on test set

SVM Confusion Matrix:

SVM Classification Report:

0 1.00 1.00 1.00 1316

accuracy 1.00 15720

# Train a Decision Tree classifier

# Train and evaluate on test set

Decision Tree Confusion Matrix:

Decision Tree Classification Report:

0 1.00 1.00 1.00 1316

accuracy 1.00 15720

# Train the best XGBoost model and evaluate on test set

XGBoost Confusion Matrix:

XGBoost Classification Report:

0 1.00 1.00 1.00 1316

accuracy 1.00 15720

# Define the cross-validation strategy

# Define the cross-validation evaluation function with progress bar

for train_index, test_index in tqdm(outer_cv_splits, desc="Outer CV Progr

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv

return np.mean(scores), np.std(scores)

Outer CV Progress: 100%|██████████| 5/5 [47:25<00:00, 569.03s/it]

Outer CV Progress: 100%|██████████| 5/5 [00:16<00:00, 3.22s/it]

Outer CV Progress: 0%| | 0/5 [00:00<?, ?it/s]/opt/conda/lib/python

Outer CV Progress: 0%| | 0/5 [00:00<?, ?it/s]/opt/conda/lib/python

# Create a KerasClassifier for the LSTM model

# Calculate mean and standard deviation of cross-validation results

You might also like