Sibi 5
Sibi 5
– TIRUCHIRAPALLI
Submitted by
Introduction:
Project Objectives:
System Requirements:
Data:
• Features: step, type, amount, nameOrig, oldbalanceOrg, newbalanceOrig
Hardware:
Software:
• Operating System: Windows 10 (64-bit), macOS, or Linux (e.g.,
Ubuntu)
• Python (version 3.6 or later): https://www.python.org/downloads/
Python Libraries:
Methodology:
Data Preprocessing:
1. Data Cleaning:
• Identify missing values in the dataset.
• Decide on appropriate strategies to handle missing values,
such as imputation (mean, median, mode) or removal of
records with missing data.
• Detect outliers using statistical methods (e.g., Z-score, IQR).
• Handle outliers by either capping values, transforming them,
or removing the outlier data points if necessary.
2. Data Transformation:
• Normalize numerical features to a standard range (e.g., 0 to
1) or standardize them to have a mean of 0 and standard
deviation of 1.
• Convert categorical variables into numerical form using
techniques such as one-hot encoding or label encoding.
3. Data Splitting:
• Split the dataset into training and testing sets (e.g., 80%
training, 20% testing) to evaluate model performance.
• Further split the training set into a validation set to fine-tune
model parameters and prevent overfitting.
4. Feature Engineering:
6. Data Augmentation:
7. Data Integration:
8. Data Annotation:
Existing work:
Proposed Work:
The core of the project involves the selection and training of machine learning
models. We will leverage a traditional yet advanced algorithm called
Random Forest. We have enhanced the AI model to determine whether a
transaction is fraudulent based on the user's recent transactions with other users
and on the complaints registered by them.
Flow Chart:
Implementation:
# Data description: load the transactions CSV and inspect it.
import pandas as pd
import numpy as np
data=pd.read_csv("/content/fraud1.csv")
# First rows, last rows, column/dtype summary, and descriptive statistics.
# These are bare expressions: they display output only in an interactive
# notebook cell, not when run as a plain script.
data.head()
data.tail()
data.info()
data.describe()
# Null data handling
# Boolean masks of missing / present cells, then the missing count per column.
data.isnull()
data.notnull()
data.isnull().sum()
# NOTE(review): dropna() and fillna(0) return new DataFrames by default and the
# results are not assigned, so `data` itself is left unchanged by these lines.
data.dropna()
data.fillna(0)
# Data validation: list the distinct values found in selected columns.
data["type"].unique()
data["oldbalanceOrg"].unique()
data["isFraud"].unique()
# Data reshaping
# stack() moves the column labels into an inner index level, giving a long Series.
df_stacked=data.stack()
print(df_stacked.head(10))
# unstack() pivots that inner index level back out into columns.
df_unstacked=df_stacked.unstack()
print(df_unstacked.head(5))
# melt() keeps 'type' and 'isFraud' as identifier columns and turns every
# other column into (variable, value) rows.
df_melt=data.melt(id_vars=['type','isFraud'])
print(df_melt.head(10))
# Transpose: rows become columns and columns become rows.
transposed_data=data.T
print(transposed_data)
# Data merging: inner-join the transactions with a second CSV on the shared
# "type" column.
# NOTE(review): if "type" values repeat in both files this is a many-to-many
# join and the result can be far larger than either input -- confirm that
# crd.csv is keyed by type.
data1=pd.read_csv("/content/crd.csv")
merged_data=pd.merge(data, data1, on="type", how="inner")
print(merged_data)
# Data aggregation: mean and total of `amount` for every transaction type,
# returned as one table with a (column, statistic) header.
amount_stats = ['mean', 'sum']
aggregated_df = data.groupby('type').agg({'amount': amount_stats})
print(aggregated_df)
# Group-by: the same two statistics, this time as two separate Series that
# share a single grouped view of the `amount` column.
amount_by_type = data.groupby('type')['amount']
mean_value = amount_by_type.mean()
sum_value = amount_by_type.sum()
print("Mean:", mean_value)
print("Sum:", sum_value)
# Data analysis techniques
import matplotlib.pyplot as plt
import seaborn as sns

# Univariate analysis: histogram of the last 15 values of `amount`.
sns.histplot(data['amount'].tail(15), bins=20)
plt.title("univariate analysis")
plt.show()

# Bivariate analysis: `amount` against `oldbalanceOrg` for the first 10 rows.
# Both series are read from `data`; the y-values used to be read from `df`,
# which is not defined until later in the script and raised a NameError here.
x = data["amount"].head(10)
y = data["oldbalanceOrg"].head(10)
plt.scatter(x, y)
plt.title("Bivariate analysis")
plt.show()

# Multivariate analysis: pairwise plots over the first 10 rows.
sns.pairplot(data.head(10))
# pairplot draws a grid of subplots; plt.title would label only the most
# recently drawn one, so the title is attached to the whole figure instead.
plt.suptitle("multivariate analysis", y=1.02)
plt.show()
# Histogram of the first 10 values of `amount`.
import matplotlib.pyplot as plt
import pandas as pd
path="/content/drive/MyDrive/fraud1.csv"
df=pd.read_csv(path)
plt.hist(df['amount'].head(10), bins=20)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram')
plt.show()

# Bar chart: number of rows per transaction type.
# The counts are computed once and reused for both the labels and the heights.
type_counts = df['type'].value_counts()
plt.bar(type_counts.index, type_counts.values)
plt.xlabel('Category')
plt.ylabel('Frequency')
plt.title('Bar Chart')
plt.show()

# Scatter plot: `type` against `amount` for the first 25 rows.
plt.scatter(df['type'].head(25), df['amount'].head(25))
plt.xlabel('type')
plt.ylabel('amount')
plt.title('Scatter Plot')
plt.show()

# Box plot of the full `amount` column.
plt.boxplot(df['amount'])
plt.xlabel('Amount')
plt.ylabel('Value')
plt.title('Box Plot')
plt.show()

# Pair plot over the whole DataFrame.
# NOTE(review): this plots every row; on a large file it may be very slow --
# consider sampling if that turns out to be a problem.
sns.pairplot(df)
# A pair plot is a grid of subplots; plt.title would label only the last one,
# so the title is set on the figure as a whole.
plt.suptitle('Pair Plot', y=1.02)
plt.show()

# Interactive scatter plot of the first 10 rows.
import plotly.express as px
fig = px.scatter(df.head(10), x='amount', y='type')
fig.show()
# Interactive dashboard: a single-page Dash app showing `amount` vs `type`.
import dash
import dash_core_components as dcc
import dash_html_components as html
import pandas as pd

# NOTE(review): recent Dash releases expose these components as
# `from dash import dcc, html` and start the server with `app.run(...)`;
# confirm which Dash version this project targets before changing either.

# The module name is spelled with double underscores; `_name_` is an
# undefined variable and raised a NameError.
app = dash.Dash(__name__)

app.layout = html.Div([
    dcc.Graph(
        id='interactive-plot',
        figure={
            'data': [
                {'x': df['amount'], 'y': df['type'],
                 'mode': 'markers', 'type': 'scatter'}
            ],
            'layout': {
                'title': 'Interactive Scatter Plot',
                'xaxis': {'title': 'amount'},
                'yaxis': {'title': 'type'}
            }
        }
    )
])

# Start the development server only when this file is executed directly.
if __name__ == '__main__':
    app.run_server(debug=True)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy as sp
from tabulate import tabulate
import random
import tensorflow as tf
# Load the online-fraud dataset; this rebinds `df` to the new DataFrame.
df = pd.read_csv('/content/drive/MyDrive/onlinefraud.csv')
df.head()
# Remove the 'isFlaggedFraud' column from `df` in place.
df.drop('isFlaggedFraud', axis=1, inplace=True)
# Quick inspection: column/dtype summary, 5 random rows, descriptive
# statistics and the missing-value count per column.  The bare expressions
# display output only in an interactive notebook cell.
df.info()
df.sample(5)
df.describe()
df.isnull().sum()
# Print a GitHub-style table with the smallest and largest value of each
# selected column; every row is [column name, min, max].
summary_columns = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'isFraud']
fraud_min_max = [[col, df[col].min(), df[col].max()] for col in summary_columns]
summary_table = tabulate(
    fraud_min_max,
    headers=['columns', 'min value', 'max value'],
    showindex=True,
    tablefmt='github',
    numalign='right',
)
print(summary_table)
# Downcast numerical columns to a smaller dtype.
# float64 columns are downcast as floats and int64 columns as unsigned
# integers; pd.to_numeric leaves a column untouched when no smaller dtype can
# hold its values.
# NOTE(review): a float32 result carries fewer significant digits than
# float64 -- confirm that is acceptable for the monetary columns.
for col in df.columns:
    if df[col].dtype == 'float64':
        df[col] = pd.to_numeric(df[col], downcast='float')
    elif df[col].dtype == 'int64':
        df[col] = pd.to_numeric(df[col], downcast='unsigned')
# Seed NumPy, the standard-library `random` module and TensorFlow with the
# same value.
seed = 42
for seed_fn in (np.random.seed, random.seed, tf.random.set_seed):
    seed_fn(seed)

# Feature frame: a copy of `df` without the listed name/balance columns.
X = df.drop(columns=['nameOrig', 'newbalanceOrig', 'nameDest', 'newbalanceDest'])
# Target: 'isFraud' is removed from X and kept separately as y.
y = X.pop('isFraud')
def model_comparison_evaluate(classifiers, X, y):
    """Print a K-fold cross-validation report header for each classifier.

    Args:
        classifiers: Mapping of display name to estimator instance.
        X: Feature matrix (not used by the current body, see note).
        y: Target vector (not used by the current body, see note).

    Returns:
        None. The report is written to standard output.

    NOTE(review): the body only prints each classifier's name followed by a
    blank separator; no scoring is performed with ``X`` and ``y``. The
    cross-validation step itself appears to be missing from this file --
    confirm against the original notebook before relying on this function.
    """
    print('K-Fold Cross-Validation:\n')
    for name, _model in classifiers.items():
        print('{}:'.format(name))
        # Blank separator after each classifier's section.
        print('\n')
# Model training and evaluation with a class-weighted Random Forest.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# The forest needs numeric inputs, so one-hot encode any text columns of X.
X_encoded = pd.get_dummies(X)

# 80% training / 20% testing, as described in the report's "Data Splitting"
# step.  Stratifying on y keeps the fraud ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, stratify=y, random_state=seed
)

classifiers = {
    'Random Forest Classifier': RandomForestClassifier(class_weight='balanced',
                                                       random_state=seed),
}
model_comparison_evaluate(classifiers, X_train, y_train)

model = RandomForestClassifier(class_weight='balanced', random_state=seed)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Predicted probability of the positive (fraud) class for each test row.
y_pred_score = model.predict_proba(X_test)[:, 1]

print('Random Forest Classifier:')
# classification_report takes the true labels first and the predictions
# second; swapping them exchanges precision and recall in the report.
print(classification_report(y_test, y_pred, labels=[0, 1],
                            target_names=['Non-Fraud [0]', 'Fraud [1]']), '\n')
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def intra_list_diversity(recommendation_lists):
    """Return one intra-list diversity score per recommendation list.

    The score of a list is ``1 - mean(cosine similarity matrix)``. Lists with
    zero or one element get a score of 0.

    Args:
        recommendation_lists: Iterable of recommendation lists.

    Returns:
        A list with one score per input list, in input order.

    NOTE(review): each list is reshaped to a single row before the similarity
    is computed, so the matrix is 1x1 (the row compared with itself). Confirm
    whether the items were meant to be compared with each other instead.
    """
    intra_diversities = []
    for recommendation_list in recommendation_lists:
        if len(recommendation_list) <= 1:
            # Empty and single-element lists have nothing to compare.
            intra_diversities.append(0)
            continue
        list_array = np.array(recommendation_list).reshape(1, -1)  # 2D, one row
        similarity_matrix = cosine_similarity(list_array)
        intra_diversities.append(1 - np.mean(similarity_matrix))
    return intra_diversities
def inter_list_diversity(recommendation_lists):
    """Return the mean pairwise score over all pairs of recommendation lists.

    Every unordered pair of lists is visited once. Each list is reshaped into
    a single column, the cosine similarity between the two columns' rows is
    computed, and the ``[0][0]`` entry of that matrix is recorded.

    Args:
        recommendation_lists: Sequence of recommendation lists.

    Returns:
        The mean of the recorded values, or 0.0 when there are fewer than two
        lists and therefore no pairs to compare.

    NOTE(review): with one-column inputs the similarity matrix has one entry
    per pair of items, and only the first item of each list contributes to
    ``[0][0]``. The recorded value is also a similarity, not ``1 - similarity``.
    Confirm the intended metric.
    """
    from itertools import combinations

    inter_diversity = []
    for first, second in combinations(recommendation_lists, 2):
        list_i = np.array(first).reshape(-1, 1)   # 2D array with one column
        list_j = np.array(second).reshape(-1, 1)  # 2D array with one column
        similarity_matrix = cosine_similarity(list_i, list_j)
        inter_diversity.append(similarity_matrix[0][0])
    if not inter_diversity:
        # np.mean of an empty list is NaN (with a RuntimeWarning); report 0.0.
        return 0.0
    return np.mean(inter_diversity)
# Sample data: five transactions written one row per tuple, then pivoted into
# the column -> values mapping that the DataFrame is built from.
sample_columns = [
    'step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
    'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud',
]
sample_rows = [
    (1, 'PAYMENT', 9839.64, 'C1231006815', 170136.0, 160296.36, 'M1979787155', 0.0, 0.0, 0, 0),
    (1, 'PAYMENT', 1864.28, 'C1666544295', 21249.0, 19384.72, 'M2044282225', 0.0, 0.0, 0, 0),
    (1, 'TRANSFER', 181.0, 'C1305486145', 181.0, 0.0, 'C553264065', 0.0, 0.0, 1, 0),
    (1, 'CASH_OUT', 181.0, 'C840083671', 181.0, 0.0, 'C38997010', 21182.0, 0.0, 1, 0),
    (1, 'PAYMENT', 11668.14, 'C2048537720', 41554.0, 29885.86, 'M1230701703', 0.0, 0.0, 0, 0),
]
data = {
    column: list(values)
    for column, values in zip(sample_columns, zip(*sample_rows))
}
# Create DataFrame
df = pd.DataFrame(data)
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Sample transactions, keyed by column name.
data = dict(
    step=[1] * 5,
    type=['PAYMENT', 'PAYMENT', 'TRANSFER', 'CASH_OUT', 'PAYMENT'],
    amount=[9839.64, 1864.28, 181.0, 181.0, 11668.14],
    nameOrig=['C1231006815', 'C1666544295', 'C1305486145', 'C840083671', 'C2048537720'],
    oldbalanceOrg=[170136.0, 21249.0, 181.0, 181.0, 41554.0],
    newbalanceOrig=[160296.36, 19384.72, 0.0, 0.0, 29885.86],
    nameDest=['M1979787155', 'M2044282225', 'C553264065', 'C38997010', 'M1230701703'],
    oldbalanceDest=[0.0, 0.0, 0.0, 21182.0, 0.0],
    newbalanceDest=[0.0] * 5,
    isFraud=[0, 0, 1, 1, 0],
    isFlaggedFraud=[0] * 5,
)
# Build the DataFrame and take the `amount` column as a NumPy array.
df = pd.DataFrame(data)
amount_column = df['amount']
popularity_scores = amount_column.values
OUTPUT:
#Data Description:
#Data Validation
#isFraud.unique()
#Data Reshaping
#transpose
#Data Merging
#Data Aggregation
#Data Analysis
#Histogram
#Bar Chart
#Scatter Plot
#Box Plot
#Multivariate Analysis
#Interactive Scatterplots
#Interactive Dashboards
#Count Plot
#Module Training
Future Enhancements:
Conclusion: