Credit Card Fraud Detection Using ML - Jupyter Notebook
In [24]:
!pip install scikit-learn
In [2]:
#IMPORT THE REQUIRED LIBRARIES
In [72]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
In [9]:
dataset=pd.read_csv('creditcard.csv')
In [10]:
#VIEWING THE DATASET USING head() and tail()
In [11]:
dataset.head()
Out[11]:
[first five rows of the dataset: Time, V1–V28 (anonymised PCA components), Amount, Class; wide output wrapped in export]
5 rows × 31 columns
In [12]:
dataset.tail()
Out[12]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ...
284802 172786.0 -11.881118 10.071785 -9.834783 -2.066656 -5.364473 -2.606837 -4.918215 7.305334 1.914428 ...
284803 172787.0 -0.732789 -0.055080 2.035030 -0.738589 0.868229 1.058415 0.024330 0.294869 0.584800 ...
284804 172788.0 1.919565 -0.301254 -3.249640 -0.557828 2.630515 3.031260 -0.296827 0.708417 0.432454 ...
284805 172788.0 -0.240440 0.530483 0.702510 0.689799 -0.377961 0.623708 -0.686180 0.679145 0.392087 ...
284806 172792.0 -0.533413 -0.189733 0.703337 -0.506271 -0.012546 -0.649617 1.577006 -0.414650 0.486180 ...
5 rows × 31 columns
In [13]:
#VIEW THE SHAPE OF THE DATASET
In [14]:
dataset.shape
Out[14]:
(284807, 31)
In [35]:
dataset.isnull().sum()
Out[35]:
Time 0
V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0
V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0
V24 0
V25 0
V26 0
V27 0
V28 0
Amount 0
Class 0
dtype: int64
In [16]:
#INFORMATION ABOUT DATASET FEATURES
In [19]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Time 284807 non-null float64
1 V1 284807 non-null float64
2 V2 284807 non-null float64
3 V3 284807 non-null float64
4 V4 284807 non-null float64
5 V5 284807 non-null float64
6 V6 284807 non-null float64
7 V7 284807 non-null float64
8 V8 284807 non-null float64
9 V9 284807 non-null float64
10 V10 284807 non-null float64
11 V11 284807 non-null float64
12 V12 284807 non-null float64
13 V13 284807 non-null float64
14 V14 284807 non-null float64
15 V15 284807 non-null float64
16 V16 284807 non-null float64
17 V17 284807 non-null float64
18 V18 284807 non-null float64
19 V19 284807 non-null float64
20 V20 284807 non-null float64
21 V21 284807 non-null float64
22 V22 284807 non-null float64
23 V23 284807 non-null float64
24 V24 284807 non-null float64
25 V25 284807 non-null float64
26 V26 284807 non-null float64
27 V27 284807 non-null float64
28 V28 284807 non-null float64
29 Amount 284807 non-null float64
30 Class 284807 non-null int64
dtypes: float64(30), int64(1)
memory usage: 67.4 MB
In [20]:
dataset.describe()
Out[20]:
[summary statistics (count, mean, std, min, 25%, 50%, 75%, max) for each of the 31 columns]
8 rows × 31 columns
In [21]:
#COUNTING THE TARGET VALUES OF VALID AND FRAUD TRANSACTIONS
In [22]:
dataset['Class'].value_counts()
Out[22]:
0 284315
1 492
Name: Class, dtype: int64
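Only 492 of 284,807 transactions are fraudulent, so the classes are heavily imbalanced. A quick sanity check on the fraud rate (a minimal sketch; the mean of a 0/1 column is the positive-class fraction):
fraud_ratio=dataset['Class'].mean()
print(f"Fraction of fraudulent transactions: {fraud_ratio:.4%}")   #prints roughly 0.1727%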
In [24]:
real=dataset[dataset.Class==0]
fraud=dataset[dataset.Class==1]
In [25]:
#VIEW THE SHAPES OF VALID AND FRAUD TRANSACTIONS DATA
In [27]:
print(real.shape,fraud.shape)
(284315, 31) (492, 31)
In [28]:
real.Amount.describe()
Out[28]:
count 284315.000000
mean 88.291022
std 250.105092
min 0.000000
25% 5.650000
50% 22.000000
75% 77.050000
max 25691.160000
Name: Amount, dtype: float64
In [29]:
fraud.Amount.describe()
Out[29]:
count 492.000000
mean 122.211321
std 256.683288
min 0.000000
25% 1.000000
50% 9.250000
75% 105.890000
max 2125.870000
Name: Amount, dtype: float64
In [30]:
#CONVERTING THE UNBALANCED DATASET TO EQUAL SIZE
In [31]:
real_trans=real.sample(n=492)
In [32]:
#SHAPE OF CONVERTED VALID TRANSACTIONS DATA
In [42]:
print(real_trans.shape)
(492, 31)
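Note: real.sample(n=492) draws a different random subset on every run. Passing a seed, e.g. real.sample(n=492, random_state=42), makes the undersampling (and everything downstream) reproducible; the seed value here is just an illustrative choice.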
In [33]:
#CONCATENATING THE NEW VALID TRANSACTIONS DATA AND FRAUD DATA TO FORM NEW DATASET
In [34]:
new_dataset=pd.concat([real_trans,fraud],axis=0)
In [35]:
new_dataset.shape
Out[35]:
(984, 31)
In [45]:
print(new_dataset)
Time V1 V2 V3 V4 V5 V6 \
175010 122163.0 -0.417411 0.160325 0.603045 -2.121531 0.351510 -0.465216
169964 119951.0 0.138774 0.905687 -2.094182 0.130775 3.505227 3.309589
50495 44516.0 -1.808419 -1.330914 1.886407 0.880290 1.535955 -1.565265
180946 124758.0 1.202082 -1.952323 -3.845221 -1.479496 0.658750 -0.957501
191993 129489.0 -0.237305 -0.187512 0.749235 -1.503247 0.127986 -0.541237
... ... ... ... ... ... ... ...
279863 169142.0 -1.927883 1.125653 -4.518331 1.749293 -1.566487 -2.010494
280143 169347.0 1.378559 1.289381 -5.004247 1.411850 0.442581 -1.326536
280149 169351.0 -0.676143 1.126366 -2.213700 0.468308 -1.120541 -0.003346
281144 169966.0 -3.113832 0.585864 -5.399730 1.817092 -0.840618 -2.943548
281674 170348.0 1.991976 0.158476 -2.583441 0.408670 1.151147 -0.096695
[984 rows x 31 columns]
In [36]:
#DIVIDING THE DATA INTO DEPENDENT AND INDEPENDENT VARIABLES
In [37]:
X=new_dataset.iloc[:,:-1]
y=new_dataset['Class']
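Note: iloc[:,:-1] keeps all 30 feature columns, including Time. An equivalent, more explicit form that does not depend on column order would be X=new_dataset.drop(columns=['Class']).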
In [38]:
print(X)
print(y)
Time V1 V2 V3 V4 V5 V6 \
263860 161146.0 -4.412310 2.156901 -2.183607 -1.832658 1.023210 -0.591397
249255 154316.0 -0.204154 0.910504 0.734187 -0.332162 0.466750 -0.986063
187770 127683.0 -1.154785 0.819386 1.497317 -0.554252 -0.387083 0.308089
106821 70142.0 -1.769494 0.864547 1.015048 -1.010349 -0.687608 -1.248479
205828 135929.0 -1.325690 1.369450 -0.801471 -0.269350 0.006770 -0.888574
... ... ... ... ... ... ... ...
279863 169142.0 -1.927883 1.125653 -4.518331 1.749293 -1.566487 -2.010494
280143 169347.0 1.378559 1.289381 -5.004247 1.411850 0.442581 -1.326536
280149 169351.0 -0.676143 1.126366 -2.213700 0.468308 -1.120541 -0.003346
281144 169966.0 -3.113832 0.585864 -5.399730 1.817092 -0.840618 -2.943548
281674 170348.0 1.991976 0.158476 -2.583441 0.408670 1.151147 -0.096695
[984 rows x 30 columns]
In [ ]:
#DATA STANDARDIZATION USING StandardScaler
In [44]:
scaler=StandardScaler()
standard_X=scaler.fit_transform(X)   #fit_transform learns mean/std and scales in one step
In [93]:
print(standard_X)
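One caveat: fitting the scaler on all 984 rows before splitting lets statistics from the future test set leak into the transform. A leakage-free variant (a sketch, assuming the same split parameters used below) fits the scaler on the training portion only:
x_tr,x_te,y_tr,y_te=train_test_split(X,y,test_size=0.1,stratify=y,random_state=42)
scaler=StandardScaler().fit(x_tr)    #learn mean/std from training rows only
x_tr_scaled=scaler.transform(x_tr)
x_te_scaled=scaler.transform(x_te)   #test rows scaled with training statistics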
In [45]:
#SPLITTING THE DATASET INTO TRAINING AND TESTING
In [46]:
x_train,x_test,y_train,y_test=train_test_split(standard_X,y,test_size=0.1,stratify=y,random_state=42)
In [92]:
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)
(885, 30) (99, 30) (885,) (99,)
In [47]:
classifier=LogisticRegression()
classifier.fit(x_train,y_train)
Out[47]:
LogisticRegression()
In [48]:
#CHECKING THE ACCURACY ON TRAINING DATA
In [49]:
train_pred=classifier.predict(x_train)
train_acc_score=accuracy_score(y_train,train_pred)   #accuracy_score expects (y_true, y_pred)
print("The Accuracy on training data is :",train_acc_score)
In [50]:
#PREDICT THE OUTPUTS USING TEST DATA ON THE MODEL
In [51]:
y_pred=classifier.predict(x_test)
In [52]:
print(y_pred)
[1 1 0 0 0 1 1 1 1 0 1 0 1 0 1 1 1 0 0 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 0 0 1
0 0 1 1 0 1 1 0 1 0 1 0 1 1 1 1 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0
0 0 1 0 0 0 1 0 1 1 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1]
In [97]:
print(y_test)
198868 1
100928 0
163310 0
6707 0
269672 0
..
259266 0
30100 1
123238 1
127259 0
262560 1
Name: Class, Length: 99, dtype: int64
In [53]:
#CHECKING THE ACCURACY OF PREDICTED OUTPUTS BY THE MODEL
In [55]:
accuracy=accuracy_score(y_test,y_pred)
print("The Accuracy Score of the model is : ",accuracy)
In [57]:
input_data=[166205.0,-1.359807134, -0.072781173,2.536346738,1.378155224,-0.33832077,
0.462387778,0.239598554,0.098697901,0.3637869,0.090794172,-0.551599533,
-0.617800856,-0.991389847,-0.311169354,1.468176972,-0.470400525,
0.207971242,0.02579058,0.40399296,0.251412098,-0.018306778,0.277837576,
-0.11047391,0.066928075,0.128539358,-0.189114844,0.133558377,-0.21053053,149.62]
In [58]:
#CHANGING THIS INPUT DATA INTO NUMPY ARRAY
In [60]:
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped=input_data_as_numpy_array.reshape(1,-1)
In [61]:
#STANDARDIZE THE RESHAPED ARRAY DATA
In [63]:
stand_input_data=scaler.transform(input_data_reshaped)
print(stand_input_data)
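Note: because the scaler was fitted on a DataFrame, recent scikit-learn versions emit a warning that the NumPy input has no feature names. Wrapping the row in a DataFrame with the same columns, e.g. scaler.transform(pd.DataFrame(input_data_reshaped, columns=X.columns)), keeps the transform identical and silences the warning.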
In [64]:
#PREDICT THE USER OF THIS UNKNOWN DATA
In [69]:
prediction_label=classifier.predict(stand_input_data)
In [71]:
if prediction_label[0]==1:
    print("Fraud Transaction..............👎")
else:
    print("Valid Transaction...............👍")
Valid Transaction...............👍
In [73]:
#-----------LET US FIT THE RandomForestClassifier ON THE SAME DATA AND CHECK THE ACCURACY AND RESULT----------
In [77]:
rfc=RandomForestClassifier()
rfc.fit(x_train,y_train)
pred_rfc=rfc.predict(x_test)
print("Predicted lables using RFC\n",pred_rfc)
In [78]:
acc_score_rfc=accuracy_score(pred_rfc,y_test)
print("Accuracy Score using RFC is :",acc_score_rfc)
In [79]:
prediction_label_rfc=rfc.predict(stand_input_data)
In [80]:
if prediction_label_rfc[0]==1:
    print("Fraud Transaction..............👎")
else:
    print("Valid Transaction...............👍")
Valid Transaction...............👍
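With only 984 rows after undersampling, a single 99-sample test set gives a noisy accuracy estimate, so scores can swing from run to run. Cross-validation averages over several splits; a quick sketch (max_iter is raised here as an assumption, to avoid convergence warnings; a Pipeline that scales inside each fold would also avoid the leakage noted earlier):
from sklearn.model_selection import cross_val_score
cv_scores=cross_val_score(LogisticRegression(max_iter=1000),standard_X,y,cv=5)
print("CV accuracy: %.3f +/- %.3f"%(cv_scores.mean(),cv_scores.std()))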