0% found this document useful (0 votes)
55 views

Cyber Security Coding in Python

Uploaded by

rasheedmumuni
Copyright
© © All Rights Reserved
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
55 views

Cyber Security Coding in Python

Uploaded by

rasheedmumuni
Copyright
© © All Rights Reserved
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 9

# This Python 3 environment comes with many helpful analytics libraries

installed
# It is defined by the python Docker image
# For example, here's several helpful packages to load

import numpy as np # linear algebra


import pandas as pd # data processing, CSV file I/O
import matplotlib.pyplot as plt # Data Visualization
import seaborn as sns # Data Visualization

# Input data files are available in the read-only "../input/" directory


# For example, running this will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/input'):
for filename in filenames:
print(os.path.join(dirname, filename))

/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146:
UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this
version of SciPy (detected version 1.23.5
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
/input/cyber-security-breaches/CyberSecurityBreaches.csv
In [2]:
# Load dataset
health_data =
pd.read_csv('/input/cyber-security-breaches/CyberSecurityBreaches.csv')
# Displaying first 5 columns
health_data.head()
Out[2]:

S/n Individual
Name Type Date Breach Location Associate Description
0 s

A binder containing
Brooke Army Healthcare 2009-
0 1 1000 Theft Paper/Films False the protected health
Medical Center Provider 10-21
information

Five desktop
Kidney Stone Healthcare 2009-
1 2 1000 Theft Network Server False computers containing
Association, LLC Provider 10-28
unencrypted

Department of Other Portable


Healthcare 2009-
2 3 Health and Social 501 Theft Electronic False \N
Provider 10-30
Services Device

3 4 Health Services for Health Plan 3800 2009- Loss Laptop False A laptop was lost by
S/n Individual
Name Type Date Breach Location Associate Description
0 s

Children with an employee while in


11-17
Special Need transit

A shared Computer
Healthcare 2009- Desktop
4 5 Douglas Carlson 5257 Theft False that was used for
Provider 11-20 Computer
backup was hacked

In [3]:
# Display the number of rows and columns of the DataFrame
health_data.shape
Out[3]:
(1151, 10)
In [4]:
# Display the summary of the DataFrame
health_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1151 entries, 0 to 1150
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 1151 non-null int64
1 Name.of.Covered.Entity 1151 non-null object
2 State 1151 non-null object
3 Covered.Entity.Type 1151 non-null object
4 Individuals.Affected 1151 non-null int64
5 Breach.Submission.Date 1151 non-null object
6 Type.of.Breach 1151 non-null object
7 Location.of.Breached.Information 1151 non-null object
8 Business.Associate.Present 1151 non-null bool
9 Web.Description 1101 non-null object
dtypes: bool(1), int64(2), object(7)
memory usage: 82.2+ KB

 ### Data Cleaning

In [5]:
# Drop unwanted columns
health_data.drop(columns=['Unnamed: 0'], inplace=True)
In [6]:
# Replace '.' wuth ' ' in column names
health_data.columns = health_data.columns.str.replace('.', ' ', regex=False)
In [7]:
# Check for missing values
health_data.isna().sum()
Out[7]:
Name of Covered Entity 0
State 0
Covered Entity Type 0
Individuals Affected 0
Breach Submission Date 0
Type of Breach 0
Location of Breached Information 0
Business Associate Present 0
Web Description 50
dtype: int64
In [8]:
# Handling missing values in the 'Web Description' column by filling them
with 'Not Available'
health_data['Web Description'].fillna('Not Available', inplace=True)
# Replace problematic characters '\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a' with
a single apostrophe "'"
health_data['Web Description'] = health_data['Web
Description'].str.replace('\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a', "'")

# Remove any '\n\\' characters by escaping the backlashes


health_data['Web Description'] = health_data['Web
Description'].str.replace('\\n\\\\', '', regex=False)
In [9]:
# Remove duplicate rows from the DataFrame
health_data_no_duplicates = health_data.drop_duplicates()

# Print the shape of the DataFrame before and after removing duplicates
print("Shape of DataFrame before removing duplicates:", health_data.shape)
print("Shape of DataFrame after removing duplicates:",
health_data_no_duplicates.shape)
Shape of DataFrame before removing duplicates: (1151, 9)
Shape of DataFrame after removing duplicates: (1151, 9)
In [10]:
unique_location = health_data['Location of Breached
Information'].value_counts()
print(unique_location)
Paper/Films
254
Laptop
222
Other
132
Network Server
127
Desktop Computer
108
Other Portable Electronic Device
68
Email
66
Other, Other Portable Electronic Device
44
Electronic Medical Record
30
Laptop, Other Portable Electronic Device
11
Desktop Computer, Laptop
10
Desktop Computer, Network Server
8
Laptop, Paper/Films
6
Desktop Computer, Email, Laptop, Network Server
6
Desktop Computer, Paper/Films
5
Other, Paper/Films
4
Desktop Computer, Laptop, Other Portable Electronic Device
4
Email, Network Server
4
Electronic Medical Record, Other
4
Electronic Medical Record, Paper/Films
4
Desktop Computer, Electronic Medical Record
2
Electronic Medical Record, Laptop
2
Email, Other
2
Desktop Computer, Other Portable Electronic Device
2
Laptop, Network Server
2
Email, Other Portable Electronic Device
2
Electronic Medical Record, Other, Other Portable Electronic Device
2
Electronic Medical Record, Network Server
1
Email, Laptop
1
Desktop Computer, Network Server, Paper/Films
1
Desktop Computer, Email, Laptop, Network Server, Other, Other Portable
Electronic Device 1
Desktop Computer, Email
1
Laptop, Other
1
Desktop Computer, Laptop, Network Server
1
Email, Laptop, Other Portable Electronic Device
1
Email, Laptop, Network Server
1
Desktop Computer, Network Server, Other, Other Portable Electronic Device
1
Desktop Computer, Other
1
Desktop Computer, Electronic Medical Record, Email, Network Server,
Paper/Films 1
Network Server, Other
1
Desktop Computer, Electronic Medical Record, Email, Laptop, Network
Server, Other, Other Portable Electronic Device, Paper/Films 1
Laptop, Other Portable Electronic Device, Paper/Films
1
Desktop Computer, Electronic Medical Record, Email, Laptop, Network
Server, Other, Other Portable Electronic Device 1
Desktop Computer, Other, Other Portable Electronic Device
1
Desktop Computer, Laptop, Other, Other Portable Electronic Device
1
Desktop Computer, Electronic Medical Record, Network Server
1
Other Portable Electronic Device, Paper/Films
1
Name: Location of Breached Information, dtype: int64
In [11]:
location_mapping = {
'Paper/Films': 'Physical',
'Laptop': 'Electronic',
'Network Server': 'Electronic',
'Desktop Computer': 'Electronic',
'Other Portable Electronic Device': 'Electronic',
'Email': 'Electronic',
'Electronic Medical Record': 'Electronic',
'Other': 'Other'
}
health_data['Location Group'] = health_data['Location of Breached
Information'].map(location_mapping)
In [12]:
unique_breach_types = health_data['Type of Breach'].value_counts()
print(unique_breach_types)
Theft 577
Unauthorized Access/Disclosure 183
Other 89
Loss 79
Hacking/IT Incident 77
Improper Disposal 42
Theft, Unauthorized Access/Disclosure 24
Loss, Theft 15
Hacking/IT Incident, Unauthorized Access/Disclosure 10
Unknown 10
Other, Unauthorized Access/Disclosure 7
Other, Theft 5
Loss, Unauthorized Access/Disclosure 5
Improper Disposal, Loss, Theft 3
Hacking/IT Incident, Theft, Unauthorized Access/Disclosure 3
Improper Disposal, Loss 3
Loss, Other 2
Improper Disposal, Unauthorized Access/Disclosure 2
Other, Theft, Unauthorized Access/Disclosure 2
Loss, Unknown 2
Other, Unknown 2
Hacking/IT Incident, Other 2
Loss, Unauthorized Access/Disclosure, Unknown 1
Hacking/IT Incident, Other, Unauthorized Access/Disclosure 1
Hacking/IT Incident, Theft 1
Loss, Other, Theft 1
Improper Disposal, Theft, Unauthorized Access/Disclosure 1
Unauthorized Access/Disclosure 1
Theft, Unauthorized Access/Disclosure, Unknown 1
Name: Type of Breach, dtype: int64
In [13]:
# Define a dictionary to map breach types to groups
breach_type_mapping = {
'Hacking/IT Incident': 'IT Incident',
'Improper Disposal': 'Physical Loss',
'Loss': 'Physical Loss',
'Theft': 'Physical Theft',
'Unauthorized Access/Disclosure': 'Unauthorized Access',
'Unknown': 'Unknown',
'Other': 'Other',
}

# Create a new column 'Breach Type Group' based on the mapping


health_data['Breach Type Group'] = health_data['Type of
Breach'].map(breach_type_mapping)

# Check the unique values in the new 'Breach Type Group' column
unique_breach_type_groups = health_data['Breach Type Group'].unique()
print(unique_breach_type_groups)
['Physical Theft' 'Physical Loss' 'Other' 'Unauthorized Access'
'IT Incident' nan 'Unknown']
In [14]:
health_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1151 entries, 0 to 1150
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name of Covered Entity 1151 non-null object
1 State 1151 non-null object
2 Covered Entity Type 1151 non-null object
3 Individuals Affected 1151 non-null int64
4 Breach Submission Date 1151 non-null object
5 Type of Breach 1151 non-null object
6 Location of Breached Information 1151 non-null object
7 Business Associate Present 1151 non-null bool
8 Web Description 1151 non-null object
9 Location Group 1007 non-null object
10 Breach Type Group 1057 non-null object
dtypes: bool(1), int64(1), object(9)
memory usage: 91.2+ KB

 ### Data Visualization

 #### Univariate Visualization


 ##### Bar Chart
 ###### Pie Chart

In [15]:
# Group data by breach type and count the number of breaches in each category
breach_type_counts = health_data['Breach Type Group'].value_counts()

# Create a bar chart


plt.figure(figsize=(10, 6))
breach_type_counts.plot(kind='bar', color='skyblue')
plt.xlabel('Breach Type')
plt.ylabel('Number of Breaches')
plt.title('Distribution of Breach Types')
plt.xticks(rotation=90)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [16]:
# Group data by covered entity type and count the number of breaches in each
category
covered_entity_counts = health_data['Covered Entity Type'].value_counts()

# Create a pie chart


plt.figure(figsize=(8, 8))
plt.pie(covered_entity_counts, labels=covered_entity_counts.index,
autopct='%1.1f%%', startangle=140, colors=sns.color_palette('pastel'))
plt.axis('equal') # Equal aspect ratio ensures that pie is drawn as a
circle.
plt.title('Distribution of Breaches by Covered Entity Type')

plt.show()
 #### Bivariate Visualization
 ##### Scatter Plot
 ###### Stacked BarChart

In [17]:
health_data['Breach Submission Date'] = pd.to_datetime(health_data['Breach
Submission Date'])
health_data['Year'] = health_data['Breach Submission Date'].dt.year
# Create a scatter plot
plt.figure(figsize=(12, 6))
sns.scatterplot(x='Year', y='Individuals Affected', data=health_data,
alpha=0.5, color='purple')
plt.ylabel('Number of Individuals Affected')
plt.xlabel('Year')
plt.title('Trends in Cybersecurity Breaches Impact Over Time')
plt.axhline(y=500, color='red', linestyle='--', label='Threshold for
Reporting (500)')
plt.legend()
# Explain
plt.text(2012, 2000000, 'More Individuals\nAffected', fontsize=12,
color='red')
plt.text(2014, 1000, 'Fewer Individuals\nAffected', fontsize=12,
color='green')
plt.text(2010, 1000, 'Reporting\nThreshold', fontsize=12, color='blue')

# context
plt.annotate(
'Cybersecurity breaches reported\nsince 2009\nThreshold for reporting
breaches: 500',
xy=(2010, 500),
xytext=(2010, 2000000),
fontsize=12,
arrowprops=dict(arrowstyle='->', color='gray')
)
plt.tight_layout()
plt.show()

In [18]:
linkcode
# Group data by year and breach type, count the number of breaches in each
category
grouped_data = health_data.groupby(['Year', 'Breach Type
Group']).size().unstack().fillna(0)

# Create a stacked bar chart with the 'pastel' color palette


plt.figure(figsize=(12, 6))
grouped_data.plot(kind='bar', stacked=True, colormap='GnBu')
plt.xlabel('Year')
plt.ylabel('Number of Breaches')
plt.title('Breach Type vs. Year')
plt.xticks(rotation=90)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.legend(title='Breach Type Group', loc='upper left')
plt.tight_layout()
plt.show()
<Figure size 1200x600 with 0 Axes>

You might also like