Cyber Security Coding in Python
Cyber Security Coding in Python
installed
# It is defined by the python Docker image
# For example, here's several helpful packages to load
import os
for dirname, _, filenames in os.walk('/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146:
UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this
version of SciPy (detected version 1.23.5
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
/input/cyber-security-breaches/CyberSecurityBreaches.csv
In [2]:
# Load dataset
health_data =
pd.read_csv('/input/cyber-security-breaches/CyberSecurityBreaches.csv')
# Displaying first 5 columns
health_data.head()
Out[2]:
S/n Individual
Name Type Date Breach Location Associate Description
0 s
A binder containing
Brooke Army Healthcare 2009-
0 1 1000 Theft Paper/Films False the protected health
Medical Center Provider 10-21
information
Five desktop
Kidney Stone Healthcare 2009-
1 2 1000 Theft Network Server False computers containing
Association, LLC Provider 10-28
unencrypted
3 4 Health Services for Health Plan 3800 2009- Loss Laptop False A laptop was lost by
S/n Individual
Name Type Date Breach Location Associate Description
0 s
A shared Computer
Healthcare 2009- Desktop
4 5 Douglas Carlson 5257 Theft False that was used for
Provider 11-20 Computer
backup was hacked
In [3]:
# Display the number of rows and columns of the DataFrame
health_data.shape
Out[3]:
(1151, 10)
In [4]:
# Display the summary of the DataFrame
health_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1151 entries, 0 to 1150
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 1151 non-null int64
1 Name.of.Covered.Entity 1151 non-null object
2 State 1151 non-null object
3 Covered.Entity.Type 1151 non-null object
4 Individuals.Affected 1151 non-null int64
5 Breach.Submission.Date 1151 non-null object
6 Type.of.Breach 1151 non-null object
7 Location.of.Breached.Information 1151 non-null object
8 Business.Associate.Present 1151 non-null bool
9 Web.Description 1101 non-null object
dtypes: bool(1), int64(2), object(7)
memory usage: 82.2+ KB
In [5]:
# Drop unwanted columns
health_data.drop(columns=['Unnamed: 0'], inplace=True)
In [6]:
# Replace '.' wuth ' ' in column names
health_data.columns = health_data.columns.str.replace('.', ' ', regex=False)
In [7]:
# Check for missing values
health_data.isna().sum()
Out[7]:
Name of Covered Entity 0
State 0
Covered Entity Type 0
Individuals Affected 0
Breach Submission Date 0
Type of Breach 0
Location of Breached Information 0
Business Associate Present 0
Web Description 50
dtype: int64
In [8]:
# Handling missing values in the 'Web Description' column by filling them
with 'Not Available'
health_data['Web Description'].fillna('Not Available', inplace=True)
# Replace problematic characters '\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a' with
a single apostrophe "'"
health_data['Web Description'] = health_data['Web
Description'].str.replace('\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a', "'")
# Print the shape of the DataFrame before and after removing duplicates
print("Shape of DataFrame before removing duplicates:", health_data.shape)
print("Shape of DataFrame after removing duplicates:",
health_data_no_duplicates.shape)
Shape of DataFrame before removing duplicates: (1151, 9)
Shape of DataFrame after removing duplicates: (1151, 9)
In [10]:
unique_location = health_data['Location of Breached
Information'].value_counts()
print(unique_location)
Paper/Films
254
Laptop
222
Other
132
Network Server
127
Desktop Computer
108
Other Portable Electronic Device
68
Email
66
Other, Other Portable Electronic Device
44
Electronic Medical Record
30
Laptop, Other Portable Electronic Device
11
Desktop Computer, Laptop
10
Desktop Computer, Network Server
8
Laptop, Paper/Films
6
Desktop Computer, Email, Laptop, Network Server
6
Desktop Computer, Paper/Films
5
Other, Paper/Films
4
Desktop Computer, Laptop, Other Portable Electronic Device
4
Email, Network Server
4
Electronic Medical Record, Other
4
Electronic Medical Record, Paper/Films
4
Desktop Computer, Electronic Medical Record
2
Electronic Medical Record, Laptop
2
Email, Other
2
Desktop Computer, Other Portable Electronic Device
2
Laptop, Network Server
2
Email, Other Portable Electronic Device
2
Electronic Medical Record, Other, Other Portable Electronic Device
2
Electronic Medical Record, Network Server
1
Email, Laptop
1
Desktop Computer, Network Server, Paper/Films
1
Desktop Computer, Email, Laptop, Network Server, Other, Other Portable
Electronic Device 1
Desktop Computer, Email
1
Laptop, Other
1
Desktop Computer, Laptop, Network Server
1
Email, Laptop, Other Portable Electronic Device
1
Email, Laptop, Network Server
1
Desktop Computer, Network Server, Other, Other Portable Electronic Device
1
Desktop Computer, Other
1
Desktop Computer, Electronic Medical Record, Email, Network Server,
Paper/Films 1
Network Server, Other
1
Desktop Computer, Electronic Medical Record, Email, Laptop, Network
Server, Other, Other Portable Electronic Device, Paper/Films 1
Laptop, Other Portable Electronic Device, Paper/Films
1
Desktop Computer, Electronic Medical Record, Email, Laptop, Network
Server, Other, Other Portable Electronic Device 1
Desktop Computer, Other, Other Portable Electronic Device
1
Desktop Computer, Laptop, Other, Other Portable Electronic Device
1
Desktop Computer, Electronic Medical Record, Network Server
1
Other Portable Electronic Device, Paper/Films
1
Name: Location of Breached Information, dtype: int64
In [11]:
location_mapping = {
'Paper/Films': 'Physical',
'Laptop': 'Electronic',
'Network Server': 'Electronic',
'Desktop Computer': 'Electronic',
'Other Portable Electronic Device': 'Electronic',
'Email': 'Electronic',
'Electronic Medical Record': 'Electronic',
'Other': 'Other'
}
health_data['Location Group'] = health_data['Location of Breached
Information'].map(location_mapping)
In [12]:
unique_breach_types = health_data['Type of Breach'].value_counts()
print(unique_breach_types)
Theft 577
Unauthorized Access/Disclosure 183
Other 89
Loss 79
Hacking/IT Incident 77
Improper Disposal 42
Theft, Unauthorized Access/Disclosure 24
Loss, Theft 15
Hacking/IT Incident, Unauthorized Access/Disclosure 10
Unknown 10
Other, Unauthorized Access/Disclosure 7
Other, Theft 5
Loss, Unauthorized Access/Disclosure 5
Improper Disposal, Loss, Theft 3
Hacking/IT Incident, Theft, Unauthorized Access/Disclosure 3
Improper Disposal, Loss 3
Loss, Other 2
Improper Disposal, Unauthorized Access/Disclosure 2
Other, Theft, Unauthorized Access/Disclosure 2
Loss, Unknown 2
Other, Unknown 2
Hacking/IT Incident, Other 2
Loss, Unauthorized Access/Disclosure, Unknown 1
Hacking/IT Incident, Other, Unauthorized Access/Disclosure 1
Hacking/IT Incident, Theft 1
Loss, Other, Theft 1
Improper Disposal, Theft, Unauthorized Access/Disclosure 1
Unauthorized Access/Disclosure 1
Theft, Unauthorized Access/Disclosure, Unknown 1
Name: Type of Breach, dtype: int64
In [13]:
# Define a dictionary to map breach types to groups
breach_type_mapping = {
'Hacking/IT Incident': 'IT Incident',
'Improper Disposal': 'Physical Loss',
'Loss': 'Physical Loss',
'Theft': 'Physical Theft',
'Unauthorized Access/Disclosure': 'Unauthorized Access',
'Unknown': 'Unknown',
'Other': 'Other',
}
# Check the unique values in the new 'Breach Type Group' column
unique_breach_type_groups = health_data['Breach Type Group'].unique()
print(unique_breach_type_groups)
['Physical Theft' 'Physical Loss' 'Other' 'Unauthorized Access'
'IT Incident' nan 'Unknown']
In [14]:
health_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1151 entries, 0 to 1150
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name of Covered Entity 1151 non-null object
1 State 1151 non-null object
2 Covered Entity Type 1151 non-null object
3 Individuals Affected 1151 non-null int64
4 Breach Submission Date 1151 non-null object
5 Type of Breach 1151 non-null object
6 Location of Breached Information 1151 non-null object
7 Business Associate Present 1151 non-null bool
8 Web Description 1151 non-null object
9 Location Group 1007 non-null object
10 Breach Type Group 1057 non-null object
dtypes: bool(1), int64(1), object(9)
memory usage: 91.2+ KB
In [15]:
# Group data by breach type and count the number of breaches in each category
breach_type_counts = health_data['Breach Type Group'].value_counts()
In [16]:
# Group data by covered entity type and count the number of breaches in each
category
covered_entity_counts = health_data['Covered Entity Type'].value_counts()
plt.show()
#### Bivariate Visualization
##### Scatter Plot
###### Stacked BarChart
In [17]:
health_data['Breach Submission Date'] = pd.to_datetime(health_data['Breach
Submission Date'])
health_data['Year'] = health_data['Breach Submission Date'].dt.year
# Create a scatter plot
plt.figure(figsize=(12, 6))
sns.scatterplot(x='Year', y='Individuals Affected', data=health_data,
alpha=0.5, color='purple')
plt.ylabel('Number of Individuals Affected')
plt.xlabel('Year')
plt.title('Trends in Cybersecurity Breaches Impact Over Time')
plt.axhline(y=500, color='red', linestyle='--', label='Threshold for
Reporting (500)')
plt.legend()
# Explain
plt.text(2012, 2000000, 'More Individuals\nAffected', fontsize=12,
color='red')
plt.text(2014, 1000, 'Fewer Individuals\nAffected', fontsize=12,
color='green')
plt.text(2010, 1000, 'Reporting\nThreshold', fontsize=12, color='blue')
# context
plt.annotate(
'Cybersecurity breaches reported\nsince 2009\nThreshold for reporting
breaches: 500',
xy=(2010, 500),
xytext=(2010, 2000000),
fontsize=12,
arrowprops=dict(arrowstyle='->', color='gray')
)
plt.tight_layout()
plt.show()
In [18]:
linkcode
# Group data by year and breach type, count the number of breaches in each
category
grouped_data = health_data.groupby(['Year', 'Breach Type
Group']).size().unstack().fillna(0)