Assignment 1, Codeandssfile

The document outlines a comprehensive data cleaning and analysis process for fire incident data, including filling missing values, applying linear regression for predictions, and visualizing trends over time. It also describes spatial analysis techniques such as K-means clustering and heatmap generation to identify incident hotspots. Additionally, it discusses the impact of incident types on resource utilization and proposes predictive modeling for resource allocation based on incident characteristics.

Data Cleaning:

Code:
import pandas as pd
from sklearn.linear_model import LinearRegression

def fill_postcode_from_district(df):
    # Identify rows where Postcode_full is NaN and Postcode_district is not NaN
    mask = df['Postcode_full'].isna() & ~df['Postcode_district'].isna()

    # Create a mapping from Postcode_district to Postcode_full for non-NaN values
    district_to_postcode = df[~df['Postcode_full'].isna()].drop_duplicates('Postcode_district').set_index('Postcode_district')['Postcode_full']

    # Use vectorized operations to fill NaN values
    df.loc[mask, 'Postcode_full'] = df.loc[mask, 'Postcode_district'].map(district_to_postcode)

def fill_lat_lon_from_postcode_efficiently(df):
    # Create a mapping from Postcode_full to Latitude and Longitude for rows where they are not NaN
    postcode_to_lat_lon = df.dropna(subset=['Latitude', 'Longitude', 'Postcode_full'])[['Postcode_full', 'Latitude', 'Longitude']].drop_duplicates('Postcode_full')

    # Map the known coordinates onto rows whose Latitude/Longitude are NaN
    df['Latitude'] = df['Latitude'].fillna(df['Postcode_full'].map(postcode_to_lat_lon.set_index('Postcode_full')['Latitude']))
    df['Longitude'] = df['Longitude'].fillna(df['Postcode_full'].map(postcode_to_lat_lon.set_index('Postcode_full')['Longitude']))

def fill_postcode_from_district_efficiently(df):
    # Create a mapping from Postcode_district to Postcode_full for non-NaN values
    postcode_map = df.dropna(subset=['Postcode_full']).drop_duplicates('Postcode_district').set_index('Postcode_district')['Postcode_full'].to_dict()

    # Use vectorized operations to fill NaN values
    df['Postcode_full'] = df['Postcode_full'].fillna(df['Postcode_district'].map(postcode_map))

def fill_incgeo_wardcode_from_propercase(df):
    # Create a mapping from ProperCase to IncGeo_WardCode for rows where IncGeo_WardCode is not NaN
    propercase_to_wardcode = df.dropna(subset=['IncGeo_WardCode']).drop_duplicates('ProperCase').set_index('ProperCase')['IncGeo_WardCode'].to_dict()

    # Use vectorized operations to fill NaN values
    mask = (df['IncGeo_WardCode'].isna()) & (~df['ProperCase'].isna())
    df.loc[mask, 'IncGeo_WardCode'] = df.loc[mask, 'ProperCase'].map(propercase_to_wardcode)

def fill_easting_from_uprn(df):
    # Create a mapping from UPRN to Easting_m for rows where UPRN is not zero
    uprn_to_easting = df[df['UPRN'] != 0][['UPRN', 'Easting_m']].drop_duplicates('UPRN')

    # Fill Easting_m where it is NaN and a matching non-zero UPRN exists
    df['Easting_m'] = df['Easting_m'].mask((df['Easting_m'].isna()) & (df['UPRN'] != 0), df['UPRN'].map(uprn_to_easting.set_index('UPRN')['Easting_m']))

# Main code
file_path = 'C:\\Users\\PMLS\\Desktop\\LFB_2019-22.csv'
df = pd.read_csv(file_path, encoding='latin1')

# Check for empty cells in each column
empty_cells_per_column = df.isnull().sum()
print("Empty cells per column:")
print(empty_cells_per_column)

# Filter rows where StopCodeDescription is 'Special Service'
special_service_df = df[df['StopCodeDescription'] == 'Special Service']

# Count NaN values in SpecialServiceType within the filtered rows
nan_count = special_service_df['SpecialServiceType'].isna().sum()
print("Number of 'Special Service' entries with NaN in 'SpecialServiceType':", nan_count)

# Replace NaN values in SpecialServiceType with 'Not applicable'
df['SpecialServiceType'] = df['SpecialServiceType'].fillna('Not applicable')

# Apply functions to fill data
fill_postcode_from_district(df)
fill_postcode_from_district_efficiently(df)
fill_incgeo_wardcode_from_propercase(df)
fill_lat_lon_from_postcode_efficiently(df)

# Fill remaining blank cells
df['IncGeo_WardCode'].fillna('Unknown', inplace=True)
df['IncGeo_WardName'].fillna('Unknown', inplace=True)
df['IncGeo_WardNameNew'].fillna('Unknown', inplace=True)
df['IncidentStationGround'].fillna('Unknown', inplace=True)
df['FirstPumpArriving_AttendanceTime'].fillna('Unknown', inplace=True)
df['FirstPumpArriving_DeployedFromStation'].fillna('Unknown', inplace=True)
df['SecondPumpArriving_AttendanceTime'].fillna('Unknown', inplace=True)
df['SecondPumpArriving_DeployedFromStation'].fillna('Unknown', inplace=True)
df['NumCalls'].fillna(0, inplace=True)
df['NumPumpsAttending'].fillna(0, inplace=True)
df['PumpCount'].fillna(0, inplace=True)
df['PumpHoursRoundUp'].fillna(0, inplace=True)
df['NumStationsWithPumpsAttending'].fillna(0, inplace=True)

# Derive notional cost from pump hours at the rate of £333 per pump hour
df['Notional Cost (£)'] = df['PumpHoursRoundUp'] * 333

# Prepare the data for the linear regression model to predict Easting_m
df_model = df.dropna(subset=['Easting_m', 'Longitude'])
X = df_model[['Longitude']]
y = df_model['Easting_m']
# Fit the linear regression model
model = LinearRegression()
model.fit(X, y)
# Predict missing Easting_m values using the model
missing_easting_index = df['Easting_m'].isnull()
predicted_easting = model.predict(df.loc[missing_easting_index, ['Longitude']])
# Fill in the missing Easting_m values with the predictions
df.loc[missing_easting_index, 'Easting_m'] = predicted_easting
# Check again for missing values in Easting_m after filling
missing_easting_final = df['Easting_m'].isnull().sum()
print("Missing Easting_m values after filling:", missing_easting_final)

# Prepare the data for the linear regression model to predict Northing_m
df_model_northing = df.dropna(subset=['Northing_m', 'Longitude'])
X_northing = df_model_northing[['Longitude']]  # Using Longitude as predictor
y_northing = df_model_northing['Northing_m']
# Fit the linear regression model for Northing_m
model_northing = LinearRegression()
model_northing.fit(X_northing, y_northing)
# Predict missing Northing_m values using the model
missing_northing_index = df['Northing_m'].isnull()
predicted_northing = model_northing.predict(df.loc[missing_northing_index, ['Longitude']])
# Fill in the missing Northing_m values with the predictions
df.loc[missing_northing_index, 'Northing_m'] = predicted_northing
# Check again for missing values in Northing_m after filling
missing_northing_final = df['Northing_m'].isnull().sum()
print("Missing Northing_m values after filling:", missing_northing_final)

# Check for empty cells again after the replacement
empty_cells_after_fill = df.isnull().sum()
print("\nEmpty cells per column after replacement:")
print(empty_cells_after_fill)

df.to_csv('C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv', index=False)
Screen Shots:
Before Cleaning:

After Cleaning:
Data Behavior:
Code:
import pandas as pd
import matplotlib.pyplot as plt

file_path = 'C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv'
df = pd.read_csv(file_path, encoding='latin1')

# Convert DateOfCall to datetime
df['DateOfCall'] = pd.to_datetime(df['DateOfCall'])

# Trend of Incidents Over Time
# Grouping data by month and year
monthly_incidents = df.resample('M', on='DateOfCall').size()

# Plotting the trend of incidents over time
plt.figure(figsize=(14, 7))
monthly_incidents.plot(title='Monthly Trend of Incidents (2019-2022)')
plt.xlabel('Month')
plt.ylabel('Number of Incidents')
plt.grid(True)
plt.show()

# Trend of Incident Types
incident_types = df['IncidentGroup'].value_counts()

# Plotting the frequency of different types of incidents
plt.figure(figsize=(10, 6))
incident_types.plot(kind='bar', title='Frequency of Incident Types')
plt.xlabel('Incident Type')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.show()

# Filtering out rows where FirstPumpArriving_AttendanceTime is 'Unknown' or missing
filtered_data = df[df['FirstPumpArriving_AttendanceTime'] != 'Unknown'].copy()
filtered_data['FirstPumpArriving_AttendanceTime'] = pd.to_numeric(filtered_data['FirstPumpArriving_AttendanceTime'])

# Trend of First Pump Arriving Attendance Time
# Calculating the average first pump arriving attendance time by month
average_response_time_monthly = filtered_data.resample('M', on='DateOfCall')['FirstPumpArriving_AttendanceTime'].mean()

# Plotting the trend of average first pump arriving attendance time over time
plt.figure(figsize=(14, 7))
average_response_time_monthly.plot(title='Monthly Trend of First Pump Arriving Attendance Time (2019-2022)', color='orange')
plt.xlabel('Month')
plt.ylabel('Average Attendance Time (Seconds)')
plt.grid(True)
plt.show()

# Incidents by Hour of Call
incidents_by_hour = df['HourOfCall'].value_counts().sort_index()

# Plotting incidents by hour of the day
plt.figure(figsize=(12, 6))
incidents_by_hour.plot(kind='bar', title='Incidents by Hour of Call')
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Incidents')
plt.xticks(rotation=0)
plt.grid(axis='y')
plt.show()

# Analyzing the Notional Cost of Incidents Over Time
# Ensure the 'Notional Cost (£)' column is numeric
df['Notional Cost (£)'] = pd.to_numeric(df['Notional Cost (£)'], errors='coerce')

# Calculating the total notional cost by month
monthly_notional_cost = df.resample('M', on='DateOfCall')['Notional Cost (£)'].sum()

# Plotting the trend of notional cost over time
plt.figure(figsize=(14, 7))
monthly_notional_cost.plot(title='Monthly Notional Cost of Incidents (2019-2022)', color='green')
plt.xlabel('Month')
plt.ylabel('Total Notional Cost (£)')
plt.grid(True)
plt.show()

Screen shots:
Spatial Analysis for Fire Incident Hotspots:
•  Data Preprocessing: Validate and clean the geographical data (Latitude, Longitude, Postcode_district).
•  Feature Engineering: Extract additional spatial features if available, or create spatial bins to categorize incidents into specific areas (a binning sketch appears within the code below).
•  Clustering: Apply spatial clustering techniques such as DBSCAN or k-means to identify hotspots (a DBSCAN sketch follows this list).
•  Heat Maps: Generate heat maps using GIS tools to visually represent the concentration of incidents.
•  Association Rule Mining: Explore associations between incident hotspots and other factors such as the time or type of incident (see the cross-tabulation sketch after the code).
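
The code in this section uses K-means only. As a hedged alternative for the DBSCAN option above, a minimal sketch could look like the following; the eps (~0.5 km) and min_samples values are illustrative assumptions, and the file path is the cleaned CSV used elsewhere in this document.

import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

df = pd.read_csv('C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv', usecols=['Latitude', 'Longitude']).dropna()
coords = np.radians(df[['Latitude', 'Longitude']].to_numpy())

# eps is a haversine distance in radians: roughly 0.5 km divided by the Earth radius (6371 km)
db = DBSCAN(eps=0.5 / 6371.0, min_samples=50, metric='haversine').fit(coords)
df['Cluster'] = db.labels_  # label -1 marks noise points that fall outside any dense hotspot
print(df['Cluster'].value_counts().head(10))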

Code:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

def load_and_preprocess_data(filepath, usecols=None):
    """Load the dataset and preprocess."""
    df = pd.read_csv(filepath, usecols=usecols)
    # Ensure all data is valid
    df = df[(df['Latitude'].between(-90, 90)) & (df['Longitude'].between(-180, 180))]
    df['Postcode_district'] = df['Postcode_district'].fillna('Unknown')
    # Impute missing values for Latitude and Longitude if necessary
    df['Latitude'].fillna(df['Latitude'].mean(), inplace=True)
    df['Longitude'].fillna(df['Longitude'].mean(), inplace=True)
    return df
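
# (Hedged addition, not part of the original script) A simple way to create the
# spatial bins mentioned in the plan above: round coordinates to a coarse grid so
# each incident falls into a named cell. The 0.01-degree cell size is an
# illustrative assumption (roughly 1 km at London's latitude).
def add_spatial_bins(df, cell_size=0.01):
    """Attach a grid-cell label derived from rounded Latitude/Longitude."""
    lat_bin = (df['Latitude'] / cell_size).round().astype(int)
    lon_bin = (df['Longitude'] / cell_size).round().astype(int)
    df['SpatialBin'] = lat_bin.astype(str) + '_' + lon_bin.astype(str)
    return df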

def apply_kmeans(df, n_clusters=5):
    """Apply K-means clustering to the dataset."""
    # Extracting Latitude and Longitude for clustering
    coords = df[['Latitude', 'Longitude']]
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(coords)
    df['Cluster'] = kmeans.labels_
    return df, kmeans

def plot_clusters(df, kmeans, plot_size=(12, 10), alpha=0.3):
    """Plot the clustered data and centroids."""
    plt.figure(figsize=plot_size)
    sns.scatterplot(x='Longitude', y='Latitude', hue='Cluster', data=df, palette='viridis', alpha=alpha)
    # Centroids are stored as (Latitude, Longitude), so the columns are swapped for plotting
    plt.scatter(kmeans.cluster_centers_[:, 1], kmeans.cluster_centers_[:, 0], s=300, c='red', label='Centroids', marker='*')
    plt.title('Fire Incident Clusters')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.legend()
    plt.show()

def generate_heatmap(df, plot_size=(12, 10)):
    """Generate a heatmap based on the density of incidents."""
    plt.figure(figsize=plot_size)
    sns.kdeplot(x=df['Longitude'], y=df['Latitude'], hue=df['Cluster'], fill=True, levels=100, cmap="viridis", alpha=0.7)
    plt.title('Heatmap of Fire Incident Density by Cluster')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.grid(True)
    plt.show()

def analyze_clusters(df):
    """Prints out detailed information about each cluster."""
    for cluster_label in df['Cluster'].unique():
        cluster_data = df[df['Cluster'] == cluster_label]
        print(f"Cluster {cluster_label}:")
        print(f"Number of incidents: {cluster_data.shape[0]}")
        print(f"Centroid: (Latitude: {cluster_data['Latitude'].mean()}, Longitude: {cluster_data['Longitude'].mean()})")
        print(f"Bounding box: {cluster_data['Latitude'].min()} to {cluster_data['Latitude'].max()}, {cluster_data['Longitude'].min()} to {cluster_data['Longitude'].max()}")
        print()

if __name__ == "__main__":
filepath = 'C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv' # Update this to your file path
cols_to_use = ['Latitude', 'Longitude', 'Postcode_district', 'IncidentGroup', 'TimeOfCall']
df = load_and_preprocess_data(filepath, usecols=cols_to_use)
df, kmeans = apply_kmeans(df, n_clusters=5) # Adjust n_clusters based on your analysis needs
analyze_clusters(df)
plot_clusters(df, kmeans, plot_size=(12, 10), alpha=0.3)
generate_heatmap(df, plot_size=(12, 10))
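
The association-rule-mining step from the plan is not implemented in the script above. As a lightweight, hedged substitute, cluster membership can be cross-tabulated against incident type and hour of call; the helper below is a sketch that assumes the df returned by apply_kmeans (with its Cluster, IncidentGroup, and TimeOfCall columns) and could be called from the __main__ block.

def summarize_cluster_associations(df):
    """Cross-tabulate hotspot clusters against incident type and hour of call."""
    df = df.copy()
    df['HourOfCall'] = pd.to_datetime(df['TimeOfCall'], format='%H:%M:%S', errors='coerce').dt.hour
    # Share of each incident type within every cluster (rows sum to 1)
    print("Incident-type mix by cluster:")
    print(pd.crosstab(df['Cluster'], df['IncidentGroup'], normalize='index').round(3))
    # Share of incidents falling in each hour within every cluster
    print("Hour-of-call profile by cluster:")
    print(pd.crosstab(df['Cluster'], df['HourOfCall'], normalize='index').round(3))

# Example usage (after apply_kmeans): summarize_cluster_associations(df)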
Screen shots:
Heat Map
Impact of Incident Types on Resource Utilization:
•  Data Preprocessing: Organize and categorize incident types (IncidentGroup, StopCodeDescription).
•  Descriptive Statistics: Calculate the frequency and distribution of resources used (NumStationsWithPumpsAttending, NumPumpsAttending) by incident type.
•  Predictive Modeling: Develop regression models or decision trees to predict the level of resources needed based on incident characteristics (an example query of the fitted model follows the MAE output below).
•  Cost Analysis: Incorporate Notional Cost (£) to assess the financial impact of different types of incidents on resources.

Code:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
file_path = 'C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv' # Update this with your actual file path
data = pd.read_csv(file_path)
# Grouping data by 'IncidentGroup' and aggregating unique 'StopCodeDescription' values
grouped_incidents = data.groupby('IncidentGroup')['StopCodeDescription'].unique()
# Convert the series to a more readable DataFrame
categorized_incidents_df = grouped_incidents.reset_index()
categorized_incidents_df.columns = ['IncidentGroup', 'UniqueStopCodeDescriptions']
print(categorized_incidents_df)

# For NumStationsWithPumpsAttending
stations_freq = data.groupby('IncidentGroup')['NumStationsWithPumpsAttending'].value_counts().unstack(fill_value=0)
# For NumPumpsAttending
pumps_freq = data.groupby('IncidentGroup')['NumPumpsAttending'].value_counts().unstack(fill_value=0)
# For NumStationsWithPumpsAttending
stations_distribution = data.groupby('IncidentGroup')['NumStationsWithPumpsAttending'].describe()
# For NumPumpsAttending
pumps_distribution = data.groupby('IncidentGroup')['NumPumpsAttending'].describe()
# Plotting Frequency of Stations Attending
stations_freq.plot(kind='bar', figsize=(14, 7))
plt.title('Frequency of Stations Attending by Incident Type')
plt.xlabel('Incident Type')
plt.ylabel('Frequency')
plt.legend(title='Num Stations Attending')
plt.show()
# Plotting Frequency of Pumps Attending
pumps_freq.plot(kind='bar', figsize=(14, 7))
plt.title('Frequency of Pumps Attending by Incident Type')
plt.xlabel('Incident Type')
plt.ylabel('Frequency')
plt.legend(title='Num Pumps Attending')
plt.show()
# Box plot for NumStationsWithPumpsAttending
plt.figure(figsize=(10, 6))
data.boxplot(column='NumStationsWithPumpsAttending', by='IncidentGroup')
plt.title('Distribution of Stations Attending by Incident Type')
plt.suptitle('') # Suppress the automatic 'by' title generated by pandas
plt.xlabel('Incident Type')
plt.ylabel('Num Stations Attending')
plt.show()
# Box plot for NumPumpsAttending
plt.figure(figsize=(10, 6))
data.boxplot(column='NumPumpsAttending', by='IncidentGroup')
plt.title('Distribution of Pumps Attending by Incident Type')
plt.suptitle('') # Suppress the automatic 'by' title generated by pandas
plt.xlabel('Incident Type')
plt.ylabel('Num Pumps Attending')
plt.show()

# Feature Engineering: Extract hour from TimeOfCall (if not already done)
data['HourOfCall'] = pd.to_datetime(data['TimeOfCall'], format='%H:%M:%S').dt.hour

# Selecting features and target for the regression model
features = data[['IncidentGroup', 'HourOfCall', 'PropertyType']]
target = data['NumPumpsAttending']

# Encoding categorical variables
categorical_features = ['IncidentGroup', 'PropertyType']
one_hot = OneHotEncoder(handle_unknown='ignore')
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Creating the model pipeline
model = Pipeline([
    ('transformer', transformer),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

# Training the model
model.fit(X_train, y_train)

# Predicting and evaluating
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)

# Plot actual vs. predicted values for the regression model
plt.figure(figsize=(10, 6))
plt.scatter(y_test, predictions, color='blue', label='Predicted', alpha=0.5)
plt.plot(y_test, y_test, color='red', label='Perfect prediction')  # Line of perfect prediction
plt.title('Actual vs. Predicted Number of Pumps Attending')
plt.xlabel('Actual Number of Pumps Attending')
plt.ylabel('Predicted Number of Pumps Attending')
plt.legend()
plt.show()

print(f"Mean Absolute Error: {mae}")

# Cost Analysis: Assess the financial impact of different types of incidents
# Aggregating notional costs by incident type for overall financial impact
total_costs_by_incident = data.groupby('IncidentGroup')['Notional Cost (£)'].sum()

# Calculating average cost per incident within each incident group
average_costs_by_incident = data.groupby('IncidentGroup')['Notional Cost (£)'].mean()

# Plotting Total and Average Notional Cost by Incident Type
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# Total Costs
total_costs_by_incident.plot(kind='bar', ax=axes[0], color='skyblue')
axes[0].set_title('Total Notional Cost by Incident Type')
axes[0].set_ylabel('Total Notional Cost (£)')
axes[0].set_xlabel('Incident Group')

# Average Costs
average_costs_by_incident.plot(kind='bar', ax=axes[1], color='lightgreen')
axes[1].set_title('Average Notional Cost by Incident Type')
axes[1].set_ylabel('Average Notional Cost (£)')
axes[1].set_xlabel('Incident Group')

plt.tight_layout()
plt.show()
Screen shot:
Regression Model Output:
Efficiency of Incident Call Processing:
•  Data Preprocessing: Clean and prepare NumCalls data for analysis.
•  Feature Engineering: Create new features such as time to first response and the call-to-incident ratio.
•  Regression Analysis: Conduct regression analysis to determine the factors that affect call processing times (a feature-importance sketch is added after the Random Forest results below).
•  Pattern Recognition: Identify patterns in the data related to the volume of calls and processing efficiency.

Code:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
file_path = 'C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv'
data = pd.read_csv(file_path)

# Convert time columns to numeric, coercing errors to NaN
data['FirstPumpArriving_AttendanceTime'] = pd.to_numeric(data['FirstPumpArriving_AttendanceTime'], errors='coerce')
data['SecondPumpArriving_AttendanceTime'] = pd.to_numeric(data['SecondPumpArriving_AttendanceTime'], errors='coerce')

# Replace zeros to avoid division by zero errors in feature engineering
data['NumStationsWithPumpsAttending'].replace(0, 1, inplace=True)

# Feature Engineering
data['CallToIncidentRatio'] = data['NumCalls'] / data['NumStationsWithPumpsAttending']

# Identify numerical columns for median filling
numerical_cols = data.select_dtypes(include=['number']).columns.tolist()
# Replace missing values for numerical columns with the median
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].median())

features = ['CallToIncidentRatio', 'NumStationsWithPumpsAttending', 'NumCalls']
target = 'FirstPumpArriving_AttendanceTime'

# Splitting the data into training and testing sets
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize and train the Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions
rf_predictions = rf.predict(X_test)

# Evaluation
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

# Output Random Forest results
print("Random Forest Results:")
print(f"Mean Squared Error: {rf_mse}")
print(f"Mean Absolute Error (MAE): {rf_mae}")
print(f"R-squared (R2 Score): {rf_r2}")

# Visualization for RandomForest predictions with different colors for actual and predicted values
plt.figure(figsize=(10, 6))
# Since y_test is aligned with rf_predictions by index, we directly plot these
plt.scatter(y_test.index, y_test, color='red', label='Actual', alpha=0.6)
plt.scatter(y_test.index, rf_predictions, color='blue', label='Predicted_RF', alpha=0.6)
plt.xlabel('Index')
plt.ylabel('Time to First Response')
plt.title('Actual vs Predicted Values for RandomForest')
plt.legend()
# Initialize and train the XGBoost Regressor
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb.fit(X_train, y_train)

# Predictions
xgb_predictions = xgb.predict(X_test)

# Evaluation
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_r2 = r2_score(y_test, xgb_predictions)

# Output XGBoost results
print("XGBoost Results:")
print(f"Mean Squared Error: {xgb_mse}")
print(f"Mean Absolute Error (MAE): {xgb_mae}")
print(f"R-squared (R2 Score): {xgb_r2}")

# Visualization for XGBoost predictions with different colors for actual and predicted values
plt.figure(figsize=(10, 6))
# Plot actual values
sns.scatterplot(x=y_test.index, y=y_test, color='red', label='Actual', alpha=0.6)
# Plot predicted values
sns.scatterplot(x=y_test.index, y=xgb_predictions, color='green', label='Predicted_XGB', alpha=0.6)
plt.xlabel('Index')
plt.ylabel('Time to First Response')
plt.title('Actual vs Predicted Values for XGBoost')
plt.legend()
plt.show()

# Derive calendar features from the 'DateOfCall' datetime column
data['DateOfCall'] = pd.to_datetime(data['DateOfCall'])
data['DayOfWeek'] = data['DateOfCall'].dt.day_name()
data['HourOfDay'] = data['HourOfCall']

# Visualize call volume by hour of the day
plt.figure(figsize=(10, 6))
sns.countplot(x='HourOfCall', data=data)
plt.title('Call Volume by Hour of the Day')
plt.ylabel('Number of Calls')
plt.show()

# Visualizing call volume by day of the week
plt.figure(figsize=(10, 6))
sns.countplot(x='DayOfWeek', data=data, order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
plt.title('Call Volume by Day of the Week')
plt.ylabel('Number of Calls')
plt.xticks(rotation=45)
plt.show()

# Combine actual and predicted values into DataFrames
rf_results_df = pd.DataFrame({'Actual': y_test, 'Predicted_RF': rf_predictions})
xgb_results_df = pd.DataFrame({'Actual': y_test, 'Predicted_XGB': xgb_predictions})

# Print the first 50 actual vs predicted values for RandomForest
print("Random Forest Actual vs Predicted Values:")
print(rf_results_df.head(50))

# Print the first 50 actual vs predicted values for XGBoost
print("\nXGBoost Actual vs Predicted Values:")
print(xgb_results_df.head(50))
Screen Shots:
Incident Response Cost Analysis:
•  Data Preprocessing: Clean Notional Cost (£) and ensure all costs are correctly recorded and categorized.
•  Descriptive Analytics: Summarize costs by incident type, location, and other relevant factors.
•  Predictive Modeling: Develop models to forecast the potential costs of future incidents based on historical data.
•  Cost-Benefit Analysis: Perform a cost-benefit analysis of different response strategies to determine the most cost-effective approach (see the sketch after this list).
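
The code below does not implement the cost-benefit step. As a minimal, hedged sketch of what it could look like, assuming the cleaned CSV used throughout this document and treating first-pump attendance time as a rough proxy for benefit, costs and response times can be compared across deployment levels:

import pandas as pd

df = pd.read_csv('C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv')
df['Notional Cost (£)'] = pd.to_numeric(df['Notional Cost (£)'], errors='coerce')
df['FirstPumpArriving_AttendanceTime'] = pd.to_numeric(df['FirstPumpArriving_AttendanceTime'], errors='coerce')

# Average cost versus average first-pump response time for each pump-deployment level
cost_benefit = df.groupby('NumPumpsAttending').agg(
    incidents=('Notional Cost (£)', 'size'),
    avg_cost=('Notional Cost (£)', 'mean'),
    avg_response_s=('FirstPumpArriving_AttendanceTime', 'mean'),
)
print(cost_benefit)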

Code:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import numpy as np

# Load the dataset
file_path = 'C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv'
data = pd.read_csv(file_path)

# Inspect the "Notional Cost (£)" column for data type and missing values
print(data['Notional Cost (£)'].describe())
print(data['Notional Cost (£)'].isnull().sum())

# Convert "Notional Cost (£)" to a numeric format, assuming it's stored as a string with potential non-numeric
characters
# We'll remove any commas and currency symbols before conversion
data['Notional Cost (£)'] = data['Notional Cost (£)'].replace('[£,]', '', regex=True).astype(float)

# Handle missing values, if any. Options include dropping them, filling with a placeholder (like 0), or imputing
based on other values (e.g., mean or median)
# Here's how you might fill missing values with the median, as an example:
data['Notional Cost (£)'].fillna(data['Notional Cost (£)'].median(), inplace=True)

# After cleaning, you may want to check the data again to ensure it's in the right format and properly cleaned
print(data['Notional Cost (£)'].describe())

# Example for adapting the summary and visualization code
grouped_data = data.groupby(['IncidentGroup', 'IncGeo_WardName'])['Notional Cost (£)'].agg(['sum', 'mean', 'median', 'count'])
print(grouped_data)

# For visualization, choose the aggregation level and statistic most relevant to your analysis goals
average_cost_per_incident_type = data.groupby('IncidentGroup')['Notional Cost (£)'].mean().sort_values(ascending=False)

plt.figure(figsize=(10, 6))
average_cost_per_incident_type.plot(kind='bar')
plt.title('Average Cost by Incident Type')
plt.ylabel('Average Cost')
plt.xlabel('Incident Type')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Convert "Notional Cost (£)" to a numeric format and handle missing values
data['Notional Cost (£)'] = data['Notional Cost (£)'].replace('[£,]', '', regex=True).astype(float)
data['Notional Cost (£)'].fillna(data['Notional Cost (£)'].median(), inplace=True)

# Define the relevant columns and separate features and target variable
relevant_columns = [
'CalYear', 'HourOfCall', 'IncidentGroup', 'StopCodeDescription',
'PropertyCategory', 'PropertyType', 'Postcode_district',
'NumStationsWithPumpsAttending', 'NumPumpsAttending', 'PumpCount', 'PumpHoursRoundUp',
'Notional Cost (£)'
]
data_relevant = data[relevant_columns]

X = data_relevant.drop('Notional Cost (£)', axis=1)
y = data_relevant['Notional Cost (£)']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing for numerical and categorical features
numerical_features = ['CalYear', 'HourOfCall', 'NumStationsWithPumpsAttending', 'NumPumpsAttending', 'PumpCount', 'PumpHoursRoundUp']
categorical_features = ['IncidentGroup', 'StopCodeDescription', 'PropertyCategory', 'PropertyType', 'Postcode_district']

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])

# Create a pipeline with GradientBoostingRegressor
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(random_state=42))
])

# Fit the model
model_pipeline.fit(X_train, y_train)

# Predictions and evaluation
train_predictions = model_pipeline.predict(X_train)
test_predictions = model_pipeline.predict(X_test)

train_mae = mean_absolute_error(y_train, train_predictions)
test_mae = mean_absolute_error(y_test, test_predictions)
train_mse = mean_squared_error(y_train, train_predictions)
test_mse = mean_squared_error(y_test, test_predictions)

print(f"Train MAE: {train_mae}")


print(f"Test MAE: {test_mae}")
print(f"Train MSE: {train_mse}")
print(f"Test MSE: {test_mse}")

# Assuming test_predictions and y_test are already defined as your model's predictions and the actual values, respectively
plt.figure(figsize=(10, 6))

# Scatter plot of actual vs. predicted values
plt.scatter(y_test, test_predictions, alpha=0.5)

# Line of best fit
m, b = np.polyfit(y_test, test_predictions, 1)
plt.plot(y_test, m * y_test + b, color="red")  # Adds a line of best fit

plt.xlabel('Actual Notional Cost (£)')
plt.ylabel('Predicted Notional Cost (£)')
plt.title('Actual vs. Predicted Notional Cost')
plt.show()

Screen shots:
