Report
• Incident Patterns: Analyze patterns and trends in incidents over time, by type, or by
location.
• Response Times: Study the average response times and their impact on incident outcomes.
• Cost Analysis: Assess the financial impact of incidents by category and type to inform
budgetary and resource allocation decisions.
Data Analysis
import pandas as pd
import matplotlib.pyplot as plt
# Incident patterns by weekday and month, followed by a missing-value summary.
# Expects `data` to be a DataFrame that already holds the incident records.

# Convert 'DateOfCall' to datetime and extract day and month for further analysis
data['DateOfCall'] = pd.to_datetime(data['DateOfCall'])
data['DayOfWeek'] = data['DateOfCall'].dt.day_name()
data['Month'] = data['DateOfCall'].dt.month_name()

# value_counts() orders by frequency; reindex into calendar order so the bars
# read Monday..Sunday. fill_value=0 keeps a bar for weekdays with no incidents.
weekday_order = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]
weekly_counts = data['DayOfWeek'].value_counts().reindex(weekday_order, fill_value=0)
weekly_counts.plot(kind='bar')
plt.xlabel('Day of Week')
plt.ylabel('Number of Incidents')
plt.show()

monthly_counts = data['Month'].value_counts()
monthly_counts.plot(kind='bar')
plt.xlabel('Month')
plt.ylabel('Number of Incidents')
plt.show()

# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and does not update
# `data` once pandas Copy-on-Write is active.
data['Latitude'] = data['Latitude'].fillna(data['Latitude'].median())
data['Longitude'] = data['Longitude'].fillna(data['Longitude'].median())

# Per-column count and percentage of missing cells after the coordinate fill.
missing_values = data.isnull().sum()
missing_percentage = data.isnull().mean() * 100
print(missing_values)
print(missing_percentage)
Data Cleaning and Data Filling:
import pandas as pd
def fill_postcode_from_district(df):
    """Fill missing ``Postcode_full`` values from the row's ``Postcode_district``.

    For every district, the first row that has a full postcode supplies the
    replacement value. ``df`` is modified in place; nothing is returned.
    Rows whose district has no known full postcode stay missing.
    """
    # One representative full postcode per district, taken from populated rows.
    district_to_postcode = (
        df[~df['Postcode_full'].isna()]
        .drop_duplicates('Postcode_district')
        .set_index('Postcode_district')['Postcode_full']
    )
    # NOTE(review): the original listing built the mapping but the step that
    # applies it was missing; this fill follows the function name — confirm.
    df['Postcode_full'] = df['Postcode_full'].fillna(
        df['Postcode_district'].map(district_to_postcode)
    )
def fill_lat_lon_from_postcode_efficiently(df):
    """Fill missing ``Latitude``/``Longitude`` from rows sharing ``Postcode_full``.

    The first row per postcode that has a postcode, a latitude and a longitude
    is used as the reference. ``df`` is modified in place; nothing is returned.
    Rows whose postcode has no reference row keep their missing coordinates.
    """
    # Create a mapping from Postcode_full to Latitude and Longitude for rows
    # where they are not NaN.
    complete_rows = df.dropna(subset=['Latitude', 'Longitude', 'Postcode_full'])
    postcode_to_lat_lon = complete_rows[
        ['Postcode_full', 'Latitude', 'Longitude']
    ].drop_duplicates('Postcode_full')

    # Index by postcode once and reuse the lookup for both coordinate columns.
    coordinates_by_postcode = postcode_to_lat_lon.set_index('Postcode_full')

    df['Latitude'] = df['Latitude'].fillna(
        df['Postcode_full'].map(coordinates_by_postcode['Latitude'])
    )
    df['Longitude'] = df['Longitude'].fillna(
        df['Postcode_full'].map(coordinates_by_postcode['Longitude'])
    )
def fill_postcode_from_district_efficiently(df):
    """Fill missing ``Postcode_full`` values via a district-to-postcode dict.

    The first row per ``Postcode_district`` that has a full postcode supplies
    the replacement. ``df`` is modified in place; nothing is returned.
    Districts without any known full postcode are left missing.
    """
    # Build the lookup only from rows that actually carry a full postcode.
    postcode_map = (
        df.dropna(subset=['Postcode_full'])
        .drop_duplicates('Postcode_district')
        .set_index('Postcode_district')['Postcode_full']
        .to_dict()
    )
    df['Postcode_full'] = df['Postcode_full'].fillna(
        df['Postcode_district'].map(postcode_map)
    )
def fill_incgeo_wardcode_from_propercase(df):
    """Fill missing ``IncGeo_WardCode`` values from the row's ``ProperCase``.

    The first row per ``ProperCase`` value that has a ward code supplies the
    replacement. ``df`` is modified in place; nothing is returned.
    Rows whose ``ProperCase`` value has no known ward code stay missing.
    """
    # Lookup built only from rows where the ward code is present.
    propercase_to_wardcode = (
        df.dropna(subset=['IncGeo_WardCode'])
        .drop_duplicates('ProperCase')
        .set_index('ProperCase')['IncGeo_WardCode']
        .to_dict()
    )
    # NOTE(review): the original listing built the mapping but the step that
    # applies it was missing; this fill mirrors the postcode helper — confirm.
    df['IncGeo_WardCode'] = df['IncGeo_WardCode'].fillna(
        df['ProperCase'].map(propercase_to_wardcode)
    )
def fill_easting_from_uprn(df):
# Create a mapping from UPRN to Easting_m for rows where they are not zero
# Main code
# Load the raw CSV, report per-column missing counts, then run the fill helpers.
# NOTE(review): `file_path` is not defined anywhere in this listing — it must be set before this line.
df = pd.read_csv(file_path, encoding='latin1')
# Number of missing cells in each column before any filling.
empty_cells_per_column = df.isnull().sum()
print(empty_cells_per_column)
# NOTE(review): `special_service_df` is not defined in this listing and `nan_count` is not used afterwards here — confirm where both come from.
nan_count = special_service_df['SpecialServiceType'].isna().sum()
# Each helper is called for its effect on `df`; return values are ignored.
fill_postcode_from_district(df)
fill_postcode_from_district_efficiently(df)
fill_incgeo_wardcode_from_propercase(df)
# Runs after the postcode fills, so it sees the postcodes they populated.
fill_lat_lon_from_postcode_efficiently(df)
# Replace the remaining missing values with explicit defaults.
# The filled column is assigned back rather than using fillna(inplace=True) on
# a column selection: that chained in-place form is deprecated and does not
# update `df` once pandas Copy-on-Write is active.

# Columns whose missing values become the placeholder label 'Unknown'.
# NOTE(review): the two *_AttendanceTime columns look numeric by name; filling
# them with a string would turn them into object columns — confirm intended.
unknown_fill_columns = (
    'IncGeo_WardCode',
    'IncGeo_WardName',
    'IncGeo_WardNameNew',
    'IncidentStationGround',
    'FirstPumpArriving_AttendanceTime',
    'FirstPumpArriving_DeployedFromStation',
    'SecondPumpArriving_AttendanceTime',
    'SecondPumpArriving_DeployedFromStation',
)
for column in unknown_fill_columns:
    df[column] = df[column].fillna('Unknown')

# Count-like columns whose missing values become 0.
zero_fill_columns = (
    'NumCalls',
    'NumPumpsAttending',
    'PumpCount',
    'PumpHoursRoundUp',
    'NumStationsWithPumpsAttending',
)
for column in zero_fill_columns:
    df[column] = df[column].fillna(0)
# Fit one linear regression for Easting_m (from Longitude) and one for Northing_m, then re-count missing cells.
# NOTE(review): `df_model`, `df_model_northing`, `X_northing` and the `LinearRegression` import are not defined in this listing — confirm how they are built.
# NOTE(review): no predict/assignment step is visible here, so the fitted models do not change `df` in this listing.
X = df_model[['Longitude']]
y = df_model['Easting_m']
model = LinearRegression()
model.fit(X, y)
# Boolean mask of rows in `df` that lack an Easting_m value (not used further in this listing).
missing_easting_index = df['Easting_m'].isnull()
missing_easting_final = df['Easting_m'].isnull().sum()
# Bare expression: has no effect in a script (presumably notebook cell output — confirm).
missing_easting_final
y_northing = df_model_northing['Northing_m']
model_northing = LinearRegression()
model_northing.fit(X_northing, y_northing)
# Boolean mask of rows in `df` that lack a Northing_m value (not used further in this listing).
missing_northing_index = df['Northing_m'].isnull()
missing_northing_final = df['Northing_m'].isnull().sum()
# Bare expression: has no effect in a script (presumably notebook cell output — confirm).
missing_northing_final
# Per-column missing counts after all fill steps.
empty_cells_after_fill = df.isnull().sum()
print(empty_cells_after_fill)
After Cleaning:
Business Problems and Model analysis:
# Aggregate incidents per calendar month and prepare features for a count model.
data['DateOfCall'] = pd.to_datetime(data['DateOfCall'])
# Monthly period (year + month) of each call, used as the grouping key.
data['MonthYear'] = data['DateOfCall'].dt.to_period('M')
# One row per month with the number of incidents in that month.
monthly_data = data.groupby('MonthYear').size().reset_index(name='IncidentCounts')
# Prepare features
monthly_data['Month'] = monthly_data['MonthYear'].dt.month
monthly_data['Year'] = monthly_data['MonthYear'].dt.year
monthly_data['MonthYearStr'] = monthly_data['MonthYear'].astype(str)
y = monthly_data['IncidentCounts'] # Target
# Split data into train and test sets
# NOTE(review): the feature matrix, the train/test split and the model fit are not present in this listing.
plt.figure(figsize=(10, 5))
# NOTE(review): no plotting call is visible for this figure, so legend() has no labelled artists here.
plt.ylabel('Number of Incidents')
plt.legend()
plt.show()
# NOTE(review): `X_test` is not defined in this listing — presumably produced by the missing split; confirm.
X_test['MonthYearStr'] = pd.to_datetime(X_test['MonthYearStr'])
# Create a DataFrame for actual and predicted values
# NOTE(review): that DataFrame and the plot drawn from it are not present in this listing.
plt.figure(figsize=(12, 8))
plt.xlabel('Actual Values')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.show()
Response Times: Study the average response times and
their impact on incident outcomes.
import pandas as pd
import numpy as np
# Load data
# NOTE(review): no load statement is present in this listing; `data` and its 'ResponseTime' column must already exist.
# Response time in minutes; the .dt.total_seconds() call requires 'ResponseTime' to be a timedelta column.
data['ResponseTimeMinutes'] = data['ResponseTime'].dt.total_seconds() / 60
# NOTE(review): `StandardScaler` is not imported in this listing — confirm the import.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data[['ResponseTimeMinutes']])
# K-means Clustering
# NOTE(review): `kmeans` is not constructed in this listing — confirm its parameters.
data['Cluster'] = kmeans.fit_predict(data_scaled)
# Visualize Clusters
plt.figure(figsize=(10, 6))
# NOTE(review): `sns` is not imported in this listing — confirm the import.
sns.scatterplot(data=data, x='DateOfCall', y='ResponseTimeMinutes', hue='Cluster',
palette='viridis')
plt.show()
# NOTE(review): no plotting call is visible for this figure.
plt.figure(figsize=(8, 4))
plt.show()
# ARIMA Forecasting
data['MonthYear'] = data['DateOfCall'].dt.to_period('M')
# Mean response time (minutes) per calendar month.
monthly_response = data.groupby('MonthYear')['ResponseTimeMinutes'].mean()
# NOTE(review): `model` is not constructed in this listing — presumably an ARIMA model on `monthly_response`; confirm.
results = model.fit()
# Forecast six steps ahead and take the confidence interval of that forecast.
forecast = results.get_forecast(steps=6)
conf_int = forecast.conf_int()
# Plot Forecast
# NOTE(review): no plotting calls are visible for the figures below, so legend() has no labelled artists here.
plt.figure(figsize=(12, 6))
plt.legend()
plt.show()
plt.figure(figsize=(8, 4))
plt.show()
# Note: Change file paths and tweak parameters as necessary for your specific setup.
Cost Analysis: Assess the financial impact of incidents by
category and type to inform budgetary and resource
allocation decisions.
import pandas as pd
import numpy as np
# Encode categorical columns, fit a model on `features` -> `target`, predict, and plot.
# Names of all object-dtype columns in `data`.
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
# Intended to hold one encoder per column, keyed by column name.
label_encoders = {}
# NOTE(review): `LabelEncoder` is not imported and `col` is not defined in this listing — presumably these two lines sat inside a loop over `categorical_cols`; confirm.
le = LabelEncoder()
label_encoders[col] = le
# NOTE(review): `features`, `target`, `model`, `X_train`, `y_train` and `X_test` are not defined in this listing.
X = data[features]
y = data[target]
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Assuming future_data only needs the features actually used in the model:
future_data = pd.DataFrame({
'IncidentGroup': [1, 2, 3], # Ensure these values match the encoded training data categories
})
# Only apply label encoders to the columns used in the model and present in future_data
# NOTE(review): the loop that defines `col` and applies `le` is not present in this listing.
le = label_encoders[col]
# Handle previously unseen labels by replacing them with a default value (e.g., -1)
# NOTE(review): the replacement step described above is not present in this listing.
future_predictions = model.predict(future_data)
# NOTE(review): no plotting call is visible for this figure, so legend() has no labelled artists here.
plt.figure(figsize=(10, 6))
plt.legend()
plt.show()
# Heatmap of feature importance
# Importances from the fitted model, labelled by feature name and sorted from largest to smallest.
feature_importance = pd.Series(model.feature_importances_,
index=features).sort_values(ascending=False)
# NOTE(review): no plotting calls are visible for the figures below.
plt.figure(figsize=(10, 6))
plt.show()
plt.figure(figsize=(10, 6))
plt.show()
plt.figure(figsize=(10, 6))