Assignment 1: Code and Screenshots
Code:
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
from sklearn.linear_model import LinearRegression
def fill_postcode_from_district(df):
    # Identify rows where Postcode_full is NaN and Postcode_district is not NaN
    mask = df['Postcode_full'].isna() & df['Postcode_district'].notna()
    # Row-by-row lookup of a known full postcode in the same district (slower than the vectorised version below)
    for idx in df[mask].index:
        candidates = df.loc[df['Postcode_district'] == df.at[idx, 'Postcode_district'], 'Postcode_full'].dropna()
        if not candidates.empty:
            df.at[idx, 'Postcode_full'] = candidates.iloc[0]
    return df

def fill_lat_lon_from_postcode_efficiently(df):
    # Create a mapping from Postcode_full to Latitude and Longitude for rows where they are not NaN
    postcode_to_lat_lon = df.dropna(subset=['Latitude', 'Longitude', 'Postcode_full'])[
        ['Postcode_full', 'Latitude', 'Longitude']].drop_duplicates('Postcode_full').set_index('Postcode_full')
    # Fill missing coordinates by looking up the incident's full postcode
    missing = df['Latitude'].isna() | df['Longitude'].isna()
    df.loc[missing, 'Latitude'] = df.loc[missing, 'Postcode_full'].map(postcode_to_lat_lon['Latitude'])
    df.loc[missing, 'Longitude'] = df.loc[missing, 'Postcode_full'].map(postcode_to_lat_lon['Longitude'])
    return df

def fill_postcode_from_district_efficiently(df):
    # Create a mapping from Postcode_district to Postcode_full for non-NaN values
    postcode_map = df.dropna(subset=['Postcode_full']).drop_duplicates('Postcode_district').set_index(
        'Postcode_district')['Postcode_full'].to_dict()
    # Fill missing full postcodes from the district-level mapping
    missing = df['Postcode_full'].isna()
    df.loc[missing, 'Postcode_full'] = df.loc[missing, 'Postcode_district'].map(postcode_map)
    return df

def fill_incgeo_wardcode_from_propercase(df):
    # Create a mapping from ProperCase to IncGeo_WardCode for rows where IncGeo_WardCode is not NaN
    propercase_to_wardcode = df.dropna(subset=['IncGeo_WardCode']).drop_duplicates('ProperCase').set_index(
        'ProperCase')['IncGeo_WardCode'].to_dict()
    # Fill missing ward codes from the borough (ProperCase) mapping
    missing = df['IncGeo_WardCode'].isna()
    df.loc[missing, 'IncGeo_WardCode'] = df.loc[missing, 'ProperCase'].map(propercase_to_wardcode)
    return df

def fill_easting_from_uprn(df):
    # Create a mapping from UPRN to Easting_m for rows where they are not zero
    uprn_to_easting = df[df['UPRN'] != 0][['UPRN', 'Easting_m']].drop_duplicates('UPRN').set_index('UPRN')['Easting_m']
    # Fill missing eastings by looking up the property reference number
    missing = df['Easting_m'].isna() & (df['UPRN'] != 0)
    df.loc[missing, 'Easting_m'] = df.loc[missing, 'UPRN'].map(uprn_to_easting)
    return df

# Main code
file_path = 'C:\\Users\\PMLS\\Desktop\\LFB_2019-22.csv'
df = pd.read_csv(file_path, encoding='latin1')
# Apply the fill helpers to the raw data
df = fill_postcode_from_district_efficiently(df)
df = fill_lat_lon_from_postcode_efficiently(df)
df = fill_incgeo_wardcode_from_propercase(df)
df = fill_easting_from_uprn(df)
missing_northing_final = df['Northing_m'].isna().sum()  # remaining missing Northing values after cleaning
df.to_csv('C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv', index=False)
Screen Shots:
Before Cleaning:
After Cleaning:
Data Behavior:
Code:
import pandas as pd
import matplotlib.pyplot as plt
file_path = 'C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv'
df = pd.read_csv(file_path, encoding='latin1')
# Compute the monthly average first pump attendance time
# (assumes the attendance time column is 'FirstPumpArriving_AttendanceTime', measured in seconds)
df['DateOfCall'] = pd.to_datetime(df['DateOfCall'], dayfirst=True, errors='coerce')
average_response_time_monthly = df.set_index('DateOfCall')['FirstPumpArriving_AttendanceTime'].resample('M').mean()

# Plotting the trend of average first pump arriving attendance time over time
plt.figure(figsize=(14, 7))
average_response_time_monthly.plot(
    title='Monthly Trend of First Pump Arriving Attendance Time (2019-2022)', color='orange')
plt.xlabel('Month')
plt.ylabel('Average Attendance Time (Seconds)')
plt.grid(True)
plt.show()
Screen shots:
Spatial Analysis for Fire Incident Hotspots:
Data Preprocessing:
Validate and clean the geographical data (Latitude, Longitude, Postcode_district).
Feature Engineering:
Extract additional spatial features if available, or create spatial bins to categorize incidents into specific areas.
Clustering:
Apply spatial clustering techniques such as DBSCAN or k-means to identify hotspots (k-means is used in the code below).
Heat Maps:
Generate heat maps using GIS tools to visually represent the concentration of incidents.
Association Rule Mining:
Explore associations between incident hotspots and other factors, such as the time or type of incident (a hedged sketch follows the heat map output below).
Code:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
def analyze_clusters(df):
    """Prints out detailed information about each cluster."""
    for cluster_label in df['Cluster'].unique():
        cluster_data = df[df['Cluster'] == cluster_label]
        print(f"Cluster {cluster_label}:")
        print(f"Number of incidents: {cluster_data.shape[0]}")
        print(f"Centroid: (Latitude: {cluster_data['Latitude'].mean()}, "
              f"Longitude: {cluster_data['Longitude'].mean()})")
        print(f"Bounding box: {cluster_data['Latitude'].min()} to {cluster_data['Latitude'].max()}, "
              f"{cluster_data['Longitude'].min()} to {cluster_data['Longitude'].max()}")
        print()

if __name__ == "__main__":
    filepath = 'C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv'  # Update this to your file path
    cols_to_use = ['Latitude', 'Longitude', 'Postcode_district', 'IncidentGroup', 'TimeOfCall']
    df = load_and_preprocess_data(filepath, usecols=cols_to_use)
    df, kmeans = apply_kmeans(df, n_clusters=5)  # Adjust n_clusters based on your analysis needs
    analyze_clusters(df)
    plot_clusters(df, kmeans, plot_size=(12, 10), alpha=0.3)
    generate_heatmap(df, plot_size=(12, 10))
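Note: the helper functions called in the main block (load_and_preprocess_data, apply_kmeans, plot_clusters, generate_heatmap) are not shown above. The following is a minimal sketch of one possible implementation, assuming clustering on raw Latitude/Longitude and a seaborn kernel-density heat map; it is illustrative rather than the exact code behind the screenshots.
Code (sketch):
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

def load_and_preprocess_data(filepath, usecols=None):
    # Load only the needed columns and drop rows without valid coordinates
    df = pd.read_csv(filepath, usecols=usecols, encoding='latin1')
    return df.dropna(subset=['Latitude', 'Longitude'])

def apply_kmeans(df, n_clusters=5):
    # Cluster incidents on their coordinates and attach the labels to the frame
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    df = df.copy()
    df['Cluster'] = kmeans.fit_predict(df[['Longitude', 'Latitude']])
    return df, kmeans

def plot_clusters(df, kmeans, plot_size=(12, 10), alpha=0.3):
    # Scatter the incidents coloured by cluster, with centroids marked
    plt.figure(figsize=plot_size)
    plt.scatter(df['Longitude'], df['Latitude'], c=df['Cluster'], cmap='tab10', s=5, alpha=alpha)
    centers = kmeans.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], c='black', marker='x', s=100, label='Centroids')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title('K-Means Clusters of Fire Incidents')
    plt.legend()
    plt.show()

def generate_heatmap(df, plot_size=(12, 10)):
    # Kernel-density heat map of incident locations
    plt.figure(figsize=plot_size)
    sns.kdeplot(x=df['Longitude'], y=df['Latitude'], cmap='Reds', fill=True, thresh=0.05)
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title('Heat Map of Fire Incident Density')
    plt.show()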
Screen shots:
Heat Map
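The association rule mining step listed in the plan is not implemented above. A minimal sketch using mlxtend's apriori and association_rules on the clustered incidents is given below; the HourBand binning of TimeOfCall and the support/lift thresholds are illustrative assumptions.
Code (sketch):
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Assumes df already carries the k-means 'Cluster' label plus IncidentGroup and TimeOfCall
df['HourBand'] = pd.cut(pd.to_datetime(df['TimeOfCall'], format='%H:%M:%S', errors='coerce').dt.hour,
                        bins=[0, 6, 12, 18, 24], labels=['Night', 'Morning', 'Afternoon', 'Evening'],
                        include_lowest=True)

# One-hot encode the items of interest into a boolean "transaction" table
basket = pd.get_dummies(df[['Cluster', 'IncidentGroup', 'HourBand']].astype(str)).astype(bool)

# Mine frequent itemsets and derive rules; thresholds here are illustrative
frequent_itemsets = apriori(basket, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1.2)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]
      .sort_values('lift', ascending=False).head(10))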
Impact of Incident Types on Resource Utilization:
Data Preprocessing:
Organize and categorize incident types (IncidentGroup, StopCodeDescription).
Descriptive Statistics:
Calculate the frequency and distribution of resources used (NumStationsWithPumpsAttending, NumPumpsAttending) by incident type.
Predictive Modeling:
Develop regression models or decision trees to predict the level of resources needed based on incident characteristics (a hedged decision-tree sketch follows the code below).
Cost Analysis:
Incorporate Notional Cost (£) to assess the financial impact of different types of incidents on resources.
Code:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
file_path = 'C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv' # Update this with your actual file path
data = pd.read_csv(file_path)
# Grouping data by 'IncidentGroup' and aggregating unique 'StopCodeDescription' values
grouped_incidents = data.groupby('IncidentGroup')['StopCodeDescription'].unique()
# Convert the series to a more readable DataFrame
categorized_incidents_df = grouped_incidents.reset_index()
categorized_incidents_df.columns = ['IncidentGroup', 'UniqueStopCodeDescriptions']
print(categorized_incidents_df)
# For NumStationsWithPumpsAttending
stations_freq = data.groupby('IncidentGroup')['NumStationsWithPumpsAttending'].value_counts().unstack(fill_value=0)
# For NumPumpsAttending
pumps_freq = data.groupby('IncidentGroup')['NumPumpsAttending'].value_counts().unstack(fill_value=0)
# For NumStationsWithPumpsAttending
stations_distribution = data.groupby('IncidentGroup')['NumStationsWithPumpsAttending'].describe()
# For NumPumpsAttending
pumps_distribution = data.groupby('IncidentGroup')['NumPumpsAttending'].describe()
# Plotting Frequency of Stations Attending
stations_freq.plot(kind='bar', figsize=(14, 7))
plt.title('Frequency of Stations Attending by Incident Type')
plt.xlabel('Incident Type')
plt.ylabel('Frequency')
plt.legend(title='Num Stations Attending')
plt.show()
# Plotting Frequency of Pumps Attending
pumps_freq.plot(kind='bar', figsize=(14, 7))
plt.title('Frequency of Pumps Attending by Incident Type')
plt.xlabel('Incident Type')
plt.ylabel('Frequency')
plt.legend(title='Num Pumps Attending')
plt.show()
# Box plot for NumStationsWithPumpsAttending
plt.figure(figsize=(10, 6))
data.boxplot(column='NumStationsWithPumpsAttending', by='IncidentGroup')
plt.title('Distribution of Stations Attending by Incident Type')
plt.suptitle('') # Suppress the automatic 'by' title generated by pandas
plt.xlabel('Incident Type')
plt.ylabel('Num Stations Attending')
plt.show()
# Box plot for NumPumpsAttending
plt.figure(figsize=(10, 6))
data.boxplot(column='NumPumpsAttending', by='IncidentGroup')
plt.title('Distribution of Pumps Attending by Incident Type')
plt.suptitle('') # Suppress the automatic 'by' title generated by pandas
plt.xlabel('Incident Type')
plt.ylabel('Num Pumps Attending')
plt.show()
# Feature Engineering: Extract hour from TimeOfCall (if not already done)
data['HourOfCall'] = pd.to_datetime(data['TimeOfCall'], format='%H:%M:%S').dt.hour
# Cost Analysis: total and average Notional Cost (£) by incident type
data['Notional Cost (£)'] = pd.to_numeric(data['Notional Cost (£)'], errors='coerce')
total_costs_by_incident = data.groupby('IncidentGroup')['Notional Cost (£)'].sum()
average_costs_by_incident = data.groupby('IncidentGroup')['Notional Cost (£)'].mean()
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Total Costs
total_costs_by_incident.plot(kind='bar', ax=axes[0], color='skyblue')
axes[0].set_title('Total Notional Cost by Incident Type')
axes[0].set_ylabel('Total Notional Cost (£)')
axes[0].set_xlabel('Incident Group')

# Average Costs
average_costs_by_incident.plot(kind='bar', ax=axes[1], color='lightgreen')
axes[1].set_title('Average Notional Cost by Incident Type')
axes[1].set_ylabel('Average Notional Cost (£)')
axes[1].set_xlabel('Incident Group')

plt.tight_layout()
plt.show()
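The predictive modelling step (and the DecisionTreeRegressor imported above) does not appear in the listing. Below is a minimal sketch of one way to train such a model on incident characteristics; the feature list, the choice of NumPumpsAttending as the target, and the train/test split are assumptions rather than the exact setup behind the regression output shown below.
Code (sketch):
# Illustrative decision-tree regression: predict pumps attending from incident characteristics
features = ['IncidentGroup', 'StopCodeDescription', 'PropertyCategory', 'HourOfCall']
target = 'NumPumpsAttending'
model_data = data[features + [target]].dropna()

X_train, X_test, y_train, y_test = train_test_split(
    model_data[features], model_data[target], test_size=0.2, random_state=42)

# One-hot encode the categorical features and pass HourOfCall through unchanged
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(handle_unknown='ignore'),
      ['IncidentGroup', 'StopCodeDescription', 'PropertyCategory'])],
    remainder='passthrough')

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(max_depth=6, random_state=42))
])
model.fit(X_train, y_train)
print('MAE:', mean_absolute_error(y_test, model.predict(X_test)))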
Screen shot:
Regression Model Output:
Efficiency of Incident Call Processing:
Data Preprocessing:
Clean and prepare NumCalls data for analysis.
Feature Engineering:
Create new features such as time to first response, call-to-incident ratio, etc.
Regression Analysis:
Conduct regression analysis to determine the factors that affect call processing times.
Pattern Recognition:
Identify patterns in the data related to the volume of calls and processing efficiency (a hedged sketch follows the code below).
Code:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import seaborn as sns
import matplotlib.pyplot as plt
# Load the cleaned dataset
data = pd.read_csv('C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv')

# Feature Engineering
data['CallToIncidentRatio'] = (data['NumCalls'] / data['NumStationsWithPumpsAttending']).replace(
    [float('inf'), -float('inf')], float('nan'))
# Target: time to first response (assumes the attendance time column is 'FirstPumpArriving_AttendanceTime', in seconds)
data['TimeToFirstResponse'] = data['FirstPumpArriving_AttendanceTime']

# Prepare features and target, then split into training and test sets
features = ['NumCalls', 'CallToIncidentRatio', 'NumStationsWithPumpsAttending', 'NumPumpsAttending', 'HourOfCall']
model_data = data[features + ['TimeToFirstResponse']].dropna()
X_train, X_test, y_train, y_test = train_test_split(
    model_data[features], model_data['TimeToFirstResponse'], test_size=0.2, random_state=42)

# Initialize and train the Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions
rf_predictions = rf.predict(X_test)
# Evaluation
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)
# Visualization for RandomForest predictions with different colors for actual and predicted values
plt.figure(figsize=(10, 6))
# Since y_test is aligned with rf_predictions by index, we directly plot these
plt.scatter(y_test.index, y_test, color='red', label='Actual', alpha=0.6)
plt.scatter(y_test.index, rf_predictions, color='blue', label='Predicted_RF', alpha=0.6)
plt.xlabel('Index')
plt.ylabel('Time to First Response')
plt.title('Actual vs Predicted Values for RandomForest')
plt.legend()
# Initialize and train the XGBoost Regressor
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb.fit(X_train, y_train)
# Predictions
xgb_predictions = xgb.predict(X_test)
# Evaluation
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_r2 = r2_score(y_test, xgb_predictions)
# Visualization for XGBoost predictions with different colors for actual and predicted values
plt.figure(figsize=(10, 6))
# Plot actual values
sns.scatterplot(x=y_test.index, y=y_test, color='red', label='Actual', alpha=0.6)
# Plot predicted values
sns.scatterplot(x=y_test.index, y=xgb_predictions, color='green', label='Predicted_XGB', alpha=0.6)
plt.xlabel('Index')
plt.ylabel('Time to First Response')
plt.title('Actual vs Predicted Values for XGBoost')
plt.legend()
plt.show()
# Assuming the dataset contains a datetime column 'DateOfCall' that represents when the call was made
data['DateOfCall'] = pd.to_datetime(data['DateOfCall'])
data['DayOfWeek'] = data['DateOfCall'].dt.day_name()
data['HourOfDay'] = data['HourOfCall']
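The pattern recognition step from the plan is not otherwise shown; the DayOfWeek and HourOfDay features created above can be used to look at call volume and processing patterns. A minimal sketch follows; using incident counts and NumCalls as the volume measures is an assumption, not the author's exact analysis.
Code (sketch):
# Call-volume patterns by hour of day and day of week
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Incident volume by hour of day
data.groupby('HourOfDay').size().plot(kind='bar', ax=axes[0], color='steelblue')
axes[0].set_title('Incidents by Hour of Day')
axes[0].set_xlabel('Hour of Day')
axes[0].set_ylabel('Number of Incidents')

# Average number of calls per incident by day of week
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
data.groupby('DayOfWeek')['NumCalls'].mean().reindex(day_order).plot(kind='bar', ax=axes[1], color='indianred')
axes[1].set_title('Average Calls per Incident by Day of Week')
axes[1].set_xlabel('Day of Week')
axes[1].set_ylabel('Average NumCalls')

plt.tight_layout()
plt.show()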
Code:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import numpy as np
# Load the cleaned dataset
data = pd.read_csv('C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv')

# Inspect the "Notional Cost (£)" column for data type and missing values
print(data['Notional Cost (£)'].describe())
print(data['Notional Cost (£)'].isnull().sum())
# Convert "Notional Cost (£)" to a numeric format, assuming it's stored as a string with potential non-numeric
characters
# We'll remove any commas and currency symbols before conversion
data['Notional Cost (£)'] = data['Notional Cost (£)'].replace('[£,]', '', regex=True).astype(float)
# Handle missing values, if any. Options include dropping them, filling with a placeholder (like 0),
# or imputing based on other values (e.g., mean or median)
# Here's how you might fill missing values with the median, as an example:
data['Notional Cost (£)'].fillna(data['Notional Cost (£)'].median(), inplace=True)
# After cleaning, you may want to check the data again to ensure it's in the right format and properly cleaned
print(data['Notional Cost (£)'].describe())
# For visualization, choose the aggregation level and statistic most relevant to your analysis goals
average_cost_per_incident_type = data.groupby('IncidentGroup')['Notional Cost (£)'].mean().sort_values(ascending=False)
plt.figure(figsize=(10, 6))
average_cost_per_incident_type.plot(kind='bar')
plt.title('Average Cost by Incident Type')
plt.ylabel('Average Cost')
plt.xlabel('Incident Type')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
## Convert "Notional Cost (£)" to a numeric format and handle missing values
data['Notional Cost (£)'] = data['Notional Cost (£)'].replace('[£,]', '', regex=True).astype(float)
data['Notional Cost (£)'].fillna(data['Notional Cost (£)'].median(), inplace=True)
# Define the relevant columns and separate features and target variable
relevant_columns = [
'CalYear', 'HourOfCall', 'IncidentGroup', 'StopCodeDescription',
'PropertyCategory', 'PropertyType', 'Postcode_district',
'NumStationsWithPumpsAttending', 'NumPumpsAttending', 'PumpCount', 'PumpHoursRoundUp',
'Notional Cost (£)'
]
data_relevant = data[relevant_columns]

# Separate features and target, and list the numeric vs categorical columns
X = data_relevant.drop(columns=['Notional Cost (£)'])
y = data_relevant['Notional Cost (£)']
numerical_features = ['CalYear', 'HourOfCall', 'NumStationsWithPumpsAttending',
                      'NumPumpsAttending', 'PumpCount', 'PumpHoursRoundUp']
categorical_features = ['IncidentGroup', 'StopCodeDescription', 'PropertyCategory',
                        'PropertyType', 'Postcode_district']

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])
# Build the full pipeline, train the Gradient Boosting model, and evaluate it on a hold-out set
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
test_predictions = model.predict(X_test)
print('MAE:', mean_absolute_error(y_test, test_predictions))
print('RMSE:', np.sqrt(mean_squared_error(y_test, test_predictions)))

# Plot predicted vs actual notional cost on the test set
plt.figure(figsize=(10, 6))
plt.scatter(y_test, test_predictions, alpha=0.3)
plt.xlabel('Actual Notional Cost (£)')
plt.ylabel('Predicted Notional Cost (£)')
plt.title('Actual vs Predicted Notional Cost (Gradient Boosting)')
plt.show()
Screen shots: