Assignment 1: Code and Screenshots
Code:
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
from sklearn.linear_model import LinearRegression
def fill_postcode_from_district(df):
    # Identify rows where Postcode_full is NaN and Postcode_district is not NaN
    mask = df['Postcode_full'].isna() & df['Postcode_district'].notna()
    # Row-by-row lookup of a known full postcode in the same district (slower than the vectorised version below)
    for idx in df[mask].index:
        candidates = df.loc[df['Postcode_district'] == df.at[idx, 'Postcode_district'], 'Postcode_full'].dropna()
        if not candidates.empty:
            df.at[idx, 'Postcode_full'] = candidates.iloc[0]
    return df

def fill_lat_lon_from_postcode_efficiently(df):
    # Create a mapping from Postcode_full to Latitude and Longitude for rows where they are not NaN
    postcode_to_lat_lon = df.dropna(subset=['Latitude', 'Longitude', 'Postcode_full'])[
        ['Postcode_full', 'Latitude', 'Longitude']].drop_duplicates('Postcode_full').set_index('Postcode_full')
    # Fill missing coordinates by looking up the incident's full postcode
    missing = df['Latitude'].isna() | df['Longitude'].isna()
    df.loc[missing, 'Latitude'] = df.loc[missing, 'Postcode_full'].map(postcode_to_lat_lon['Latitude'])
    df.loc[missing, 'Longitude'] = df.loc[missing, 'Postcode_full'].map(postcode_to_lat_lon['Longitude'])
    return df

def fill_postcode_from_district_efficiently(df):
    # Create a mapping from Postcode_district to Postcode_full for non-NaN values
    postcode_map = df.dropna(subset=['Postcode_full']).drop_duplicates('Postcode_district').set_index(
        'Postcode_district')['Postcode_full'].to_dict()
    # Fill missing full postcodes from the district-level mapping
    missing = df['Postcode_full'].isna()
    df.loc[missing, 'Postcode_full'] = df.loc[missing, 'Postcode_district'].map(postcode_map)
    return df

def fill_incgeo_wardcode_from_propercase(df):
    # Create a mapping from ProperCase to IncGeo_WardCode for rows where IncGeo_WardCode is not NaN
    propercase_to_wardcode = df.dropna(subset=['IncGeo_WardCode']).drop_duplicates('ProperCase').set_index(
        'ProperCase')['IncGeo_WardCode'].to_dict()
    # Fill missing ward codes from the borough (ProperCase) mapping
    missing = df['IncGeo_WardCode'].isna()
    df.loc[missing, 'IncGeo_WardCode'] = df.loc[missing, 'ProperCase'].map(propercase_to_wardcode)
    return df

def fill_easting_from_uprn(df):
    # Create a mapping from UPRN to Easting_m for rows where they are not zero
    uprn_to_easting = df[df['UPRN'] != 0][['UPRN', 'Easting_m']].drop_duplicates('UPRN').set_index('UPRN')['Easting_m']
    # Fill missing eastings by looking up the property reference number
    missing = df['Easting_m'].isna() & (df['UPRN'] != 0)
    df.loc[missing, 'Easting_m'] = df.loc[missing, 'UPRN'].map(uprn_to_easting)
    return df

# Main code
file_path = 'C:\\Users\\PMLS\\Desktop\\LFB_2019-22.csv'
df = pd.read_csv(file_path, encoding='latin1')
# Apply the fill helpers to the raw data
df = fill_postcode_from_district_efficiently(df)
df = fill_lat_lon_from_postcode_efficiently(df)
df = fill_incgeo_wardcode_from_propercase(df)
df = fill_easting_from_uprn(df)
missing_northing_final = df['Northing_m'].isna().sum()  # remaining missing Northing values after cleaning
df.to_csv('C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv', index=False)
Screen Shots:
Before Cleaning:
After Cleaning:
Data Behavior:
Code:
import pandas as pd
import matplotlib.pyplot as plt
file_path = 'C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv'
df = pd.read_csv(file_path, encoding='latin1')
# Compute the monthly average first pump attendance time
# (assumes the attendance time column is 'FirstPumpArriving_AttendanceTime', measured in seconds)
df['DateOfCall'] = pd.to_datetime(df['DateOfCall'], dayfirst=True, errors='coerce')
average_response_time_monthly = df.set_index('DateOfCall')['FirstPumpArriving_AttendanceTime'].resample('M').mean()

# Plotting the trend of average first pump arriving attendance time over time
plt.figure(figsize=(14, 7))
average_response_time_monthly.plot(
    title='Monthly Trend of First Pump Arriving Attendance Time (2019-2022)', color='orange')
plt.xlabel('Month')
plt.ylabel('Average Attendance Time (Seconds)')
plt.grid(True)
plt.show()
Screen shots:
Spatial Analysis for Fire Incident Hotspots:
Data Preprocessing:
Validate and clean the geographical data (Latitude, Longitude, Postcode_district).
Feature Engineering:
Extract additional spatial features if available, or create spatial bins to categorize incidents into specific areas.
Clustering:
Apply spatial clustering techniques such as DBSCAN or k-means to identify hotspots (k-means is used in the code below).
Heat Maps:
Generate heat maps using GIS tools to visually represent the concentration of incidents.
Association Rule Mining:
Explore associations between incident hotspots and other factors, such as the time or type of incident (a hedged sketch follows the heat map output below).
Code:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
def analyze_clusters(df):
    """Prints out detailed information about each cluster."""
    for cluster_label in df['Cluster'].unique():
        cluster_data = df[df['Cluster'] == cluster_label]
        print(f"Cluster {cluster_label}:")
        print(f"Number of incidents: {cluster_data.shape[0]}")
        print(f"Centroid: (Latitude: {cluster_data['Latitude'].mean()}, "
              f"Longitude: {cluster_data['Longitude'].mean()})")
        print(f"Bounding box: {cluster_data['Latitude'].min()} to {cluster_data['Latitude'].max()}, "
              f"{cluster_data['Longitude'].min()} to {cluster_data['Longitude'].max()}")
        print()

if __name__ == "__main__":
    filepath = 'C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv'  # Update this to your file path
    cols_to_use = ['Latitude', 'Longitude', 'Postcode_district', 'IncidentGroup', 'TimeOfCall']
    df = load_and_preprocess_data(filepath, usecols=cols_to_use)
    df, kmeans = apply_kmeans(df, n_clusters=5)  # Adjust n_clusters based on your analysis needs
    analyze_clusters(df)
    plot_clusters(df, kmeans, plot_size=(12, 10), alpha=0.3)
    generate_heatmap(df, plot_size=(12, 10))
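Note: the helper functions called in the main block (load_and_preprocess_data, apply_kmeans, plot_clusters, generate_heatmap) are not shown above. The following is a minimal sketch of one possible implementation, assuming clustering on raw Latitude/Longitude and a seaborn kernel-density heat map; it is illustrative rather than the exact code behind the screenshots.
Code (sketch):
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

def load_and_preprocess_data(filepath, usecols=None):
    # Load only the needed columns and drop rows without valid coordinates
    df = pd.read_csv(filepath, usecols=usecols, encoding='latin1')
    return df.dropna(subset=['Latitude', 'Longitude'])

def apply_kmeans(df, n_clusters=5):
    # Cluster incidents on their coordinates and attach the labels to the frame
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    df = df.copy()
    df['Cluster'] = kmeans.fit_predict(df[['Longitude', 'Latitude']])
    return df, kmeans

def plot_clusters(df, kmeans, plot_size=(12, 10), alpha=0.3):
    # Scatter the incidents coloured by cluster, with centroids marked
    plt.figure(figsize=plot_size)
    plt.scatter(df['Longitude'], df['Latitude'], c=df['Cluster'], cmap='tab10', s=5, alpha=alpha)
    centers = kmeans.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], c='black', marker='x', s=100, label='Centroids')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title('K-Means Clusters of Fire Incidents')
    plt.legend()
    plt.show()

def generate_heatmap(df, plot_size=(12, 10)):
    # Kernel-density heat map of incident locations
    plt.figure(figsize=plot_size)
    sns.kdeplot(x=df['Longitude'], y=df['Latitude'], cmap='Reds', fill=True, thresh=0.05)
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title('Heat Map of Fire Incident Density')
    plt.show()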
Screen shots:
Heat Map
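The association rule mining step listed in the plan is not implemented above. A minimal sketch using mlxtend's apriori and association_rules on the clustered incidents is given below; the HourBand binning of TimeOfCall and the support/lift thresholds are illustrative assumptions.
Code (sketch):
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Assumes df already carries the k-means 'Cluster' label plus IncidentGroup and TimeOfCall
df['HourBand'] = pd.cut(pd.to_datetime(df['TimeOfCall'], format='%H:%M:%S', errors='coerce').dt.hour,
                        bins=[0, 6, 12, 18, 24], labels=['Night', 'Morning', 'Afternoon', 'Evening'],
                        include_lowest=True)

# One-hot encode the items of interest into a boolean "transaction" table
basket = pd.get_dummies(df[['Cluster', 'IncidentGroup', 'HourBand']].astype(str)).astype(bool)

# Mine frequent itemsets and derive rules; thresholds here are illustrative
frequent_itemsets = apriori(basket, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1.2)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]
      .sort_values('lift', ascending=False).head(10))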
Impact of Incident Types on Resource Utilization:
Data Preprocessing:
Organize and categorize incident types (IncidentGroup, StopCodeDescription).
Descriptive Statistics:
Calculate the frequency and distribution of resources used (NumStationsWithPumpsAttending, NumPumpsAttending) by incident type.
Predictive Modeling:
Develop regression models or decision trees to predict the level of resources needed based on incident characteristics (a hedged decision-tree sketch follows the code below).
Cost Analysis:
Incorporate Notional Cost (£) to assess the financial impact of different types of incidents on resources.
Code:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
file_path = 'C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv' # Update this with your actual file path
data = pd.read_csv(file_path)
# Grouping data by 'IncidentGroup' and aggregating unique 'StopCodeDescription' values
grouped_incidents = data.groupby('IncidentGroup')['StopCodeDescription'].unique()
# Convert the series to a more readable DataFrame
categorized_incidents_df = grouped_incidents.reset_index()
categorized_incidents_df.columns = ['IncidentGroup', 'UniqueStopCodeDescriptions']
print(categorized_incidents_df)
# For NumStationsWithPumpsAttending
stations_freq = data.groupby('IncidentGroup')['NumStationsWithPumpsAttending'].value_counts().unstack(fill_value=0)
# For NumPumpsAttending
pumps_freq = data.groupby('IncidentGroup')['NumPumpsAttending'].value_counts().unstack(fill_value=0)
# For NumStationsWithPumpsAttending
stations_distribution = data.groupby('IncidentGroup')['NumStationsWithPumpsAttending'].describe()
# For NumPumpsAttending
pumps_distribution = data.groupby('IncidentGroup')['NumPumpsAttending'].describe()
# Plotting Frequency of Stations Attending
stations_freq.plot(kind='bar', figsize=(14, 7))
plt.title('Frequency of Stations Attending by Incident Type')
plt.xlabel('Incident Type')
plt.ylabel('Frequency')
plt.legend(title='Num Stations Attending')
plt.show()
# Plotting Frequency of Pumps Attending
pumps_freq.plot(kind='bar', figsize=(14, 7))
plt.title('Frequency of Pumps Attending by Incident Type')
plt.xlabel('Incident Type')
plt.ylabel('Frequency')
plt.legend(title='Num Pumps Attending')
plt.show()
# Box plot for NumStationsWithPumpsAttending
plt.figure(figsize=(10, 6))
data.boxplot(column='NumStationsWithPumpsAttending', by='IncidentGroup')
plt.title('Distribution of Stations Attending by Incident Type')
plt.suptitle('') # Suppress the automatic 'by' title generated by pandas
plt.xlabel('Incident Type')
plt.ylabel('Num Stations Attending')
plt.show()
# Box plot for NumPumpsAttending
plt.figure(figsize=(10, 6))
data.boxplot(column='NumPumpsAttending', by='IncidentGroup')
plt.title('Distribution of Pumps Attending by Incident Type')
plt.suptitle('') # Suppress the automatic 'by' title generated by pandas
plt.xlabel('Incident Type')
plt.ylabel('Num Pumps Attending')
plt.show()
# Feature Engineering: Extract hour from TimeOfCall (if not already done)
data['HourOfCall'] = pd.to_datetime(data['TimeOfCall'], format='%H:%M:%S').dt.hour
# Cost Analysis: total and average Notional Cost (£) by incident type
data['Notional Cost (£)'] = pd.to_numeric(data['Notional Cost (£)'], errors='coerce')
total_costs_by_incident = data.groupby('IncidentGroup')['Notional Cost (£)'].sum()
average_costs_by_incident = data.groupby('IncidentGroup')['Notional Cost (£)'].mean()
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Total Costs
total_costs_by_incident.plot(kind='bar', ax=axes[0], color='skyblue')
axes[0].set_title('Total Notional Cost by Incident Type')
axes[0].set_ylabel('Total Notional Cost (£)')
axes[0].set_xlabel('Incident Group')

# Average Costs
average_costs_by_incident.plot(kind='bar', ax=axes[1], color='lightgreen')
axes[1].set_title('Average Notional Cost by Incident Type')
axes[1].set_ylabel('Average Notional Cost (£)')
axes[1].set_xlabel('Incident Group')

plt.tight_layout()
plt.show()
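The predictive modelling step (and the DecisionTreeRegressor imported above) does not appear in the listing. Below is a minimal sketch of one way to train such a model on incident characteristics; the feature list, the choice of NumPumpsAttending as the target, and the train/test split are assumptions rather than the exact setup behind the regression output shown below.
Code (sketch):
# Illustrative decision-tree regression: predict pumps attending from incident characteristics
features = ['IncidentGroup', 'StopCodeDescription', 'PropertyCategory', 'HourOfCall']
target = 'NumPumpsAttending'
model_data = data[features + [target]].dropna()

X_train, X_test, y_train, y_test = train_test_split(
    model_data[features], model_data[target], test_size=0.2, random_state=42)

# One-hot encode the categorical features and pass HourOfCall through unchanged
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(handle_unknown='ignore'),
      ['IncidentGroup', 'StopCodeDescription', 'PropertyCategory'])],
    remainder='passthrough')

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(max_depth=6, random_state=42))
])
model.fit(X_train, y_train)
print('MAE:', mean_absolute_error(y_test, model.predict(X_test)))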
Screen shot:
Regression Model Output:
Efficiency of Incident Call Processing:
Data Preprocessing:
Clean and prepare NumCalls data for analysis.
Feature Engineering:
Create new features such as time to first response, call-to-incident ratio, etc.
Regression Analysis:
Conduct regression analysis to determine the factors that affect call processing times.
Pattern Recognition:
Identify patterns in the data related to the volume of calls and processing efficiency (a hedged sketch follows the code below).
Code:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import seaborn as sns
import matplotlib.pyplot as plt
# Load the cleaned dataset
data = pd.read_csv('C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv')

# Feature Engineering
data['CallToIncidentRatio'] = (data['NumCalls'] / data['NumStationsWithPumpsAttending']).replace(
    [float('inf'), -float('inf')], float('nan'))
# Target: time to first response (assumes the attendance time column is 'FirstPumpArriving_AttendanceTime', in seconds)
data['TimeToFirstResponse'] = data['FirstPumpArriving_AttendanceTime']

# Prepare features and target, then split into training and test sets
features = ['NumCalls', 'CallToIncidentRatio', 'NumStationsWithPumpsAttending', 'NumPumpsAttending', 'HourOfCall']
model_data = data[features + ['TimeToFirstResponse']].dropna()
X_train, X_test, y_train, y_test = train_test_split(
    model_data[features], model_data['TimeToFirstResponse'], test_size=0.2, random_state=42)

# Initialize and train the Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions
rf_predictions = rf.predict(X_test)
# Evaluation
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)
# Visualization for RandomForest predictions with different colors for actual and predicted values
plt.figure(figsize=(10, 6))
# Since y_test is aligned with rf_predictions by index, we directly plot these
plt.scatter(y_test.index, y_test, color='red', label='Actual', alpha=0.6)
plt.scatter(y_test.index, rf_predictions, color='blue', label='Predicted_RF', alpha=0.6)
plt.xlabel('Index')
plt.ylabel('Time to First Response')
plt.title('Actual vs Predicted Values for RandomForest')
plt.legend()
# Initialize and train the XGBoost Regressor
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb.fit(X_train, y_train)
# Predictions
xgb_predictions = xgb.predict(X_test)
# Evaluation
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_r2 = r2_score(y_test, xgb_predictions)
# Visualization for XGBoost predictions with different colors for actual and predicted values
plt.figure(figsize=(10, 6))
# Plot actual values
sns.scatterplot(x=y_test.index, y=y_test, color='red', label='Actual', alpha=0.6)
# Plot predicted values
sns.scatterplot(x=y_test.index, y=xgb_predictions, color='green', label='Predicted_XGB', alpha=0.6)
plt.xlabel('Index')
plt.ylabel('Time to First Response')
plt.title('Actual vs Predicted Values for XGBoost')
plt.legend()
plt.show()
# Assuming the dataset contains a datetime column 'DateOfCall' that represents when the call was made
data['DateOfCall'] = pd.to_datetime(data['DateOfCall'])
data['DayOfWeek'] = data['DateOfCall'].dt.day_name()
data['HourOfDay'] = data['HourOfCall']
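The pattern recognition step from the plan is not otherwise shown; the DayOfWeek and HourOfDay features created above can be used to look at call volume and processing patterns. A minimal sketch follows; using incident counts and NumCalls as the volume measures is an assumption, not the author's exact analysis.
Code (sketch):
# Call-volume patterns by hour of day and day of week
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Incident volume by hour of day
data.groupby('HourOfDay').size().plot(kind='bar', ax=axes[0], color='steelblue')
axes[0].set_title('Incidents by Hour of Day')
axes[0].set_xlabel('Hour of Day')
axes[0].set_ylabel('Number of Incidents')

# Average number of calls per incident by day of week
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
data.groupby('DayOfWeek')['NumCalls'].mean().reindex(day_order).plot(kind='bar', ax=axes[1], color='indianred')
axes[1].set_title('Average Calls per Incident by Day of Week')
axes[1].set_xlabel('Day of Week')
axes[1].set_ylabel('Average NumCalls')

plt.tight_layout()
plt.show()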
Code:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import numpy as np
# Load the cleaned dataset
data = pd.read_csv('C:\\Users\\PMLS\\Desktop\\LFB_2019-22_cleaned.csv')

# Inspect the "Notional Cost (£)" column for data type and missing values
print(data['Notional Cost (£)'].describe())
print(data['Notional Cost (£)'].isnull().sum())
# Convert "Notional Cost (£)" to a numeric format, assuming it's stored as a string with potential non-numeric
characters
# We'll remove any commas and currency symbols before conversion
data['Notional Cost (£)'] = data['Notional Cost (£)'].replace('[£,]', '', regex=True).astype(float)
# Handle missing values, if any. Options include dropping them, filling with a placeholder (like 0),
# or imputing based on other values (e.g., mean or median)
# Here's how you might fill missing values with the median, as an example:
data['Notional Cost (£)'].fillna(data['Notional Cost (£)'].median(), inplace=True)
# After cleaning, you may want to check the data again to ensure it's in the right format and properly cleaned
print(data['Notional Cost (£)'].describe())
# For visualization, choose the aggregation level and statistic most relevant to your analysis goals
average_cost_per_incident_type = data.groupby('IncidentGroup')['Notional Cost (£)'].mean().sort_values(ascending=False)
plt.figure(figsize=(10, 6))
average_cost_per_incident_type.plot(kind='bar')
plt.title('Average Cost by Incident Type')
plt.ylabel('Average Cost')
plt.xlabel('Incident Type')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
## Convert "Notional Cost (£)" to a numeric format and handle missing values
data['Notional Cost (£)'] = data['Notional Cost (£)'].replace('[£,]', '', regex=True).astype(float)
data['Notional Cost (£)'].fillna(data['Notional Cost (£)'].median(), inplace=True)
# Define the relevant columns and separate features and target variable
relevant_columns = [
'CalYear', 'HourOfCall', 'IncidentGroup', 'StopCodeDescription',
'PropertyCategory', 'PropertyType', 'Postcode_district',
'NumStationsWithPumpsAttending', 'NumPumpsAttending', 'PumpCount', 'PumpHoursRoundUp',
'Notional Cost (£)'
]
data_relevant = data[relevant_columns]

# Separate features and target, and list the numeric vs categorical columns
X = data_relevant.drop(columns=['Notional Cost (£)'])
y = data_relevant['Notional Cost (£)']
numerical_features = ['CalYear', 'HourOfCall', 'NumStationsWithPumpsAttending',
                      'NumPumpsAttending', 'PumpCount', 'PumpHoursRoundUp']
categorical_features = ['IncidentGroup', 'StopCodeDescription', 'PropertyCategory',
                        'PropertyType', 'Postcode_district']

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])
# Build the full pipeline, train the Gradient Boosting model, and evaluate it on a hold-out set
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
test_predictions = model.predict(X_test)
print('MAE:', mean_absolute_error(y_test, test_predictions))
print('RMSE:', np.sqrt(mean_squared_error(y_test, test_predictions)))

# Plot predicted vs actual notional cost on the test set
plt.figure(figsize=(10, 6))
plt.scatter(y_test, test_predictions, alpha=0.3)
plt.xlabel('Actual Notional Cost (£)')
plt.ylabel('Predicted Notional Cost (£)')
plt.title('Actual vs Predicted Notional Cost (Gradient Boosting)')
plt.show()
Screen shots: