The document outlines a preliminary analysis of business problems related to incident patterns, response times, and cost analysis. It includes data cleaning, filling missing values, and applying machine learning models to predict incidents and response times. The analysis aims to inform budgetary decisions and improve resource allocation based on the findings.


Based on this preliminary analysis, potential business problems you could explore include:

• Incident Patterns: Analyze patterns and trends in incidents over time, by type, or by location.
• Response Times: Study the average response times and their impact on incident outcomes.
• Cost Analysis: Assess the financial impact of incidents by category and type to inform budgetary and resource allocation decisions.

Data Analysis
import pandas as pd
import matplotlib.pyplot as plt

# Load the data
data = pd.read_csv('C:\\Users\\PMLS\\Desktop\\newham data (1).csv', encoding='ISO-8859-1')

# Convert 'DateOfCall' to datetime and extract day and month for further analysis
# (UK dates are day-first; pass dayfirst=True if the raw column is written that way)
data['DateOfCall'] = pd.to_datetime(data['DateOfCall'])
data['DayOfWeek'] = data['DateOfCall'].dt.day_name()
data['Month'] = data['DateOfCall'].dt.month_name()

# Plot incident frequency by day of the week
weekly_counts = data['DayOfWeek'].value_counts().reindex([
    "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
])
weekly_counts.plot(kind='bar')
plt.title('Incident Frequency by Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Number of Incidents')
plt.show()

# Plot incident frequency by month (reindexed into calendar order)
month_order = ["January", "February", "March", "April", "May", "June",
               "July", "August", "September", "October", "November", "December"]
monthly_counts = data['Month'].value_counts().reindex(month_order)
monthly_counts.plot(kind='bar')
plt.title('Incident Frequency by Month')
plt.xlabel('Month')
plt.ylabel('Number of Incidents')
plt.show()

# Fill missing 'SpecialServiceType' with 'Not Applicable'
data['SpecialServiceType'] = data['SpecialServiceType'].fillna('Not Applicable')

# For geographical coordinates, either drop rows with missing values or fill them
# with the median; apply only one of the two approaches.
data.dropna(subset=['Latitude', 'Longitude'], inplace=True)  # Drop rows where Latitude or Longitude is missing
# Alternatively, fill missing coordinates with the median instead of dropping:
# data['Latitude'] = data['Latitude'].fillna(data['Latitude'].median())
# data['Longitude'] = data['Longitude'].fillna(data['Longitude'].median())

# Check for any duplicated rows in the dataframe
row_unique = data.duplicated().any()  # Returns False if all rows are unique
print(f"All rows are unique: {not row_unique}")

# Calculate the number of missing entries in each column
missing_values = data.isnull().sum()

# Percentage of missing values in each column
missing_percentage = (data.isnull().sum() / len(data)) * 100

# Print the number and percentage of missing entries per column
print("Number of missing entries per column:")
print(missing_values)
print("\nPercentage of missing entries per column:")
print(missing_percentage)
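
To cover the "by type" angle listed above, a short sketch extends the frequency analysis to incident categories; it assumes the IncidentGroup column (which the cost model later relies on) categorises each incident.

# Plot incident frequency by incident category (sketch; assumes 'IncidentGroup' exists)
type_counts = data['IncidentGroup'].value_counts()
type_counts.plot(kind='bar')
plt.title('Incident Frequency by Incident Group')
plt.xlabel('Incident Group')
plt.ylabel('Number of Incidents')
plt.show()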
Data Cleaning and Data Filling:
import pandas as pd
from sklearn.linear_model import LinearRegression


def fill_postcode_from_district(df):
    # Identify rows where Postcode_full is NaN and Postcode_district is not NaN
    mask = df['Postcode_full'].isna() & ~df['Postcode_district'].isna()
    # Create a mapping from Postcode_district to Postcode_full for non-NaN values
    district_to_postcode = (df[~df['Postcode_full'].isna()]
                            .drop_duplicates('Postcode_district')
                            .set_index('Postcode_district')['Postcode_full'])
    # Use vectorized operations to fill NaN values
    df.loc[mask, 'Postcode_full'] = df.loc[mask, 'Postcode_district'].map(district_to_postcode)


def fill_lat_lon_from_postcode_efficiently(df):
    # Create a mapping from Postcode_full to Latitude and Longitude for rows where they are not NaN
    postcode_to_lat_lon = (df.dropna(subset=['Latitude', 'Longitude', 'Postcode_full'])
                           [['Postcode_full', 'Latitude', 'Longitude']]
                           .drop_duplicates('Postcode_full')
                           .set_index('Postcode_full'))
    # Broadcast the mapped values into the NaN cells
    df['Latitude'] = df['Latitude'].fillna(df['Postcode_full'].map(postcode_to_lat_lon['Latitude']))
    df['Longitude'] = df['Longitude'].fillna(df['Postcode_full'].map(postcode_to_lat_lon['Longitude']))


def fill_postcode_from_district_efficiently(df):
    # Create a mapping from Postcode_district to Postcode_full for non-NaN values
    postcode_map = (df.dropna(subset=['Postcode_full'])
                    .drop_duplicates('Postcode_district')
                    .set_index('Postcode_district')['Postcode_full']
                    .to_dict())
    # Use vectorized operations to fill NaN values
    df['Postcode_full'] = df['Postcode_full'].fillna(df['Postcode_district'].map(postcode_map))


def fill_incgeo_wardcode_from_propercase(df):
    # Create a mapping from ProperCase to IncGeo_WardCode for rows where IncGeo_WardCode is not NaN
    propercase_to_wardcode = (df.dropna(subset=['IncGeo_WardCode'])
                              .drop_duplicates('ProperCase')
                              .set_index('ProperCase')['IncGeo_WardCode']
                              .to_dict())
    # Use vectorized operations to fill NaN values
    mask = df['IncGeo_WardCode'].isna() & ~df['ProperCase'].isna()
    df.loc[mask, 'IncGeo_WardCode'] = df.loc[mask, 'ProperCase'].map(propercase_to_wardcode)


def fill_easting_from_uprn(df):
    # Create a mapping from UPRN to Easting_m for rows where UPRN is not zero
    uprn_to_easting = df[df['UPRN'] != 0][['UPRN', 'Easting_m']].drop_duplicates('UPRN')
    # Fill missing Easting_m only where a non-zero UPRN provides a value
    df['Easting_m'] = df['Easting_m'].mask(
        df['Easting_m'].isna() & (df['UPRN'] != 0),
        df['UPRN'].map(uprn_to_easting.set_index('UPRN')['Easting_m'])
    )


# Main code
file_path = 'C:\\Users\\PMLS\\Desktop\\newham data (1).csv'
df = pd.read_csv(file_path, encoding='latin1')

# Check for empty cells in each column
empty_cells_per_column = df.isnull().sum()
print("Empty cells per column:")
print(empty_cells_per_column)

# Filter rows where StopCodeDescription is 'Special Service'
special_service_df = df[df['StopCodeDescription'] == 'Special Service']

# Count NaN values in SpecialServiceType within the filtered rows
nan_count = special_service_df['SpecialServiceType'].isna().sum()
print("Number of 'Special Service' entries with NaN in 'SpecialServiceType':", nan_count)

# Replace NaN values in SpecialServiceType with 'Not applicable'
df['SpecialServiceType'] = df['SpecialServiceType'].fillna('Not applicable')

# Apply functions to fill data
fill_postcode_from_district(df)
fill_postcode_from_district_efficiently(df)
fill_incgeo_wardcode_from_propercase(df)
fill_lat_lon_from_postcode_efficiently(df)

# Fill remaining blank cells
# (note: filling the attendance-time columns with the string 'Unknown' turns them into
# object columns; they are label-encoded again before modelling later on)
for col in ['IncGeo_WardCode', 'IncGeo_WardName', 'IncGeo_WardNameNew',
            'IncidentStationGround', 'FirstPumpArriving_AttendanceTime',
            'FirstPumpArriving_DeployedFromStation', 'SecondPumpArriving_AttendanceTime',
            'SecondPumpArriving_DeployedFromStation']:
    df[col] = df[col].fillna('Unknown')

for col in ['NumCalls', 'NumPumpsAttending', 'PumpCount',
            'PumpHoursRoundUp', 'NumStationsWithPumpsAttending']:
    df[col] = df[col].fillna(0)

# Notional cost = pump hours x £333 (the per-pump-hour rate applied in this analysis)
df['Notional Cost (£)'] = df['PumpHoursRoundUp'] * 333

# Prepare the data for a linear regression model to predict Easting_m from Longitude
df_model = df.dropna(subset=['Easting_m', 'Longitude'])
X = df_model[['Longitude']]
y = df_model['Easting_m']

# Fit the linear regression model
model = LinearRegression()
model.fit(X, y)

# Predict missing Easting_m values using the model
missing_easting_index = df['Easting_m'].isnull()
predicted_easting = model.predict(df.loc[missing_easting_index, ['Longitude']])

# Fill in the missing Easting_m values with the predictions
df.loc[missing_easting_index, 'Easting_m'] = predicted_easting

# Check again for missing values in Easting_m after filling
missing_easting_final = df['Easting_m'].isnull().sum()
print("Missing Easting_m after filling:", missing_easting_final)

# Prepare the data for a linear regression model to predict Northing_m
df_model_northing = df.dropna(subset=['Northing_m', 'Longitude'])
X_northing = df_model_northing[['Longitude']]  # Using Longitude as predictor
y_northing = df_model_northing['Northing_m']

# Fit the linear regression model for Northing_m
model_northing = LinearRegression()
model_northing.fit(X_northing, y_northing)

# Predict missing Northing_m values using the model
missing_northing_index = df['Northing_m'].isnull()
predicted_northing = model_northing.predict(df.loc[missing_northing_index, ['Longitude']])

# Fill in the missing Northing_m values with the predictions
df.loc[missing_northing_index, 'Northing_m'] = predicted_northing

# Check again for missing values in Northing_m after filling
missing_northing_final = df['Northing_m'].isnull().sum()
print("Missing Northing_m after filling:", missing_northing_final)

# Check for empty cells again after the replacement
empty_cells_after_fill = df.isnull().sum()
print("\nEmpty cells per column after replacement:")
print(empty_cells_after_fill)

df.to_csv('C:\\Users\\PMLS\\Desktop\\newham data new(1).csv', index=False)
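The regression-based imputation above fills Easting_m and Northing_m from Longitude without reporting how well those fits explain the data. A minimal check (a sketch reusing model, model_northing and the training frames from the block above); since Northing_m is a north-south coordinate, a Longitude-only model can be expected to fit it much worse than Easting_m, and the R-squared values make that visible.

# Report R^2 for the two coordinate-imputation fits
print("R^2 for Longitude -> Easting_m:", round(model.score(X, y), 4))
print("R^2 for Longitude -> Northing_m:", round(model_northing.score(X_northing, y_northing), 4))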


Before Cleaning:

After Cleaning:
Business Problems and Model Analysis:

Incident Patterns: Analyze patterns and trends in incidents over time, by type, or by location.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# Load and preprocess data
data = pd.read_csv('C:\\Users\\PMLS\\Desktop\\newham data new(1).csv', encoding='ISO-8859-1')
data['DateOfCall'] = pd.to_datetime(data['DateOfCall'])
data['MonthYear'] = data['DateOfCall'].dt.to_period('M')

# Aggregate data by month and count incidents
monthly_data = data.groupby('MonthYear').size().reset_index(name='IncidentCounts')

# Prepare features
monthly_data['Month'] = monthly_data['MonthYear'].dt.month
monthly_data['Year'] = monthly_data['MonthYear'].dt.year

# Include MonthYear as a string for plotting purposes
monthly_data['MonthYearStr'] = monthly_data['MonthYear'].astype(str)

X = monthly_data[['Month', 'Year', 'MonthYearStr']]  # Features, including MonthYearStr for plotting
y = monthly_data['IncidentCounts']  # Target

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train.drop(columns=['MonthYearStr']), y_train)  # Train without the MonthYearStr column

# Predict on test data
predictions = model.predict(X_test.drop(columns=['MonthYearStr']))  # Predict without the MonthYearStr column

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")

# Plot the results
plt.figure(figsize=(10, 5))
plt.scatter(X_test['MonthYearStr'], y_test, color='blue', label='Actual')  # Use MonthYearStr for plotting
plt.scatter(X_test['MonthYearStr'], predictions, color='red', label='Predicted')
plt.title('Monthly Incident Predictions')
plt.xlabel('Month and Year')
plt.ylabel('Number of Incidents')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.legend()
plt.show()

# Convert X_test['MonthYearStr'] to datetime for sorting
X_test['MonthYearStr'] = pd.to_datetime(X_test['MonthYearStr'])

# Create a DataFrame for actual and predicted values
results = pd.DataFrame({'MonthYearStr': X_test['MonthYearStr'], 'Actual': y_test, 'Predicted': predictions})

# Pivot the DataFrame for the heatmap
heatmap_data = results.pivot(index='MonthYearStr', columns='Actual', values='Predicted')

# Plot the heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(heatmap_data, cmap='coolwarm', annot=True, fmt=".1f")
plt.title('Actual vs Predicted Heatmap')
plt.xlabel('Actual Values')
plt.ylabel('Month and Year')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.show()
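
The mean squared error above is hard to judge on its own. A short sketch, reusing y_train, y_test and predictions from the block above, reports RMSE in incidents per month and compares it with a naive predict-the-training-mean baseline.

import numpy as np
from sklearn.metrics import mean_squared_error

# RMSE is in the same units as the target (incidents per month)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

# Naive baseline: always predict the mean of the training target
baseline_pred = np.full(len(y_test), y_train.mean())
baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_pred))

print(f"Model RMSE: {rmse:.1f} incidents/month")
print(f"Baseline RMSE (training mean): {baseline_rmse:.1f} incidents/month")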
Response Times: Study the average response times and their impact on incident outcomes.
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
data = pd.read_csv('C:\\Users\\PMLS\\Desktop\\newham data new(1).csv', encoding='ISO-8859-1')
data['DateOfCall'] = pd.to_datetime(data['DateOfCall'])  # Convert 'DateOfCall' to datetime format

# Derive a proxy 'ResponseTime' from the call timestamps. Note that TimeOfCall is parsed
# onto the current date, so this difference mostly measures how long ago the call happened
# rather than an operational response time; FirstPumpArriving_AttendanceTime is a more
# direct measure (see the sketch after this block).
data['ResponseTime'] = pd.to_datetime(data['TimeOfCall']) - data['DateOfCall']

# Convert ResponseTime to minutes for easier analysis
data['ResponseTimeMinutes'] = data['ResponseTime'].dt.total_seconds() / 60

# Standardize the response times for clustering
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data[['ResponseTimeMinutes']])

# K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
data['Cluster'] = kmeans.fit_predict(data_scaled)

# Visualize clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='DateOfCall', y='ResponseTimeMinutes', hue='Cluster', palette='viridis')
plt.title('Response Time Clusters Over Time')
plt.show()

# Heatmap of cluster centers
plt.figure(figsize=(8, 4))
sns.heatmap(kmeans.cluster_centers_, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Cluster Centers for Response Times')
plt.show()

# ARIMA forecasting of the monthly mean
data['MonthYear'] = data['DateOfCall'].dt.to_period('M')
monthly_response = data.groupby('MonthYear')['ResponseTimeMinutes'].mean()

# Fit ARIMA model
model = ARIMA(monthly_response, order=(1, 1, 1))
results = model.fit()

# Forecast the next 6 months
forecast = results.get_forecast(steps=6)
conf_int = forecast.conf_int()

# Plot forecast
plt.figure(figsize=(12, 6))
plt.plot(monthly_response.index.to_timestamp(), monthly_response, label='Historical Average Response Time')
forecast_index = pd.date_range(start=monthly_response.index[-1].to_timestamp(), periods=7, freq='M')[1:]
plt.plot(forecast_index, forecast.predicted_mean, label='Forecasted Response Time')
plt.fill_between(forecast_index, conf_int.iloc[:, 0], conf_int.iloc[:, 1], color='pink', alpha=0.3)
plt.title('ARIMA Forecast of Response Times')
plt.legend()
plt.show()

# Heatmap of the ARIMA forecast
plt.figure(figsize=(8, 4))
sns.heatmap(forecast.predicted_mean.to_frame().T, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('ARIMA Forecast Heatmap')
plt.show()

# Note: change file paths and tweak parameters as necessary for your specific setup.
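
As an alternative measure of response performance, the sketch below uses the FirstPumpArriving_AttendanceTime column directly, assuming it records the first pump's attendance time in seconds; the earlier cleaning step filled blanks in this column with the string 'Unknown', so it is coerced to numeric first.

import pandas as pd

data = pd.read_csv('C:\\Users\\PMLS\\Desktop\\newham data new(1).csv', encoding='ISO-8859-1')

# Coerce to numeric; rows filled with 'Unknown' become NaN and are ignored by the statistics
attendance = pd.to_numeric(data['FirstPumpArriving_AttendanceTime'], errors='coerce')

print("Mean first-pump attendance time (minutes):", round((attendance / 60).mean(), 2))
print("90th percentile (minutes):", round((attendance / 60).quantile(0.9), 2))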
Cost Analysis: Assess the financial impact of incidents by category and type to inform budgetary and resource allocation decisions.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Load your dataset
data = pd.read_csv('C:\\Users\\PMLS\\Desktop\\newham data new(1).csv')

# Check for any categorical columns and convert them
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))  # Ensure all input to LabelEncoder is string type
    label_encoders[col] = le

# Select relevant features for the model
features = ['NumPumpsAttending', 'PumpHoursRoundUp', 'IncidentGroup', 'PropertyType',
            'FirstPumpArriving_AttendanceTime']
target = 'Notional Cost (£)'

# Prepare the data for training and testing
X = data[features]
y = data[target]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Assuming future_data only needs the features actually used in the model:
future_data = pd.DataFrame({
    'NumPumpsAttending': [3, 2, 1],
    'PumpHoursRoundUp': [4, 5, 6],
    'IncidentGroup': [1, 2, 3],  # Ensure these values match the encoded training data categories
    'PropertyType': [2, 1, 3],   # Same as above
    'FirstPumpArriving_AttendanceTime': [12, 13, 14]
})

# Only apply label encoders to the columns used in the model and present in future_data
for col in set(label_encoders.keys()).intersection(set(future_data.columns)):
    le = label_encoders[col]
    # Handle previously unseen labels by replacing them with a default value (e.g., -1)
    future_data[col] = future_data[col].apply(lambda x: le.transform([x])[0] if x in le.classes_ else -1)

# Make predictions for future data
future_predictions = model.predict(future_data)

# Visualize actual vs predicted values alongside the future predictions
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, color='blue', label='Actual vs Predicted')
plt.scatter(future_data.index, future_predictions, color='red', label='Future Predictions')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='green', linestyle='--',
         label='Ideal Prediction')
plt.title('Future vs Predicted Notional Cost')
plt.xlabel('Actual Notional Cost (£)')
plt.ylabel('Predicted Notional Cost (£)')
plt.legend()
plt.show()

# Compute feature importances and plot the correlation heatmap of the model features
feature_importance = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.heatmap(data[features].corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap of Features')
plt.show()
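
feature_importance is computed in the block above but not visualised there; a minimal sketch that plots it, reusing that variable and the matplotlib import from above.

# Bar chart of the Random Forest feature importances computed above
plt.figure(figsize=(8, 4))
feature_importance.plot(kind='bar')
plt.title('Random Forest Feature Importance for Notional Cost')
plt.ylabel('Importance')
plt.tight_layout()
plt.show()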

# Visualization: Average Notional Cost by Number of Pumps Attending
plt.figure(figsize=(10, 6))
sns.barplot(x='NumPumpsAttending', y='Notional Cost (£)',
            data=data.groupby('NumPumpsAttending')['Notional Cost (£)'].mean().reset_index())
plt.title('Average Notional Cost by Number of Pumps Attending')
plt.xlabel('Number of Pumps Attending')
plt.ylabel('Average Notional Cost (£)')
plt.show()

# Visualization: Average Notional Cost by Pump Hours
plt.figure(figsize=(10, 6))
sns.lineplot(x='PumpHoursRoundUp', y='Notional Cost (£)',
             data=data.groupby('PumpHoursRoundUp')['Notional Cost (£)'].mean().reset_index())
plt.title('Average Notional Cost by Pump Hours Rounded Up')
plt.xlabel('Pump Hours Rounded Up')
plt.ylabel('Average Notional Cost (£)')
plt.show()
