0% found this document useful (0 votes)
23 views13 pages

Proyecto Final Model

The document discusses installing and using the cartopy Python library for geospatial data visualization. It provides code snippets for cleaning and preprocessing geospatial data, creating random forest and naive Bayes classification models, evaluating model performance, and saving models to disk.

Uploaded by

luis fuentes
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
23 views13 pages

Proyecto Final Model

The document discusses installing and using the cartopy Python library for geospatial data visualization. It provides code snippets for cleaning and preprocessing geospatial data, creating random forest and naive Bayes classification models, evaluating model performance, and saving models to disk.

Uploaded by

luis fuentes
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 13

Libraries

pip install cartopy

Collecting cartopy
Downloading Cartopy-0.23.0-cp310-cp310-
manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.6/11.6 MB 39.0 MB/s eta
0:00:00
Requirement already satisfied: numpy>=1.21 in /usr/local/lib/python3.10/dist-
packages (from cartopy) (1.25.2)
Requirement already satisfied: matplotlib>=3.5 in
/usr/local/lib/python3.10/dist-packages (from cartopy) (3.7.1)
Requirement already satisfied: shapely>=1.7 in
/usr/local/lib/python3.10/dist-packages (from cartopy) (2.0.4)
Requirement already satisfied: packaging>=20 in
/usr/local/lib/python3.10/dist-packages (from cartopy) (24.0)
Requirement already satisfied: pyshp>=2.3 in
/usr/local/lib/python3.10/dist-packages (from cartopy) (2.3.1)
Requirement already satisfied: pyproj>=3.3.1 in
/usr/local/lib/python3.10/dist-packages (from cartopy) (3.6.1)
Requirement already satisfied: contourpy>=1.0.1 in
/usr/local/lib/python3.10/dist-packages (from matplotlib>=3.5-
>cartopy) (1.2.1)
Requirement already satisfied: cycler>=0.10 in
/usr/local/lib/python3.10/dist-packages (from matplotlib>=3.5-
>cartopy) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in
/usr/local/lib/python3.10/dist-packages (from matplotlib>=3.5-
>cartopy) (4.51.0)
Requirement already satisfied: kiwisolver>=1.0.1 in
/usr/local/lib/python3.10/dist-packages (from matplotlib>=3.5-
>cartopy) (1.4.5)
Requirement already satisfied: pillow>=6.2.0 in
/usr/local/lib/python3.10/dist-packages (from matplotlib>=3.5-
>cartopy) (9.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in
/usr/local/lib/python3.10/dist-packages (from matplotlib>=3.5-
>cartopy) (3.1.2)
Requirement already satisfied: python-dateutil>=2.7 in
/usr/local/lib/python3.10/dist-packages (from matplotlib>=3.5-
>cartopy) (2.8.2)
Requirement already satisfied: certifi in
/usr/local/lib/python3.10/dist-packages (from pyproj>=3.3.1->cartopy)
(2024.2.2)
Requirement already satisfied: six>=1.5 in
/usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7-
>matplotlib>=3.5->cartopy) (1.16.0)
Installing collected packages: cartopy
Successfully installed cartopy-0.23.0

from sklearn.metrics import confusion_matrix, accuracy_score,


precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import joblib
import os

Clean data
# Column selection and renaming
def change_column_names(df):
    """Drop incomplete rows, rename the raw columns and keep only the renamed ones.

    Rows are dropped *before* the column selection, so a missing value in any
    column of ``df`` (including columns that are discarded afterwards) removes
    the row.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset. It must contain every key of the rename map below;
        otherwise the final selection raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding exactly the renamed columns, in map order.
    """
    columns = {
        "latitude": "latitude",
        "longitude": "longitude",
        "population_density": "population_density",
        "General class": "land_cover_type",
        "class": "land_cover_subtype",
        "Sub-class": "vegetation_percent",
        "date": "date",
        "ws": "wind_speed",
        "vpd": "vapor_pressure_deficit",
        "vap": "vapor_pressure",
        "tmin": "minimum_temperature",
        "tmax": "maximum_temperature",
        "swe": "snow_water_equivalent",
        "srad": "surface_shortwave_radiation",
        "soil": "soil_moisture",
        "q": "runoff",
        "ppt": "precipitation_accumulation",
        "pet": "Reference_evapotranspiration",
        "def": "climate_water_deficit",
        "aet": "actual_Evapotranspiration",
        "PDSI": "palmer_drought_severity_index",
        "brightness": "brightness_temperature",
        "scan": "scan_fire_size",
        "track": "track_fire_size",
        "confidence": "confidence",
        "frp": "fire_radiative_power",
        "daynight": "daynight",
        "type": "fire_type",
        "n_pixels": "n_pixels_ndvi",
        "vim": "ndvi",
        "vim_avg": "ndvi_long_term_average",
        "viq": "ndvi_anomaly_percent",
        "year": "year",
    }

    renamed = df.dropna().rename(columns=columns)
    # Index with a real list: a dict view is not a documented indexer type,
    # whereas a list of labels is always accepted.
    return renamed[list(columns.values())]

# Target-variable class assignment
def asign_class(value):
    """Map a confidence value onto one of the classes "l", "n" or "h".

    Values that already are "l", "n" or "h" are returned unchanged. Anything
    else is read as a number, truncated toward zero and bucketed:
    0-29 -> "l", 30-79 -> "n", 80 and above -> "h".

    Parameters
    ----------
    value : str or number
        Either a class letter or a numeric confidence (numeric strings such
        as "45" or "45.0" are accepted).

    Returns
    -------
    str
        "l", "n" or "h".

    Raises
    ------
    ValueError
        If ``value`` is not numeric, or if it is negative (a negative
        confidence used to be silently labelled "h").
    """
    if value in ("l", "n", "h"):
        return value

    # float() first so that strings like "45.0" parse; int() then truncates
    # exactly as it does for float inputs.
    x = int(float(value))
    if x < 0:
        raise ValueError(f"confidence must be non-negative, got {value!r}")
    if x < 30:
        return "l"
    if x < 80:
        return "n"
    return "h"

# Categorical-variable conversion
def clean_dtypes(df, target_name):
    """Encode every non-numeric feature column as integer category codes.

    ``df`` is modified in place and also returned. The target column is cast
    to ``str`` and is never encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the features and the target column.
    target_name : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(df, dict_values)`` where ``dict_values[column][str(value)]`` is the
        integer code that replaced ``value`` in ``column``. Missing values are
        encoded by pandas as -1 and get no entry in the mapping.
    """
    columns = (
        df.drop(columns=[target_name])
        .select_dtypes(exclude=["number"])
        .columns
    )
    df[target_name] = df[target_name].astype(str)
    dict_values = {}

    for column_name in columns:
        categorical = df[column_name].astype("category").cat
        # The position of a category in `categories` is its code, so the
        # mapping comes from the distinct values only instead of a set built
        # over every row of the frame.
        dict_values[column_name] = {
            f"{category}": code
            for code, category in enumerate(categorical.categories)
        }
        df[column_name] = categorical.codes

    return df, dict_values

# Season assignment
def set_season(date_value):
    """Return "rainy season" for March-May and September-November, else "dry season".

    ``date_value`` only needs to expose a ``month`` attribute.
    """
    month = date_value.month
    first_rains = 3 <= month <= 5
    second_rains = 9 <= month <= 11
    if not (first_rains or second_rains):
        return "dry season"
    return "rainy season"

def get_general_dataframe(df_values, target):
    """Derive calendar features from ``date`` and encode the categorical columns.

    Adds ``seasons``, ``month`` and ``day`` columns to ``df_values`` itself
    (the caller's frame is modified), then hands a copy without ``date`` to
    ``clean_dtypes``.

    Returns the ``(encoded_frame, dict_values)`` pair produced by
    ``clean_dtypes``.
    """
    dates = df_values["date"]
    df_values["seasons"] = dates.apply(set_season)
    df_values["month"] = dates.dt.month
    df_values["day"] = dates.dt.day

    without_date = df_values.drop(columns=["date"])
    encoded, dict_values = clean_dtypes(without_date, target)
    return encoded, dict_values

Models
def get_models_path():
    """Return the directory where trained models are stored, creating it if needed."""
    models_path = "./models"

    # exist_ok=True replaces the exists()/makedirs() pair, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs(models_path, exist_ok=True)

    return models_path

def get_root_directory():
    """Return the directory that holds the datasets, creating it if needed."""
    root_directory_path = "./datasets"

    # exist_ok=True replaces the exists()/makedirs() pair, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs(root_directory_path, exist_ok=True)

    return root_directory_path

# Model metrics
def get_info_model(y_test, y_pred, class_names, average='micro'):
    """Score one set of predictions.

    Parameters
    ----------
    y_test, y_pred : array-like
        True and predicted labels.
    class_names : sequence
        Label order used for the confusion-matrix rows and columns.
    average : str
        Averaging mode forwarded to precision, recall and F1. With the
        default 'micro' on single-label multiclass data these three scores
        coincide with accuracy.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, confusion_matrix)``.
    """
    averaged_scorers = (precision_score, recall_score, f1_score)

    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1 = (
        scorer(y_test, y_pred, average=average) for scorer in averaged_scorers
    )
    cm = confusion_matrix(y_test, y_pred, labels=class_names)

    return accuracy, precision, recall, f1, cm

# Model training
def get_avg_training(X, y, model, class_names=("l", "n", "h"), n_splits=5,
                     random_state=42):
    """Cross-validate ``model`` with shuffled K-fold and average the fold metrics.

    The estimator is refit on every fold, so on return it holds the fit from
    the last fold's training split only.

    Parameters
    ----------
    X, y : array-like
        Features and labels; converted to NumPy arrays so the fold indices
        can be applied positionally.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    class_names : sequence of str
        Label order for the confusion matrix (default: the three classes
        'l', 'n', 'h').
    n_splits : int
        Number of folds.
    random_state : int
        Seed for the fold shuffling.

    Returns
    -------
    tuple
        ``(accuracy_avg, precision_avg, recall_avg, f1_avg, cm_total_df)``
        where ``cm_total_df`` is the confusion matrix summed over all folds,
        as a DataFrame labelled with ``class_names``.
    """
    class_names = list(class_names)
    X = np.asarray(X)
    y = np.asarray(y)

    fold_scores = []
    # Sized from class_names so the matrix always matches the label list.
    cm_total = np.zeros((len(class_names), len(class_names)))
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for train_index, test_index in kf.split(X):
        model.fit(X[train_index], y[train_index])
        y_pred = model.predict(X[test_index])

        accuracy, precision, recall, f1, cm = get_info_model(
            y[test_index], y_pred, class_names
        )
        fold_scores.append([accuracy, precision, recall, f1])
        cm_total += cm

    # Average every metric over the folds
    accuracy_avg, precision_avg, recall_avg, f1_avg = np.mean(fold_scores, axis=0)
    cm_total_df = pd.DataFrame(cm_total, index=class_names, columns=class_names)
    return accuracy_avg, precision_avg, recall_avg, f1_avg, cm_total_df

# Build both models
def create_models(df, target):
    """Cross-validate a random forest and a Gaussian naive Bayes model, then fit both on all rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Fully numeric feature columns plus the target column.
    target : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(random_forest_model, random_forest_info, bayesian_model,
        bayesian_info)``; each ``*_info`` is the tuple returned by
        ``get_avg_training``.
    """
    X = df.drop(columns=[target]).values
    y = df[target].values

    random_forest_model = RandomForestClassifier(
        n_estimators=120,
        max_depth=10,
        class_weight="balanced",
        random_state=42,
    )
    random_forest_info = get_avg_training(X, y, random_forest_model)

    bayesian_model = GaussianNB()
    bayesian_info = get_avg_training(X, y, bayesian_model)

    # Cross-validation leaves each estimator fitted on the last fold's
    # training split only. The returned models are the ones that get saved
    # and used for prediction, so refit them on the complete data.
    random_forest_model.fit(X, y)
    bayesian_model.fit(X, y)

    return random_forest_model, random_forest_info, bayesian_model, bayesian_info

def save_models(dict_values, random_forest_model, bayesian_model, models_path):
    """Write both models and the category-code mapping to ``models_path`` with joblib."""
    artifacts = (
        (random_forest_model, f"{models_path}/random_forest_model.joblib"),
        (bayesian_model, f"{models_path}/bayesian_model.joblib"),
        (dict_values, f"{models_path}/dict_values.joblib"),
    )
    for artifact, destination in artifacts:
        joblib.dump(artifact, destination)

Describing data
# Resolve (and create if missing) the output and input directories.
models_path = get_models_path()
root_directory_path = get_root_directory()

# NOTE: unpickling executes arbitrary code; only load a pickle you produced yourself.
df_final = pd.read_pickle(f'{root_directory_path}/final_dataset.pkl')

# Column the models are trained to predict.
target = "confidence"

# Bucket the target into 'l' / 'n' / 'h' first, then drop incomplete rows
# and rename/select the columns.
df_final[target] = df_final[target].apply(asign_class)
df_final = change_column_names(df_final)

# Notebook display of the first rows.
df_final.head()

{"type":"dataframe","variable_name":"df_final"}

# Export the cleaned dataset as CSV next to the source pickle.
csv_path = f'{root_directory_path}/final_dataset.csv'
df_final.to_csv(csv_path, index=False)

# Work on a copy: get_general_dataframe adds columns to the frame it receives.
df_final_cleanned, dict_values = get_general_dataframe(df_final.copy(), target)

# Second frame with the target as integer category codes as well, so that
# every column is numeric.
df_final_cleanned_codes = df_final_cleanned.assign(
    **{target: df_final_cleanned[target].astype('category').cat.codes}
)

# Notebook display of the class balance.
df_final_cleanned[target].value_counts()

confidence
n 1521844
h 208636
l 100449
Name: count, dtype: int64

# Notebook display: columns available after cleaning and encoding.
df_final_cleanned.columns

Index(['latitude', 'longitude', 'population_density',


'land_cover_type',
'land_cover_subtype', 'vegetation_percent', 'wind_speed',
'vapor_pressure_deficit', 'vapor_pressure',
'minimum_temperature',
'maximum_temperature', 'snow_water_equivalent',
'surface_shortwave_radiation', 'soil_moisture', 'runoff',
'precipitation_accumulation', 'Reference_evapotranspiration',
'climate_water_deficit', 'actual_Evapotranspiration',
'palmer_drought_severity_index', 'brightness_temperature',
'scan_fire_size', 'track_fire_size', 'confidence',
'fire_radiative_power', 'daynight', 'fire_type',
'n_pixels_ndvi',
'ndvi', 'ndvi_long_term_average', 'ndvi_anomaly_percent',
'year',
'seasons', 'month', 'day'],
dtype='object')

# Pearson correlation between all columns (target included, as category codes)
corr = df_final_cleanned_codes.corr()
plt.figure(figsize=(16, 14))
sns.set(font_scale=0.7)
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Matriz de Correlación de Pearson')
plt.show()

# Correlation threshold above which two features are considered redundant
threshold = 0.85

# The redundancy filter must only compare features with each other. Keeping
# the target in the matrix could drop the target itself, or drop a feature
# just because it correlates strongly with the target.
correlation_matrix = corr.drop(index=target, columns=target).abs()

# Keep the strict upper triangle so each pair is inspected once
upper_tri = correlation_matrix.where(
    np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
)
to_drop = [
    column for column in upper_tri.columns
    if (upper_tri[column] > threshold).any()
]

# Remove one column out of every highly correlated pair
reduced_df = df_final_cleanned.drop(columns=to_drop)
Creating model
# Notebook display: columns that survived the correlation filter.
reduced_df.columns

Index(['latitude', 'longitude', 'population_density',


'land_cover_type',
'land_cover_subtype', 'vegetation_percent', 'wind_speed',
'vapor_pressure_deficit', 'vapor_pressure',
'minimum_temperature',
'snow_water_equivalent', 'surface_shortwave_radiation',
'soil_moisture',
'runoff', 'precipitation_accumulation',
'Reference_evapotranspiration',
'climate_water_deficit', 'palmer_drought_severity_index',
'brightness_temperature', 'scan_fire_size', 'confidence',
'fire_radiative_power', 'daynight', 'fire_type',
'n_pixels_ndvi',
'ndvi', 'ndvi_anomaly_percent', 'year', 'seasons', 'month',
'day'],
dtype='object')

# Train and cross-validate both models on the reduced feature set.
(
    random_forest_model,
    random_forest_info,
    bayesian_model,
    bayesian_info,
) = create_models(reduced_df, target)

rf_accuracy, rf_precision, rf_recall, rf_f1, rf_cm = random_forest_info

# Report the averaged fold metrics followed by the summed confusion matrix.
print("Random Forest")
rf_scores = {
    "rf_accuracy": rf_accuracy,
    "rf_precision": rf_precision,
    "rf_recall": rf_recall,
    "rf_f1": rf_f1,
}
for rf_label, rf_score in rf_scores.items():
    print(f"{rf_label}={rf_score}")
print(f"rf_cm:\n{rf_cm}")

Random Forest
rf_accuracy=0.7525005076318212
rf_precision=0.7525005076318212
rf_recall=0.7525005076318212
rf_f1=0.7525005076318212
rf_cm:
l n h
l 67544.0 21275.0 11630.0
n 355747.0 1105059.0 61038.0
h 1175.0 2289.0 205172.0

bm_accuracy, bm_precision, bm_recall, bm_f1, bm_cm = bayesian_info

# Report the averaged fold metrics followed by the summed confusion matrix.
print("Bayesian Model")
bm_scores = {
    "bm_accuracy": bm_accuracy,
    "bm_precision": bm_precision,
    "bm_recall": bm_recall,
    "bm_f1": bm_f1,
}
for bm_label, bm_score in bm_scores.items():
    print(f"{bm_label}={bm_score}")
print(f"bm_cm:\n{bm_cm}")

Bayesian Model
bm_accuracy=0.6458054910823028
bm_precision=0.6458054910823028
bm_recall=0.6458054910823028
bm_f1=0.6458054910823028
bm_cm:
l n h
l 17212.0 54666.0 28571.0
n 136139.0 1006275.0 379430.0
h 12520.0 37179.0 158937.0

# Influence of each variable on the random-forest model
rf_probs = random_forest_model.feature_importances_

# Feature names in training order (the model was fit on the frame minus the target)
feature_names = reduced_df.drop(columns=[target]).columns

# List the features from most to least influential
for column, rf_prob in sorted(
    zip(feature_names, rf_probs), key=lambda pair: pair[1], reverse=True
):
    print(f"{column} = {rf_prob}")

brightness_temperature = 0.36536143226513207
fire_radiative_power = 0.26298131436302397
scan_fire_size = 0.19816786282273172
daynight = 0.05344582322736113
year = 0.02746812323112576
latitude = 0.008749474713167015
month = 0.007927916793298018
ndvi = 0.007855331489897404
population_density = 0.006623341421557317
vapor_pressure_deficit = 0.006342677338069368
longitude = 0.00460010894885883
palmer_drought_severity_index = 0.004476762419985014
climate_water_deficit = 0.004437413280390877
vapor_pressure = 0.00417743441088386
Reference_evapotranspiration = 0.0037091304899637884
fire_type = 0.003679213372044027
wind_speed = 0.003615540209061185
soil_moisture = 0.0035809291293696765
land_cover_type = 0.0031771276115307783
ndvi_anomaly_percent = 0.002763994002717311
precipitation_accumulation = 0.0024544810602002078
land_cover_subtype = 0.002428343721722926
runoff = 0.0024117304912774583
minimum_temperature = 0.0023194547493360323
n_pixels_ndvi = 0.0021459017251619786
surface_shortwave_radiation = 0.0013781765069856905
day = 0.0013768713136899481
seasons = 0.001300280176602127
vegetation_percent = 0.0010427064968135003
snow_water_equivalent = 1.1022180412770868e-06

# Single most influential feature according to the random forest.
feature_columns = reduced_df.drop(columns=[target]).columns
most_influyent_index = np.argmax(np.abs(rf_probs))
most_influyent_name = feature_columns[most_influyent_index]
most_influyent_value = rf_probs[most_influyent_index]

print(f"{most_influyent_name} = {most_influyent_value}")

brightness_temperature = 0.36536143226513207

import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

# Create a figure with two side-by-side maps
fig, (ax1, ax2) = plt.subplots(
    1, 2, figsize=(20, 10), subplot_kw={'projection': ccrs.PlateCarree()}
)
extent = [-80, -66, -4, 13]

# One map per season; both share the same base layers and differ only in title
season_axes = {'dry season': ax1, 'rainy season': ax2}
season_titles = {'dry season': 'Dry season', 'rainy season': 'Rainy season'}

for season_name, ax in season_axes.items():
    ax.set_extent(extent)
    ax.add_feature(cfeature.BORDERS, linestyle=':')
    ax.add_feature(cfeature.COASTLINE)
    ax.add_feature(cfeature.LAND, edgecolor='black')
    ax.add_feature(cfeature.LAKES, alpha=0.5)
    ax.add_feature(cfeature.RIVERS)
    gridlines = ax.gridlines(draw_labels=True)
    gridlines.top_labels = False
    gridlines.right_labels = False
    ax.set_title(season_titles[season_name])

# Fixed colour per confidence class. Pairing colours with unique() depended
# on the order in which the classes first appear in the data, so a colour
# could end up on the wrong class.
confidence_colors = {"l": "yellow", "n": "orange", "h": "red"}
year = 2022

# Season labels were replaced by integer codes during cleaning; look the
# codes up in the stored mapping instead of assuming 0 and 1.
season_codes = dict_values["seasons"]
year_rows = df_final_cleanned[df_final_cleanned["year"] == year]

for conf, color in confidence_colors.items():
    for season_name, ax in season_axes.items():
        season_code = season_codes.get(season_name)
        if season_code is None:
            continue  # this season never occurs in the dataset
        coords = year_rows[
            (year_rows[target] == conf) & (year_rows["seasons"] == season_code)
        ][['longitude', 'latitude']].values
        lon_values, lat_values = coords[:, 0], coords[:, 1]
        ax.scatter(lon_values, lat_values, color=color, s=50,
                   transform=ccrs.PlateCarree(), zorder=5)

# Show the map
plt.suptitle('Mapa de incendios')
plt.show()

/usr/local/lib/python3.10/dist-packages/cartopy/io/__init__.py:241:
DownloadWarning: Downloading:
https://naturalearth.s3.amazonaws.com/10m_physical/ne_10m_land.zip
warnings.warn(f'Downloading: {url}', DownloadWarning)
/usr/local/lib/python3.10/dist-packages/cartopy/io/__init__.py:241:
DownloadWarning: Downloading:
https://naturalearth.s3.amazonaws.com/10m_cultural/ne_10m_admin_0_boundary_lines_land.zip
warnings.warn(f'Downloading: {url}', DownloadWarning)
/usr/local/lib/python3.10/dist-packages/cartopy/io/__init__.py:241:
DownloadWarning: Downloading:
https://naturalearth.s3.amazonaws.com/10m_physical/ne_10m_coastline.zip
warnings.warn(f'Downloading: {url}', DownloadWarning)
/usr/local/lib/python3.10/dist-packages/cartopy/io/__init__.py:241:
DownloadWarning: Downloading:
https://naturalearth.s3.amazonaws.com/10m_physical/ne_10m_lakes.zip
warnings.warn(f'Downloading: {url}', DownloadWarning)
/usr/local/lib/python3.10/dist-packages/cartopy/io/__init__.py:241:
DownloadWarning: Downloading:
https://naturalearth.s3.amazonaws.com/10m_physical/ne_10m_rivers_lake_centerlines.zip
warnings.warn(f'Downloading: {url}', DownloadWarning)
# Persist both fitted models and the category-code mapping under models_path.
save_models(dict_values, random_forest_model, bayesian_model,
models_path)

Example of the model


# Reload the persisted artifacts.
# NOTE: joblib files are pickle-based; only load files you produced yourself.
random_forest_model, bayesian_model, dict_values = (
    joblib.load(f"{models_path}/{artifact_name}.joblib")
    for artifact_name in ("random_forest_model", "bayesian_model", "dict_values")
)

# Notebook display of the category-code mapping.
dict_values

# Medellin
# The key order must match the column order the model was trained on
# (the reduced feature frame without the target), because the values are
# passed to the model positionally.
# NOTE(review): month 4 is labelled 'dry season' here, but set_season puts
# April in the rainy season. Confirm which one is intended.
ff_value = {
    'latitude': 6.25184,
    'longitude': -75.56359,
    'population_density': 19134.373047,
    'land_cover_type': 'Terra Firma',
    'land_cover_subtype': 'Tree cover',
    'vegetation_percent': '35% short vegetation cover',
    'wind_speed': 1.3,
    'vapor_pressure_deficit': 0.82,
    'vapor_pressure': 1.913,
    'minimum_temperature': 289.45,
    'snow_water_equivalent': 0.0,
    'surface_shortwave_radiation': 207.8,
    'soil_moisture': 36,
    'runoff': 108.4,
    'precipitation_accumulation': 221.7,
    'Reference_evapotranspiration': 113.3,
    'climate_water_deficit': 0.0,
    'palmer_drought_severity_index': -3.4,
    'brightness_temperature': 321.6,
    'scan_fire_size': 1.3,
    'fire_radiative_power': 9.0,
    'daynight': 'D',
    'fire_type': 'presumed vegetation fire',
    'n_pixels_ndvi': 15.0,
    'ndvi': 0.7695,
    'ndvi_anomaly_percent': 100.7927,
    'year': 2023,
    'seasons': 'dry season',
    'month': 4,
    'day': 2
}

# Replace every categorical value with the integer code used at training time
for key, codes in dict_values.items():
    if key not in ff_value:
        # Column was encoded during cleaning but is not a model input
        # (for example, removed by the correlation filter).
        continue
    old_value = str(ff_value[key])  # mapping keys are stored as strings
    if old_value not in codes:
        raise KeyError(f"Unknown category {old_value!r} for column {key!r}")
    ff_value[key] = codes[old_value]

mp_values = np.array(list(ff_value.values())).reshape(1, -1)

value_predicted = random_forest_model.predict(mp_values)[0]

# Message shown for each predicted class
messages = {
    "l": "0 - 30% de probabilidad de que ocurra un incendio",
    "n": "30 - 80% de probabilidad de que ocurra un incendio",
    "h": "80 - 100% de probabilidad de que ocurra un incendio",
}
if value_predicted in messages:
    print(messages[value_predicted])

30 - 80% de probabilidad de que ocurra un incendio

You might also like