0% found this document useful (0 votes)
32 views

Code - Cap 3

Uploaded by

clisman
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
32 views

Code - Cap 3

Uploaded by

clisman
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 5

import os

import calendar
import numpy as np
import networkx as nx
import pandas as pd
from pandas.plotting import scatter_matrix, parallel_coordinates
import seaborn as sns
from sklearn import preprocessing
import matplotlib.pylab as plt

###### 3.2 EXAMPLES #########


#############################
###### load the Boston Housing file
# The path is written as adjacent raw-string pieces. In a normal string a
# trailing backslash followed by a newline is a line continuation, which
# silently drops the separator in front of the file name, and "\_" is an
# invalid escape sequence.
housing_df = pd.read_csv(
    r"D:\__UNI_2022_HP\_CC 442_Mineria de Datos\_Semana 1_2_"
    r"\BostonHousing.csv"
)
# rename CAT. MEDV column for easier data handling
housing_df = housing_df.rename(columns={"CAT. MEDV": "CAT_MEDV"})
# Only displays output in an interactive session/notebook; no effect in a script.
housing_df.head(9)

####### 3.3 Basic Charts: Bar Charts, Line Graphs, and Scatter Plots ########
#############################################################################
## Load the Amtrak data and convert them to be suitable for time series analysis
# Raw-string pieces keep every backslash of the Windows path literal (a
# backslash-newline inside a normal string would swallow the last separator).
# The former squeeze=True argument is dropped: it was removed in pandas 2.0,
# and the frame is used as a DataFrame (Month / Ridership columns) below.
Amtrak_df = pd.read_csv(
    r"D:\__UNI_2022_HP\_CC 442_Mineria de Datos\_Semana 1_2_"
    r"\Amtrak.csv"
)
Amtrak_df["Date"] = pd.to_datetime(Amtrak_df.Month, format="%d/%m/%Y")
# Ridership values indexed by the parsed dates
ridership_ts = pd.Series(Amtrak_df.Ridership.values, index=Amtrak_df.Date)

## Boston housing data
# NOTE(review): this location differs from the one used for the first
# BostonHousing load earlier in the file -- confirm which path is intended.
housing_df = pd.read_csv(r"C:\Python\BostonHousing.csv")
# rename CAT. MEDV column for easier data handling
housing_df = housing_df.rename(columns={"CAT. MEDV": "CAT_MEDV"})

### Pandas version


## line graph
# Each chart gets its own figure: Series.plot() draws into the current axes
# when none is given, so consecutive plots would otherwise overlap.
fig, ax = plt.subplots()
ridership_ts.plot(ylim=[1300, 2300], legend=False, ax=ax)
ax.set_xlabel("Year")  # set x-axis label
ax.set_ylabel("Ridership (in 000s)")  # set y-axis label

## scatter plot of LSTAT vs. MEDV
housing_df.plot.scatter(x="LSTAT", y="MEDV", legend=False)

## barchart of CHAS vs. mean MEDV
fig, ax = plt.subplots()
housing_df.groupby("CHAS").mean().MEDV.plot(kind="bar", ax=ax)
ax.set_ylabel("Avg. MEDV")

## barchart of CHAS vs. CAT.MEDV
# Mean of CAT_MEDV per CHAS group, scaled by 100 to read as a percentage
dataForPlot = housing_df.groupby("CHAS").mean()["CAT_MEDV"] * 100
fig, ax = plt.subplots(figsize=(5, 3))
dataForPlot.plot(kind="bar", ax=ax)
# The plotted quantity is a percentage of CAT_MEDV, not an average MEDV
ax.set_ylabel("% of CAT.MEDV")

### matplotlib version


## line graph
# Open a new figure first so the line graph is not drawn over an earlier plot
plt.figure()
plt.plot(ridership_ts.index, ridership_ts)
plt.xlabel("Year")  # set x-axis label
plt.ylabel("Ridership (in 000s)")  # set y-axis label

## scatter plot
## Set the color of the points in the scatterplot and draw as open circles.
# A separate figure keeps the scatter plot off the line-graph axes
plt.figure()
plt.scatter(housing_df.LSTAT, housing_df.MEDV, color="C2", facecolor="none")
plt.xlabel("LSTAT")
plt.ylabel("MEDV")

## barchart of CHAS vs. mean MEDV
# compute mean MEDV per CHAS = (0, 1)
dataForPlot = housing_df.groupby("CHAS").mean().MEDV
fig, ax = plt.subplots()
# Colors are given as a list: a set has no defined order, so the bar/color
# pairing would not be deterministic.
ax.bar(dataForPlot.index, dataForPlot, color=["C5", "C1"])
ax.set_xticks((0, 1))
ax.set_xlabel("CHAS")
ax.set_ylabel("Avg. MEDV")

## barchart of CHAS vs. CAT.MEDV
# Mean of CAT_MEDV per CHAS group, scaled by 100 to read as a percentage
dataForPlot = housing_df.groupby("CHAS").mean()["CAT_MEDV"] * 100
fig, ax = plt.subplots()
ax.bar(dataForPlot.index, dataForPlot, color=["C5", "C1"])
ax.set_xticks((0, 1))
ax.set_xlabel("CHAS")
# The plotted quantity is a percentage of CAT_MEDV, not an average MEDV
ax.set_ylabel("% of CAT.MEDV")

########## Distribution Plots: Boxplots and Histograms


################### histogram of MEDV
# pandas version: histogram drawn straight from the MEDV column
medv_values = housing_df["MEDV"]
ax = medv_values.hist()
ax.set_xlabel("MEDV")
ax.set_ylabel("count")

# alternative plot with matplotlib
fig, ax = plt.subplots()
ax.hist(medv_values)
ax.set_xlabel("MEDV")
ax.set_ylabel("count")
# Draw the major grid lines in grey and keep them behind the bars
ax.grid(which="major", color="grey", linestyle='-')
ax.set_axisbelow(True)
plt.show()
################### boxplot of MEDV for different values of CHAS
ax = housing_df.boxplot(column="MEDV", by="CHAS")
ax.set_ylabel("MEDV")
plt.suptitle("")  # Suppress the titles
plt.title("")

# alternative plot with matplotlib
# One list of MEDV values per CHAS value (0 first, then 1). The assignment
# is kept as a single statement; splitting it after "=" is a syntax error.
dataForPlot = [
    list(housing_df[housing_df.CHAS == chas_value].MEDV)
    for chas_value in (0, 1)
]
fig, ax = plt.subplots()
ax.boxplot(dataForPlot)
# The boxes sit at positions 1 and 2; relabel them with the CHAS values
ax.set_xticks((1, 2))
ax.set_xticklabels((0, 1))
ax.set_xlabel("CHAS")
ax.set_ylabel("MEDV")
plt.show()

################## side-by-side boxplots


# One panel per predictor, each split by the CAT_MEDV value
fig, axes = plt.subplots(nrows=1, ncols=4)
for panel, column in zip(axes, ("NOX", "LSTAT", "PTRATIO", "INDUS")):
    housing_df.boxplot(column=column, by="CAT_MEDV", ax=panel)
for ax in axes:
    ax.set_xlabel("CAT.MEDV")
# Suppress the automatic "grouped by" super-title, as the other boxplot
# sections in this file do, then lay out and display the figure.
plt.suptitle("")
plt.tight_layout()
plt.show()

########## Heatmaps: Visualizing Correlations and Missing Values


################# simple heatmap of correlations (without values)
corr = housing_df.corr()
# Every heatmap gets its own figure/axes. Without an explicit ax, seaborn
# draws into the currently active axes, so successive heatmaps (and their
# color bars) would pile up on whatever plot was created last.
fig, ax = plt.subplots()
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, ax=ax)

# Change the colormap to a divergent scale and fix the range of the colormap
fig, ax = plt.subplots()
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns,
            vmin=-1, vmax=1, cmap="RdBu", ax=ax)

# Include information about values (this example also demonstrates how to
# control the size of the plot)
fig, ax = plt.subplots()
fig.set_size_inches(11, 7)
sns.heatmap(corr, annot=True, fmt=".1f", cmap="RdBu", center=0, ax=ax)

##### ########### code for generating a heatmap of missing values


# The path is written as adjacent raw-string pieces. A backslash-newline
# inside a raw string is NOT a line continuation: it would leave a literal
# backslash and newline inside the file name.
df = pd.read_csv(
    r"D:\__UNI_2022_HP\_CC 442_Mineria de Datos\_Semana 1_2_"
    r"\NYPD_Motor_Vehicle_Collisions_1000.csv"
).sort_values(["DATE"])
# given a dataframe df create a copy of the array that is 0 if a field contains a
# value and 1 for NaN
naInfo = np.zeros(df.shape)
naInfo[df.isna().values] = 1
naInfo = pd.DataFrame(naInfo, columns=df.columns)

fig, ax = plt.subplots()
fig.set_size_inches(13, 9)
# Two-color map: white for present values (0), dark grey for missing ones (1)
ax = sns.heatmap(naInfo, vmin=0, vmax=1, cmap=["white", "#666666"],
                 cbar=False, ax=ax)
ax.set_yticks([])

# draw frame around figure
rect = plt.Rectangle((0, 0), naInfo.shape[1], naInfo.shape[0], linewidth=1,
                     edgecolor="lightgrey", facecolor="none")
rect = ax.add_patch(rect)
# Disable clipping so the frame line on the axes boundary is not cut off
rect.set_clip_on(False)
plt.xticks(rotation=80)

########### 3.4 Multidimensional Visualization ##############################


#############################################################################
######### Adding Variables: Color, Size, Shape, Multiple Panels, and Animation
# Color the points by the value of CAT.MEDV
housing_df.plot.scatter(
    x="LSTAT", y="NOX",
    c=["C0" if cat == 1 else "C1" for cat in housing_df.CAT_MEDV],
)

# Plot first the data points for CAT.MEDV of 0 and then of 1
# Setting color to "none" gives open circles
_, ax = plt.subplots()
for catValue, color in (0, "C1"), (1, "C0"):
    subset_df = housing_df[housing_df.CAT_MEDV == catValue]
    ax.scatter(subset_df.LSTAT, subset_df.NOX, color="none", edgecolor=color)
ax.set_xlabel("LSTAT")
ax.set_ylabel("NOX")
# Legend entries follow the plotting order above (0 first, then 1)
ax.legend(["CAT.MEDV 0", "CAT.MEDV 1"])
plt.show()
################## panel plots
# compute mean MEDV per RAD and CHAS
dataForPlot_df = housing_df.groupby(["CHAS", "RAD"]).mean()["MEDV"]
# We determine all possible RAD values to use as ticks
ticks = set(housing_df.RAD)
# Add a zero entry for every (CHAS, RAD) combination that has no data, so
# both panels show the same set of bars
for i in range(2):
    for t in ticks.difference(dataForPlot_df[i].index):
        dataForPlot_df.loc[(i, t)] = 0
# reorder the rows, so that the index is sorted
dataForPlot_df = dataForPlot_df.sort_index()
# Determine a common range for the y axis
yRange = [0, max(dataForPlot_df) * 1.1]

fig, axes = plt.subplots(nrows=2, ncols=1)
dataForPlot_df[0].plot.bar(x="RAD", ax=axes[0], ylim=yRange)
dataForPlot_df[1].plot.bar(x="RAD", ax=axes[1], ylim=yRange)
# Label each panel with the CHAS value it shows
axes[0].annotate("CHAS = 0", xy=(3.5, 45))
axes[1].annotate("CHAS = 1", xy=(3.5, 45))
plt.show()

# Display scatterplots between the different variables


# The diagonal shows the distribution for each variable
df = housing_df[["CRIM", "INDUS", "LSTAT", "MEDV"]]
axes = scatter_matrix(df, alpha=0.5, figsize=(6, 6), diagonal="kde")
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# NumPy array of correlation coefficients.
corr = df.corr().values
# Write the correlation coefficient into every panel above the diagonal.
# NOTE(review): the annotation text and its (0.8, 0.2) position were
# reconstructed from a garbled line -- confirm against the intended output.
for i, j in zip(*np.triu_indices_from(axes, k=1)):
    axes[i, j].annotate("%.3f" % corr[i, j], (0.8, 0.2),
                        xycoords="axes fraction", ha="center", va="center")
plt.show()

##### Manipulations: Rescaling, Aggregation and Hierarchies,Zooming, Filtering

################# Rescaling
# Avoid the use of scientific notation for the log axis
plt.rcParams["axes.formatter.min_exponent"] = 4

## scatter plot: regular and log scale
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(7, 4))
regular_panel, log_panel = axes
# left panel: regular scale
housing_df.plot.scatter(x="CRIM", y="MEDV", ax=regular_panel)
# right panel: both axes on a log scale, with explicit y tick positions/labels
ax = housing_df.plot.scatter(x="CRIM", y="MEDV", logx=True, logy=True,
                             ax=log_panel)
log_y_ticks = [5, 10, 20, 50]
ax.set_yticks(log_y_ticks)
ax.set_yticklabels(log_y_ticks)
plt.tight_layout()
plt.show()

## boxplot: regular and log scale
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(7, 3))
# Both panels start out as the same boxplot of CRIM grouped by CAT_MEDV
for panel in axes:
    ax = housing_df.boxplot(column="CRIM", by="CAT_MEDV", ax=panel)
    ax.set_xlabel("CAT.MEDV")
    ax.set_ylabel("CRIM")
# ax now refers to the right-hand panel; switch only that one to a log scale
ax.set_yscale("log")
# suppress the title
axes[0].get_figure().suptitle("")
plt.tight_layout()
plt.show()

########## Aggregation and Hierarchies


fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
# The path is written as adjacent raw-string pieces; a backslash-newline in
# a normal string is a line continuation and would drop the separator in
# front of the file name.
Amtrak_df = pd.read_csv(
    r"D:\__UNI_2022_HP\_CC 442_Mineria de Datos\_Semana 1_2_"
    r"\Amtrak.csv"
)
Amtrak_df["Month"] = pd.to_datetime(Amtrak_df.Month, format="%d/%m/%Y")
Amtrak_df.set_index("Month", inplace=True)

# fit quadratic curve and display
time_steps = np.arange(len(Amtrak_df))
quadraticFit = np.poly1d(np.polyfit(time_steps, Amtrak_df.Ridership, 2))
# Fitted values share the date index of the data so both lines align
Amtrak_fit = pd.DataFrame({"fit": quadraticFit(time_steps)},
                          index=Amtrak_df.index)
ax = Amtrak_df.plot(ylim=[1300, 2300], legend=False, ax=axes[0][0])
Amtrak_fit.plot(ax=ax)
ax.set_xlabel("Year")  # set x-axis label
ax.set_ylabel("Ridership (in 000s)")  # set y-axis label

# Zoom in 2-year period
ridership_2yrs = Amtrak_df.loc["1991-01-01":"1992-12-01"]
ax = ridership_2yrs.plot(ylim=[1300, 2300], legend=False, ax=axes[1][0])
ax.set_xlabel("Year")  # set x-axis label
ax.set_ylabel("Ridership (in 000s)")  # set y-axis label

# Average by month
byMonth = Amtrak_df.groupby(by=[Amtrak_df.index.month]).mean()
ax = byMonth.plot(ylim=[1300, 2300], legend=False, ax=axes[0][1])
ax.set_xlabel("Month")  # set x-axis label
ax.set_ylabel("Ridership (in 000s)")  # set y-axis label
# NOTE(review): yticks is not used anywhere in this section -- confirm
# whether later code needs it before removing.
yticks = [-2.0, -1.75, -1.5, -1.25, -1.0, -0.75, -0.5, -0.25, 0.0]
# One tick per calendar month, labelled with the abbreviated month name
ax.set_xticks(range(1, 13))
ax.set_xticklabels([calendar.month_abbr[i] for i in range(1, 13)])

# Average by year (exclude data from 2004)
# NOTE(review): newer pandas releases deprecate the "A" frequency alias in
# favor of "YE" -- adjust once the minimum supported pandas version is known.
byYear = (
    Amtrak_df.loc["1991-01-01":"2003-12-01"]
    .groupby(pd.Grouper(freq="A"))
    .mean()
)
ax = byYear.plot(ylim=[1300, 2300], legend=False, ax=axes[1][1])
ax.set_xlabel("Year")  # set x-axis label
ax.set_ylabel("Ridership (in 000s)")  # set y-axis label
plt.tight_layout()
plt.show()

You might also like