Visualization
Python
pandas and matplotlib
line plot
multiple line plots
barplot
boxplot
heatmap
Life Expectancy and Health Expenditure
# Input CSV with life expectancy and health expenditure per country and year.
FDATA = "./files/matplot.life-expectancy-vs-health-expenditure.csv"

# Menu option -> [display label, dataframe column name, Y-axis label].
OPTCHART = {
    1: ['Life Expectancy', 'LifeExpectancy', 'Number of years'],
    2: ['Health Expenditure', 'HealthExpenditure', 'Dollars/Capita Year'],
}

# Line color per dataframe column.
COLORS = {"LifeExpectancy": "#16acd8", "HealthExpenditure": "#4e16d8"}

# Column renames applied right after loading the CSV.
COLRENAME = {'Entity': 'Country'}
def menu(options=None):
    """Build the text prompt that lists the selectable chart options.

    Args:
        options: Mapping of option key -> sequence whose first item is the
            label to display. Defaults to the module-level ``OPTCHART``.

    Returns:
        One ``"<key>: <label>"`` line per option, followed by the
        ``"Your choice: "`` prompt (no trailing newline).
    """
    if options is None:
        options = OPTCHART
    lines = [f"{key}: {val[0]}" for key, val in options.items()]
    lines.append("Your choice: ")
    return "\n".join(lines)
# Echo the chart menu once before any input is requested.
print(menu())

df = pd.read_csv(FDATA)
df = df.rename(columns=COLRENAME)

# Countries present in the data, used to validate the user's input.
clist = df.Country.unique()

# Keep asking until the user names a country that exists in the data.
country = input("Country: ")
while country not in clist:
    country = input("Country: ")

# Keep asking until the answer is one of the OPTCHART keys. A non-numeric
# answer re-prompts instead of aborting with ValueError.
opt = None
while opt not in OPTCHART:
    try:
        opt = int(input(menu()))
    except ValueError:
        opt = None

colname = OPTCHART[opt][1]
# Column -> color mapping, in the dict form accepted by DataFrame.plot.
selcol = {colname: COLORS[colname]}

# Rows for the chosen country that have a value in the chosen column.
dfsel = df.loc[(df.Country == country) & df[colname].notna(), ['Year', colname]]
Q1 plot one piece of information for a selected country
pandas plot
# Single series drawn through the pandas .plot.line accessor, legend hidden.
chart_label = OPTCHART[opt][0]
y_units = OPTCHART[opt][2]
ax = dfsel.plot.line(x='Year', legend=False, color=selcol)
ax.set_xlabel('Year')
ax.set_ylabel(y_units)
ax.set_title("pandas.plot.line: " + chart_label + "\n" + country.title())
# No plt.show() here: this block does not display the figure itself.
pandas plot - uses line
# Same chart through the generic DataFrame.plot entry point.
chart_label = OPTCHART[opt][0]
y_units = OPTCHART[opt][2]
ax = dfsel.plot(x='Year', legend=False, color=selcol)
ax.set_xlabel('Year')
ax.set_ylabel(y_units)
ax.set_title("pandas.plot: " + chart_label + "\n" + country.title())
# No plt.show() here: this block does not display the figure itself.
matplotlib
# Same chart drawn with matplotlib directly on a fresh figure.
fig, ax = plt.subplots()
# The label is what the legend shows for this line.
ax.plot(dfsel['Year'], dfsel[colname], label=OPTCHART[opt][0])
ax.set_xlabel('Year')
ax.set_ylabel(OPTCHART[opt][2])
ax.set_title("matplotlib: " + OPTCHART[opt][0] + "\n" + country.title())
ax.legend()  # show the legend built from the line label
# No plt.show() here: this block does not display the figure itself.
Q2 plot both data series
## Two different Y axes
### Separate charts: one stacked subplot per series
# NOTE(review): rows are filtered on the user-selected column only, although
# both series are plotted below -- confirm this is intended.
has_value = df[colname].notna()
dfcountry = df[(df.Country == country) & has_value]

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(8, 10))
for panel, series in zip(axes, ('LifeExpectancy', 'HealthExpenditure')):
    panel.plot(dfcountry['Year'], dfcountry[series])
plt.show()
# One chart, two Y axes: ax2 shares the X axis of ax1 through twinx().
fig, ax1 = plt.subplots(figsize=(8, 8))
ax2 = ax1.twinx()
ax1.set_xlabel("Year")
# Option 1 goes on the left axis, option 2 on the right one; each series
# uses its own color and its own Y-axis label.
for axis, key in ((ax1, 1), (ax2, 2)):
    series = OPTCHART[key][1]
    axis.plot(dfcountry['Year'], dfcountry[series], color=COLORS[series])
    axis.set_ylabel(OPTCHART[key][2])
plt.show()
Q3 plot several selected countries
create a table with one column per country and plot it on a single chart
# TODO: ask the user for a set of countries
countries = ['Italy', 'France']

# Wide table: index = Year, one column per country, values = chosen column.
# The filter, column selection and pivot must form one expression; otherwise
# dftab stays in long format and the per-country columns do not exist.
dftab = (
    df.loc[
        df.Country.isin(countries) & df[colname].notna(),
        ['Country', 'Year', colname],
    ]
    .pivot(index='Year', columns='Country', values=colname)
)

ax = dftab.plot(kind='line')
plt.ylabel(OPTCHART[opt][2])
plt.title("pandas.plot - single chart: " + OPTCHART[opt][0])
plt.show()
one chart per country
# Uses the pivoted `dftab` table (index: Year, one column per country).
# With subplots=True pandas draws one subplot per column and returns an array
# of Axes; `title` is passed to plot() because plt.title() would only label
# the last subplot.
ax = dftab.plot(kind='line', subplots=True,
                title="pandas.plot - multiple chart: " + OPTCHART[opt][0])
# plt.ylabel() would likewise label only the last subplot, so label each one.
for subplot_ax in ax:
    subplot_ax.set_ylabel(OPTCHART[opt][2])
plt.show()
matplotlib
# single chart
fig, axes = plt.subplots()
# One line per requested country. The loop variable is deliberately not
# called `country`, so the user's earlier selection is not overwritten.
for country_name in countries:
    plt.plot(dftab.index, dftab[country_name], label=country_name)
plt.xlabel('Year')
plt.ylabel(OPTCHART[opt][2])
plt.title("matplotlib - single chart: " + OPTCHART[opt][0])
plt.legend(title='Country')
plt.show()
one chart per country
# NOTE(review): this rebinds COLORS from the column->color dict defined near
# the top of the file to a plain list palette; code that runs after this
# point and expects the dict will break.
COLORS = ['#00202e', '#003f5c', '#2c4875', '#8a508f', '#bc5090', '#ff6361',
          '#ff8531', '#ffa600']

# Size the grid from the columns that are actually plotted, so it always
# matches the loop below.
ncountries = len(dftab.columns)
# squeeze=False keeps `axes` an array even when there is a single country.
fig, axes = plt.subplots(ncountries, 1, figsize=(8, 6 * ncountries),
                         squeeze=False)
axes = axes.ravel()
for i, country_name in enumerate(dftab.columns):
    # Axes.plot has no `legend` keyword; a legend only appears if requested.
    # The palette wraps around when there are more countries than colors.
    axes[i].plot(dftab.index, dftab[country_name],
                 color=COLORS[i % len(COLORS)])
    # Title follows the selected data series instead of a fixed label.
    axes[i].set_title(f'{country_name} {OPTCHART[opt][0]} Over Years')
    axes[i].set_xlabel('Year')
    axes[i].set_ylabel(OPTCHART[opt][2])
plt.tight_layout()
plt.show()
Q4 boxplot
selcountries = ['France', 'Germany', 'Italy']

# Mean life expectancy per year. The result is not stored; it is only shown
# when this is the last expression of a notebook cell.
df.groupby('Year').agg({'LifeExpectancy': 'mean'})

# Rows after 2000 for the selected countries, restricted to the columns of
# interest (filter and column selection kept in one expression).
dfcs = df.loc[
    (df['Year'] > 2000) & df['Country'].isin(selcountries),
    ['Country', 'Year', 'LifeExpectancy', 'HealthExpenditure'],
]

# One box per country on a single Axes.
ax = dfcs.groupby('Country')[['LifeExpectancy']].boxplot(subplots=False)
# groupby orders the boxes by sorted country name; take the labels from the
# countries actually present so labels and boxes cannot get out of step.
ax.set_xticklabels(sorted(dfcs['Country'].unique()))
Passwords
# Password data set and its category lookup table.
# FDATA is rebound here; it previously pointed at the life-expectancy CSV.
FDATA = "./files/passwords.txt.csv"
FCAT = "./files/passwords.cat.csv"

# Multiplier that converts a value expressed in the given unit into hours.
TIMECONV = {
    'seconds': 1 / 3600,
    'minutes': 1 / 60,
    'hours': 1,
    'days': 24,
    'weeks': 7 * 24,
    'months': 30 * 24,
    'years': 365 * 24,
}
dfp = pd.read_csv(FDATA)
dfc = pd.read_csv(FCAT)
# Quick looks at both tables (only displayed when run as the last expression
# of a notebook cell).
dfp.head()
dfc.head()

# Express every duration in hours: value * hours-per-unit, cast to int.
# NOTE(review): astype(int) drops the fractional part and raises if a
# time_unit is missing from TIMECONV (NaN) -- confirm this is intended.
hours_per_unit = dfp['time_unit'].map(TIMECONV)
dfp['online_hours'] = (dfp['value'] * hours_per_unit).astype(int)

# Per category: number of passwords and mean online_hours.
per_category = dfp[['catid', 'online_hours']].groupby(['catid'])
dfg = (
    per_category.agg({'catid': 'size', 'online_hours': 'mean'})
    .rename(columns={'catid': 'count'})
    .reset_index()
)

# Right join keeps every row of the category table, matched on catid == id.
dfres = pd.merge(dfg, dfc, how='right', left_on='catid', right_on='id')
Q1 pandas number of passwords per category, showing the name
# Bar chart of the pre-aggregated counts, one bar per category name.
bar_ax = dfres.plot.bar(x='category', y='count')
bar_ax.set_ylabel('number of passwords')
bar_ax.set_title('Plot on grouped - flat data')
plt.show()
pandas: let it compute the counts
# Attach the category name to every password row (right join keeps every
# row of the category table).
dfpc = dfp.merge(dfc, how='right', left_on='catid', right_on='id')

# Count on the Series rather than a one-column DataFrame:
# DataFrame.value_counts() returns a MultiIndex, which makes the bar labels
# render as tuples instead of plain category names.
dfpc['category'].value_counts().plot(kind='bar', xlabel='Category',
                                     ylabel='Count', rot=90)
plt.title('Plot value count')
plt.show()
matplotlib
# Mean online_hours per category, drawn with matplotlib directly.
fig, ax = plt.subplots()
bar_style = {'width': 1, 'edgecolor': "white", 'linewidth': 0.7}
ax.bar(dfres['category'], dfres['online_hours'], **bar_style)

# X range from -1 to the number of distinct categories.
ncat = dfres['category'].nunique()
ax.set_xlim(left=-1, right=ncat)

plt.xticks(rotation=90)
ax.set_title('Matplot on grouped')
plt.show()
Q2 show average times
# Box plot of online_hours per category, restricted to rows below 10000.
below_cap = dfpc[dfpc['online_hours'] < 10000]
below_cap.boxplot(
    column=['online_hours'],
    by='category',
    grid=False,
    color='black',
    rot=90,
)
histogram
# Histogram of the strength column (kept as a one-column DataFrame).
strength_frame = dfpc[['strength']]
strength_frame.plot.hist()