DATA MANIPULATION WITH PANDAS

1) Inspecting a DataFrame

# edited/added
import pandas as pd
homelessness = pd.read_csv('homelessness.csv', index_col=0)
# Print the head of the homelessness data
print(homelessness.head())
# Print information about homelessness
print(homelessness.info())
# Print the shape of homelessness
print(homelessness.shape)
# Print a description of homelessness
print(homelessness.describe())

2) Parts of a DataFrame

# Import pandas using the alias pd
import pandas as pd
# Print the values of homelessness
print(homelessness.values)
# Print the column index of homelessness
print(homelessness.columns)
# Print the row index of homelessness
print(homelessness.index)

3) Sorting rows

# Sort homelessness by individuals
homelessness_ind = homelessness.sort_values("individuals")
# Print the top few rows
print(homelessness_ind.head())
# Sort homelessness by descending family members
homelessness_fam = homelessness.sort_values("family_members", ascending=False)
# Print the top few rows
print(homelessness_fam.head())
# Sort homelessness by region, then descending family members
homelessness_reg_fam = homelessness.sort_values(["region", "family_members"], ascending=[True, False])
# Print the top few rows
print(homelessness_reg_fam.head())

4) Subsetting columns

# Select the individuals column
individuals = homelessness["individuals"]
# Print the head of the result
print(individuals.head())
# Select the state and family_members columns
state_fam = homelessness[["state", "family_members"]]
# Print the head of the result
print(state_fam.head())
# Select only the individuals and state columns, in that order
ind_state = homelessness[["individuals", "state"]]
# Print the head of the result
print(ind_state.head())

Subsetting rows

# Filter for rows where individuals is greater than 10000
ind_gt_10k = homelessness[homelessness["individuals"] > 10000]
# See the result
print(ind_gt_10k)
# Filter for rows where region is Mountain
mountain_reg = homelessness[homelessness["region"] == "Mountain"]
# See the result
print(mountain_reg)
# Filter for rows where family_members is less than 1000
# and region is Pacific
fam_lt_1k_pac = homelessness[(homelessness["family_members"] < 1000) & (homelessness["region"] == "Pacific")]
# See the result
print(fam_lt_1k_pac)

Subsetting rows by categorical variables

# Subset for rows in South Atlantic or Mid-Atlantic regions
south_mid_atlantic = homelessness[(homelessness["region"] == "South Atlantic") | (homelessness["region"] == "Mid-Atlantic")]
# See the result
print(south_mid_atlantic)
# The Mojave Desert states
canu = ["California", "Arizona", "Nevada", "Utah"]
# Filter for rows in the Mojave Desert states
mojave_homelessness = homelessness[homelessness["state"].isin(canu)]
# See the result
print(mojave_homelessness)

Adding new columns

# Add total col as sum of individuals and family_members
homelessness["total"] = homelessness["individuals"] + homelessness["family_members"]
# Add p_individuals col as proportion of total that are individuals
homelessness["p_individuals"] = homelessness["individuals"] / homelessness["total"]
# See the result
print(homelessness)

Combo-attack!

# Create indiv_per_10k col as homeless individuals per 10k state pop
homelessness["indiv_per_10k"] = 10000 * homelessness["individuals"] / homelessness["state_pop"]
# Subset rows for indiv_per_10k greater than 20
high_homelessness = homelessness[homelessness["indiv_per_10k"] > 20]
# Sort high_homelessness by descending indiv_per_10k
high_homelessness_srt = high_homelessness.sort_values("indiv_per_10k", ascending=False)
# From high_homelessness_srt, select the state and indiv_per_10k cols
result = high_homelessness_srt[["state", "indiv_per_10k"]]
# See the result
print(result)

Mean and median

# edited/added
sales = pd.read_csv('sales_subset.csv', index_col=0)
# Print the head of the sales DataFrame
print(sales.head())
# Print the info about the sales DataFrame
print(sales.info())
# Print the mean of weekly_sales
print(sales["weekly_sales"].mean())
# Print the median of weekly_sales
print(sales["weekly_sales"].median())

Summarizing dates

# Print the maximum of the date column
print(sales["date"].max())
# Print the minimum of the date column
print(sales["date"].min())

Efficient summaries

# A custom IQR function
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)
# Print IQR of the temperature_c column
print(sales["temperature_c"].agg(iqr))
# A custom IQR function
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)
# Update to print IQR of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(iqr))
# Import NumPy and create custom IQR function
import numpy as np
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)
# Update to print IQR and median of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, np.median]))
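
A note not in the original exercise: the same IQR can be passed inline as a lambda, which avoids defining a named function; a minimal sketch, assuming the same sales DataFrame:

# Inline IQR via a lambda (illustrative alternative to the named iqr function)
print(sales["temperature_c"].agg(lambda col: col.quantile(0.75) - col.quantile(0.25)))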

Cumulative statistics

# edited/added
sales_1_1 = sales[(sales["department"] == 1) & (sales["store"] == 1)]
# Sort sales_1_1 by date
sales_1_1 = sales_1_1.sort_values("date")
# Get the cumulative sum of weekly_sales, add as cum_weekly_sales col
sales_1_1["cum_weekly_sales"] = sales_1_1["weekly_sales"].cumsum()
# Get the cumulative max of weekly_sales, add as cum_max_sales col
sales_1_1["cum_max_sales"] = sales_1_1["weekly_sales"].cummax()
# See the columns you calculated
print(sales_1_1[["date", "weekly_sales", "cum_weekly_sales", "cum_max_sales"]])

Dropping duplicates

- Remove rows of sales with duplicate pairs of store and type, save as store_types, and print the head.
- Remove rows of sales with duplicate pairs of store and department, save as store_depts, and print the head.

# Drop duplicate store/type combinations
store_types = sales.drop_duplicates(subset=["store", "type"])
print(store_types.head())
# Drop duplicate store/department combinations
store_depts = sales.drop_duplicates(subset=["store", "department"])
print(store_depts.head())
# Subset the rows where is_holiday is True and drop duplicate dates
holiday_dates = sales[sales["is_holiday"]].drop_duplicates(subset="date")
# Print date col of holiday_dates
print(holiday_dates["date"])

Counting categorical variables

# Count the number of stores of each type
store_counts = store_types["type"].value_counts()
print(store_counts)
# Get the proportion of stores of each type
store_props = store_types["type"].value_counts(normalize=True)
print(store_props)
# Count the number of each department number and sort
dept_counts_sorted = store_depts["department"].value_counts(sort=True)
print(dept_counts_sorted)
# Get the proportion of departments of each number and sort
dept_props_sorted = store_depts["department"].value_counts(sort=True, normalize=True)
print(dept_props_sorted)

What percent of sales occurred at each store type?

# Calc total weekly sales
sales_all = sales["weekly_sales"].sum()
# Subset for type A stores, calc total weekly sales
sales_A = sales[sales["type"] == "A"]["weekly_sales"].sum()
# Subset for type B stores, calc total weekly sales
sales_B = sales[sales["type"] == "B"]["weekly_sales"].sum()
# Subset for type C stores, calc total weekly sales
sales_C = sales[sales["type"] == "C"]["weekly_sales"].sum()
# Get proportion for each type
# (dividing the list works because sales_all is a NumPy scalar, which broadcasts the list to an array)
sales_propn_by_type = [sales_A, sales_B, sales_C] / sales_all
print(sales_propn_by_type)
# Print mean_sales_by_type
Calculations with .groupby()

# Group by type; calc total weekly sales
sales_by_type = sales.groupby("type")["weekly_sales"].sum()
# Get proportion for each type
sales_propn_by_type = sales_by_type / sum(sales_by_type)
print(sales_propn_by_type)
# Group by type and is_holiday; calc total weekly sales
sales_by_type_is_holiday = sales.groupby(["type", "is_holiday"])["weekly_sales"].sum()
print(sales_by_type_is_holiday)

Multiple grouped summaries

# Import numpy with the alias np
import numpy as np
# For each store type, aggregate weekly_sales: get min, max, mean, and median
sales_stats = sales.groupby("type")["weekly_sales"].agg([np.min, np.max, np.mean, np.median])
# Print sales_stats
print(sales_stats)
# For each store type, aggregate unemployment and fuel_price_usd_per_l: get min, max, mean, and median
unemp_fuel_stats = sales.groupby("type")[["unemployment", "fuel_price_usd_per_l"]].agg([np.min, np.max, np.mean, np.median])
# Print unemp_fuel_stats
print(unemp_fuel_stats)
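
A side note not from the original exercise: the aggregations can also be passed by name as strings, which avoids importing NumPy just for this step; a minimal sketch on the same grouped data:

# Same grouped summary using string aggregator names (illustrative alternative)
sales_stats = sales.groupby("type")["weekly_sales"].agg(["min", "max", "mean", "median"])
print(sales_stats)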

Pivoting on one variable

# Pivot for mean weekly_sales for each store type
mean_sales_by_type = sales.pivot_table(values="weekly_sales", index="type")
# Print mean_sales_by_type
print(mean_sales_by_type)
# Import NumPy as np
import numpy as np
# Pivot for mean and median weekly_sales for each store type
mean_med_sales_by_type = sales.pivot_table("weekly_sales", "type", aggfunc=[np.mean, np.median])
# Print mean_med_sales_by_type
print(mean_med_sales_by_type)
# Pivot for mean weekly_sales by store type and holiday
mean_sales_by_type_holiday = sales.pivot_table("weekly_sales", "type", "is_holiday")
# Print mean_sales_by_type_holiday
print(mean_sales_by_type_holiday)

Fill in missing values and sum values with pivot tables

# Print mean weekly_sales by department and type; fill missing values with 0
print(sales.pivot_table(values="weekly_sales", index="department", columns="type", fill_value=0))
# Print the mean weekly_sales by department and type; fill missing values with 0s; sum all rows and cols
print(sales.pivot_table(values="weekly_sales", index="department", columns="type", fill_value=0, margins=True))

Setting and removing indexes

# edited/added
temperatures = pd.read_csv('temperatures.csv', index_col=0)
temperatures['date'] = pd.to_datetime(temperatures['date'], infer_datetime_format=True)
# Look at temperatures
print(temperatures)
# Set the index of temperatures to city
temperatures_ind = temperatures.set_index("city")
# Look at temperatures_ind
print(temperatures_ind)
# Reset the temperatures_ind index, keeping its contents
print(temperatures_ind.reset_index())
# Reset the temperatures_ind index, dropping its contents
print(temperatures_ind.reset_index(drop=True))

Subsetting with .loc[]

# Make a list of cities to subset on
cities = ["Moscow", "Saint Petersburg"]
# Subset temperatures using square brackets
print(temperatures[temperatures["city"].isin(cities)])
# Subset temperatures_ind using .loc[]
print(temperatures_ind.loc[cities])

Setting multi-level indexes

# Index temperatures by country & city
temperatures_ind = temperatures.set_index(["country", "city"])
# List of tuples: Brazil, Rio De Janeiro & Pakistan, Lahore
rows_to_keep = [("Brazil", "Rio De Janeiro"), ("Pakistan", "Lahore")]
# Subset for rows to keep
print(temperatures_ind.loc[rows_to_keep])

Sorting by index values

# Sort temperatures_ind by index values
print(temperatures_ind.sort_index())
# Sort temperatures_ind by index values at the city level
print(temperatures_ind.sort_index(level="city"))
# Sort temperatures_ind by country then descending city
print(temperatures_ind.sort_index(level=["country", "city"], ascending=[True, False]))
print(temperatures_ind.loc["2010":"2011"])

Slicing index values # Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011
print(temperatures_ind.loc["2010-08":"2011-02"])
# Sort the index of temperatures_ind
temperatures_srt = temperatures_ind.sort_index() Subsetting by row/column number
# Subset rows from Pakistan to Russia
# Get 23rd row, 2nd column (index 22, 1)
print(temperatures_srt.loc["Pakistan":"Russia"])
print(temperatures.iloc[22, 1])
# Try to subset rows from Lahore to Moscow
# Use slicing to get the first 5 rows
print(temperatures_srt.loc["Lahore":"Moscow"])
print(temperatures.iloc[:5])
# Subset rows from Pakistan, Lahore to Russia, Moscow
# Use slicing to get columns 3 to 4
print(temperatures_srt.loc[("Pakistan", "Lahore"):("Russia", "Moscow")])
print(temperatures.iloc[:, 2:4])

Slicing in both directions # Use slicing in both directions at once


print(temperatures.iloc[:5, 2:4])
# Subset rows from India, Hyderabad to Iraq, Baghdad
print(temperatures_srt.loc[("India", "Hyderabad"):("Iraq", "Baghdad")]) Pivot temperature by city and year
# Subset columns from date to avg_temp_c
# Add a year column to temperatures
print(temperatures_srt.loc[:, "date":"avg_temp_c"])
temperatures["year"] = temperatures["date"].dt.year
# Subset in both directions at once
# Pivot avg_temp_c by country and city vs year
print(temperatures_srt.loc[("India", "Hyderabad"):("Iraq", "Baghdad"), "date":"avg_
temp_by_country_city_vs_year = temperatures.pivot_table("avg_temp_c", index =
temp_c"])
["country", "city"], columns = "year")

Slicing time series # See the result


print(temp_by_country_city_vs_year)
# Use Boolean conditions to subset temperatures for rows in 2010 and 2011
temperatures_bool = temperatures[(temperatures["date"] >= "2010-01-01") & (temp Subsetting pivot tables
eratures["date"] <= "2011-12-31")]
# Subset for Egypt to India
print(temperatures_bool)
temp_by_country_city_vs_year.loc["Egypt":"India"]
# Set date as the index and sort the index
# Subset for Egypt, Cairo to India, Delhi
temperatures_ind = temperatures.set_index("date").sort_index()
temp_by_country_city_vs_year.loc[("Egypt", "Cairo"):("India", "Delhi")] Changes in sales over time

# Subset for Egypt, Cairo to India, Delhi, and 2005 to 2010 # Import matplotlib.pyplot with alias pltimport matplotlib.pyplot as plt
temp_by_country_city_vs_year.loc[("Egypt", "Cairo"):("India", "Delhi"), "2005":"2 # Get the total number of avocados sold on each date
010"]
nb_sold_by_date = avocados.groupby("date")["nb_sold"].sum()

Calculating on a pivot table # Create a line plot of the number of avocados sold by date
nb_sold_by_date.plot(kind="line")
# Get the worldwide mean temp by year
# Show the plot
mean_temp_by_year = temp_by_country_city_vs_year.mean()
plt.show()
# Filter for the year that had the highest mean temp
print(mean_temp_by_year[mean_temp_by_year == mean_temp_by_year.max()]) Avocado supply and demand
# Get the mean temp by city
# Scatter plot of avg_price vs. nb_sold with title
mean_temp_by_city = temp_by_country_city_vs_year.mean(axis="columns")
avocados.plot(x="nb_sold", y="avg_price", kind="scatter", title="Number of avocad
# Filter for the city that had the lowest mean temp os sold vs. average price")
print(mean_temp_by_city[mean_temp_by_city == mean_temp_by_city.min()]) # Show the plot
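
A shorthand worth knowing, though not used in the exercise: .idxmax() and .idxmin() return the index label of the extreme value directly; a minimal sketch on the same Series:

# Year with the highest worldwide mean temperature (illustrative alternative)
print(mean_temp_by_year.idxmax())
# City with the lowest mean temperature (illustrative alternative)
print(mean_temp_by_city.idxmin())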

Which avocado size is most popular?

# edited/added
import urllib.request
import pickle
avocados = pd.read_pickle("avoplotto.pkl")
# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
# Look at the first few rows of data
print(avocados.head())
# Get the total number of avocados sold of each size
nb_sold_by_size = avocados.groupby("size")["nb_sold"].sum()
# Create a bar plot of the number of avocados sold by size
nb_sold_by_size.plot(kind="bar")
# Show the plot
plt.show()

Changes in sales over time

# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
# Get the total number of avocados sold on each date
nb_sold_by_date = avocados.groupby("date")["nb_sold"].sum()
# Create a line plot of the number of avocados sold by date
nb_sold_by_date.plot(kind="line")
# Show the plot
plt.show()

Avocado supply and demand

# Scatter plot of avg_price vs. nb_sold with title
avocados.plot(x="nb_sold", y="avg_price", kind="scatter", title="Number of avocados sold vs. average price")
# Show the plot
plt.show()

Price of conventional vs. organic avocados

# Histogram of conventional avg_price
avocados[avocados["type"] == "conventional"]["avg_price"].hist()
# Histogram of organic avg_price
avocados[avocados["type"] == "organic"]["avg_price"].hist()
# Add a legend
plt.legend(["conventional", "organic"])
# Show the plot
plt.show()
# Modify histogram transparency to 0.5
avocados[avocados["type"] == "conventional"]["avg_price"].hist(alpha=0.5)
# Modify histogram transparency to 0.5
avocados[avocados["type"] == "organic"]["avg_price"].hist(alpha=0.5)
# Add a legend
plt.legend(["conventional", "organic"])
# Show the plot
plt.show()
# Modify bins to 20
avocados[avocados["type"] == "conventional"]["avg_price"].hist(bins=20, alpha=0.5)
# Modify bins to 20
avocados[avocados["type"] == "organic"]["avg_price"].hist(bins=20, alpha=0.5)
# Add a legend
plt.legend(["conventional", "organic"])
# Show the plot
plt.show()

Finding missing values

# edited/added
avocados_2016 = pd.read_csv('avocados_2016.csv')
cols_with_missing = ['small_sold', 'large_sold', 'xl_sold']
# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
# Check individual values for missing values
print(avocados_2016.isna())
# Check each column for missing values
print(avocados_2016.isna().any())
# Bar plot of missing values by variable
avocados_2016.isna().sum().plot(kind="bar")
# Show plot
plt.show()

Removing missing values

# Remove rows with missing values
avocados_complete = avocados_2016.dropna()
# Check if any columns contain missing values
print(avocados_complete.isna().any())

Replacing missing values

# List the columns with missing values
cols_with_missing = ["small_sold", "large_sold", "xl_sold"]
# Create histograms showing the distributions of cols_with_missing
avocados_2016[cols_with_missing].hist()
# Show the plot
plt.show()
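
The replacement step itself is not shown above; a minimal sketch of one option, assuming filling the missing sales counts with 0 is acceptable here:

# Fill missing values in avocados_2016 with 0 (illustrative; other fill strategies are possible)
avocados_filled = avocados_2016.fillna(0)
print(avocados_filled.isna().any())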
"date": ["2019-11-17", "2019-12-01"], # Print airline_totals_sorted
"small_sold": [10859987, 9291631], print(airline_totals_sorted)
"large_sold": [7674135, 6238096] # Save as airline_totals_sorted.csv
} airline_totals_sorted.to_csv("airline_totals_sorted.csv")
# Convert dictionary into DataFrame
avocados_2019 = pd.DataFrame(avocados_dict)
# Print the new DataFrame
print(avocados_2019)

CSV to DataFrame

# Read CSV as DataFrame called airline_bumping


airline_bumping = pd.read_csv('airline_bumping.csv') # edited/added
# Take a look at the DataFrame
print(airline_bumping.head())
# For each airline, select nb_bumped and total_passengers and sum
airline_totals = airline_bumping.groupby("airline")[["nb_bumped", "total_passenger
s"]].sum()
# Create new col, bumps_per_10k: no. of bumps per 10k passengers for each airline
airline_totals["bumps_per_10k"] = airline_totals["nb_bumped"] / airline_totals["tota
l_passengers"] * 10000
# Print airline_totals
print(airline_totals)
# Print airline_totals
print(airline_totals)

DataFrame to CSV

# Create airline_totals_sorted
airline_totals_sorted = airline_totals.sort_values("bumps_per_10k", ascending=Fals
e)
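
To sanity-check the export (an illustrative extra step, not part of the exercise), the file can be read straight back in:

# Read the saved CSV back, restoring airline as the index
print(pd.read_csv("airline_totals_sorted.csv", index_col=0).head())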
