0% found this document useful (0 votes)
359 views

Visualizing Big Data With Trelliscope

This document contains code for analyzing and visualizing ride share and bike share data using R. It includes: 1. Code to summarize the number of daily rides and plot them over time. 2. Code to examine distributions of fare amounts, payment types, and their relationship to trip duration using histograms and hexbin plots. 3. Code to facet plots of daily rides by payment type and day of week. 4. Additional code examples visualize and explore bike share data over time by membership status, day of week, and other variables. Plots are created using ggplot2 and facets, and some are visualized interactively using Trelliscope.

Uploaded by

Octavio Flores
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
359 views

Visualizing Big Data With Trelliscope

This document contains code for analyzing and visualizing ride share and bike share data using R. It includes: 1. Code to summarize the number of daily rides and plot them over time. 2. Code to examine distributions of fare amounts, payment types, and their relationship to trip duration using histograms and hexbin plots. 3. Code to facet plots of daily rides by payment type and day of week. 4. Additional code examples visualize and explore bike share data over time by membership status, day of week, and other variables. Plots are created using ggplot2 and facets, and some are visualized interactively using Trelliscope.

Uploaded by

Octavio Flores
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 7

Daily ride counts

library(dplyr)
library(ggplot2)

# Count taxi rides per pickup date: one output row per distinct pickup_date,
# with the row count stored in n_rides.
daily_count <- summarise(group_by(tx, pickup_date), n_rides = n())

# Line plot of the per-day ride count against the pickup date.
ggplot(data = daily_count, mapping = aes(x = pickup_date, y = n_rides)) +
  geom_line()

Distribution of cab fare amount


library(ggplot2)

# Histogram of total_amount, drawn on a log10-scaled x axis.
ggplot(data = tx, mapping = aes(x = total_amount)) +
  scale_x_log10() +
  geom_histogram()

Distribution of payment type


library(ggplot2)

# Bar chart with one bar per payment_type; bar height is the row count.
ggplot(data = tx, mapping = aes(x = payment_type)) +
  geom_bar()

Relationship between trip duration and total fare


library(ggplot2)

# Hexagon-binned plot of total_amount (y) against trip_duration (x),
# with both axes on a log10 scale and 75 bins per direction.
ggplot(data = tx, mapping = aes(x = trip_duration, y = total_amount)) +
  scale_x_log10() +
  scale_y_log10() +
  geom_hex(bins = 75)

Faceting daily rides


library(dplyr)
library(ggplot2)

# Count rides per (pickup date, pickup day of week, payment type), keeping
# only "Card" and "Cash" payments.
# summarise() with several grouping keys drops only the last one, so the
# result would stay grouped; ungroup() hands back a plain tibble.
daily_count <- tx %>%
  filter(payment_type %in% c("Card", "Cash")) %>%
  group_by(pickup_date, pickup_dow, payment_type) %>%
  summarise(n_rides = n()) %>%
  ungroup()

# Scatter plot of daily ride counts, with payment types as facet rows and
# days of the week as facet columns.
ggplot(daily_count, aes(pickup_date, n_rides)) +
  geom_point() +
  facet_grid(payment_type ~ pickup_dow) +
  coord_fixed(ratio = 0.4)

Tip amount distribution faceted by payment type


library(ggplot2)

# Histogram of tip_amount, one panel per payment_type stacked in a single
# column, each panel with its own y range.
# 0.01 is added before the log10 transform so zero tips remain plottable.
ggplot(data = tx, mapping = aes(x = tip_amount + 0.01)) +
  scale_x_log10() +
  geom_histogram() +
  facet_wrap(facets = ~ payment_type, ncol = 1, scales = "free_y")

Comparing fare distribution by payment type


library(ggplot2)
library(dplyr)
library(tidyr)

# Reshape to long form: one row per (ride, amount_type), where amount_type is
# either "total_amount" or "total_no_tip" (total minus tip).
# pivot_longer() replaces the superseded gather(); the resulting columns
# (payment_type, amount_type, amount) are the same, only row order differs.
amount_compare <- tx_pop %>%
  mutate(total_no_tip = total_amount - tip_amount) %>%
  select(total_amount, total_no_tip, payment_type) %>%
  pivot_longer(
    cols = -payment_type,
    names_to = "amount_type",
    values_to = "amount"
  )

# Quantile plot of the amounts against a uniform distribution, coloured by
# payment type, one panel per amount_type, y axis limited to 3..20.
ggplot(amount_compare, aes(sample = amount, color = payment_type)) +
  geom_qq(distribution = stats::qunif, shape = 21) +
  facet_wrap(~ amount_type) +
  ylim(c(3, 20))

Trelliscope faceting gapminder by country


library(ggplot2)
library(trelliscopejs)

# Life expectancy over time as a line, rendered as a Trelliscope display
# with one panel per country/continent combination.
ggplot(data = gapminder, mapping = aes(x = year, y = lifeExp)) +
  geom_line() +
  facet_trelliscope(facets = ~ country + continent)
///////////////////
library(ggplot2)
library(trelliscopejs)

# Same line plot as a named Trelliscope display, laid out as 1 row x 2
# columns of panels per page.
ggplot(data = gapminder, mapping = aes(x = year, y = lifeExp)) +
  geom_line() +
  facet_trelliscope(
    facets = ~ country + continent,
    nrow = 1,
    ncol = 2,
    name = "lifeExp_by_country",
    desc = "Life expectancy vs. year per country"
  )

//3

Customizing the gapminder display


library(trelliscopejs)
library(ggplot2)

# Points plus a straight-line fit (no confidence band) per country, shown as
# a Trelliscope display with "sliced" axis scales and automatic cognostics.
ggplot(data = gapminder, mapping = aes(x = year, y = lifeExp)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_trelliscope(
    facets = ~ country + continent,
    nrow = 1,
    ncol = 2,
    scales = "sliced",
    name = "lifeExp_by_country",
    desc = "Life expectancy vs. year for 142 countries.",
    auto_cog = TRUE
  )
//3

Adding custom cognostics


library(ggplot2)
library(dplyr)
library(gapminder)
library(trelliscopejs)
# Replace every space in x with a dash; other characters are left untouched.
space_to_dash <- function(x) {
  gsub(pattern = " ", replacement = "-", x = x, fixed = TRUE)
}

# Add two per-country columns:
#   delta_lifeExp - last lifeExp value minus the first one within a country
#   ihme_link     - healthdata.org URL built from the dash-separated name
# The grouping is only needed for the tail()/head() computation, so it is
# dropped again with ungroup().
gap <- gapminder %>%
  group_by(country) %>%
  mutate(
    delta_lifeExp = tail(lifeExp, 1) - head(lifeExp, 1),
    ihme_link = paste0("http://www.healthdata.org/", space_to_dash(country))
  ) %>%
  ungroup()

# Attach a description to the delta_lifeExp cognostic. The string is kept on
# a single line so no newline ends up inside the description.
gap$delta_lifeExp <- cog(
  gap$delta_lifeExp,
  desc = "Overall change in life expectancy"
)
# Mark the link cognostic as a default panel label.
gap$ihme_link <- cog(gap$ihme_link, default_label = TRUE)

# Trelliscope display of life expectancy vs. year, one panel per
# country/continent, with a "same" x scale and a "sliced" y scale.
ggplot(gap, aes(year, lifeExp)) +
  geom_point() +
  facet_trelliscope(
    ~ country + continent,
    name = "lifeExp_by_country",
    desc = "Life expectancy vs. year.",
    nrow = 1, ncol = 2,
    scales = c("same", "sliced")
  )

//1

Grouping and nesting


library(dplyr)
library(tidyr)

# Collapse stocks to one row per symbol; the remaining columns of each
# symbol are stored as a nested data frame.
by_symbol <- nest(group_by(stocks, symbol))
///////////
//////////2
library(dplyr)
library(tidyr)
library(purrr)

# Collapse stocks to one row per symbol; the remaining columns of each
# symbol are stored as a nested data frame.
by_symbol <- nest(group_by(stocks, symbol))

# Smallest value found in the `volume` column of x.
min_volume_fn <- function(x) {
  volumes <- x$volume
  min(volumes)
}

# Add min_volume: min_volume_fn applied to each symbol's nested data frame,
# returned as a numeric column.
by_symbol_min <- mutate(
  by_symbol,
  min_volume = map_dbl(data, min_volume_fn)
)

Stock price display


library(trelliscopejs)
library(dplyr)
library(purrr)
library(plotly)

# Build a plotly "ohlc" chart from d, reading the date, open, high, low and
# close columns of d.
ohlc_plot <- function(d) {
  plot_ly(
    data = d,
    type = "ohlc",
    x = ~date,
    open = ~open,
    high = ~high,
    low = ~low,
    close = ~close
  )
}

# Add a panel column holding one OHLC plot per symbol.
by_symbol_plot <- mutate(by_symbol, panel = map_plot(data, ohlc_plot))
////////////////
//3
////////////////
library(trelliscopejs)
library(dplyr)
library(purrr)
library(plotly)

# Build a plotly "ohlc" chart from d, reading the date, open, high, low and
# close columns of d.
ohlc_plot <- function(d) {
  plot_ly(
    data = d,
    type = "ohlc",
    x = ~date,
    open = ~open,
    high = ~high,
    low = ~low,
    close = ~close
  )
}

# Add a panel column holding one OHLC plot per symbol.
by_symbol_plot <- mutate(by_symbol, panel = map_plot(data, ohlc_plot))

# Render the panels as a Trelliscope display named "ohlc_top500".
trelliscope(x = by_symbol_plot, name = "ohlc_top500")

//2

Adding cognostics
library(trelliscopejs)
library(dplyr)

# Add market_cap_log: log10 of market_cap, wrapped as a cognostic that
# carries its own description.
by_symbol <- by_symbol %>%
  mutate(
    market_cap_log = cog(
      val = log10(market_cap),
      desc = "log base 10 market capitalization"
    )
  )

Cognostics from nested data frames


library(trelliscopejs)
library(dplyr)
library(purrr)

# Percentage change from the first `open` value of x to its last `close`.
annual_return <- function(x) {
  first_open <- head(x$open, 1)
  last_close <- tail(x$close, 1)
  100 * (last_close - first_open) / first_open
}

# Add a list-column `stats` holding, for every symbol, a one-row tibble with
# the mean close, the mean volume and the annual return of its nested data.
# tibble() is used in place of the deprecated data_frame() constructor.
by_symbol_avg <- mutate(
  by_symbol,
  stats = map(data, function(x) {
    tibble(
      mean_close = mean(x$close),
      mean_volume = mean(x$volume),
      annual_return = annual_return(x)
    )
  })
)

//4

Customizing the stock display


library(trelliscopejs)

# Build the "ohlc_top500" Trelliscope display from by_symbol with
# 600 x 300 panels.
# NOTE(review): presumably by_symbol already carries a panel column at this
# point; confirm against the code that prepares it.
p <- trelliscope(
  x = by_symbol,
  name = "ohlc_top500",
  desc = "a plot",
  height = 300,
  width = 600
)

# Printing the object opens the display.
p

Visualizing Pokemon
library(dplyr)
library(trelliscopejs)

# Plotting data: keep five columns, turn pokemon into a cognostic that is
# shown as a default label, and build an image panel from each url_image.
pokemon2 <- pokemon %>%
  select(pokemon, type_1, attack, generation_id, url_image) %>%
  mutate(pokemon = cog(val = pokemon, default_label = TRUE)) %>%
  mutate(panel = img_panel(url_image))

# Show the panels in a 3 x 6 grid per page.
trelliscope(x = pokemon2, name = "pokemon", nrow = 3, ncol = 6)

//1

Number of daily rides


library(dplyr)
library(ggplot2)

# Count rides per (start_day, weekday).
# summarise() drops only the last grouping key, so ungroup() is added to
# return a plain tibble instead of one still grouped by start_day.
daily <- bike %>%
  group_by(start_day, weekday) %>%
  summarise(n = n()) %>%
  ungroup()

# Scatter plot of daily ride counts over time, coloured by weekday.
ggplot(daily, aes(start_day, n, color = weekday)) +
  geom_point()

Examining time-of-day
library(dplyr)
library(ggplot2)
# Count rides per (start_wk, start_hod, weekday).
# ungroup() removes the grouping that summarise() leaves behind.
week_hod <- bike %>%
  group_by(start_wk, start_hod, weekday) %>%
  summarise(n = n()) %>%
  ungroup()

# Ride count against start_wk, one facet column per start_hod value,
# coloured by weekday, with a square-root y scale.
ggplot(week_hod, aes(start_wk, n, color = weekday)) +
  geom_point() +
  facet_grid(~ start_hod) +
  scale_y_sqrt()

Effect of membership and weekday


library(dplyr)
library(ggplot2)

# Count rides per (start_wk, start_hod, weekday, membership).
# ungroup() removes the grouping that summarise() leaves behind.
wk_memb_hod <- bike %>%
  group_by(start_wk, start_hod, weekday, membership) %>%
  summarise(n = n()) %>%
  ungroup()

# Ride count against start_wk, with membership as facet rows and start_hod
# as facet columns, coloured by weekday, with a square-root y scale.
ggplot(wk_memb_hod, aes(start_wk, n, color = weekday)) +
  geom_point() +
  facet_grid(membership ~ start_hod) +
  scale_y_sqrt()

Daily plots
library(dplyr)
library(ggplot2)

# Rides with start_mon == 5, counted per (start_day, start_hod, membership).
# ungroup() removes the grouping that summarise() leaves behind.
daily_may <- bike %>%
  filter(start_mon == 5) %>%
  group_by(start_day, start_hod, membership) %>%
  summarise(n = n()) %>%
  ungroup()

# Ride count against start_hod, coloured by membership, one panel per
# start_day arranged seven panels per row.
ggplot(daily_may, aes(start_hod, n, color = membership)) +
  geom_point() +
  facet_wrap(~ start_day, ncol = 7)

Looking at all days


library(ggplot2)

# Ride count against start_hod as a Trelliscope display: one panel per
# start_day, 3 rows x 7 columns of panels per page.
ggplot(
  data = daily_may,
  mapping = aes(x = start_hod, y = n, color = membership)
) +
  geom_point() +
  facet_trelliscope(facets = ~ start_day, nrow = 3, ncol = 7)

Augmenting the Data: Route Summary Statistics


library(trelliscopejs)
library(ggplot2)
library(dplyr)

# Build a Google Maps directions URL with travelmode=bicycling.
#
# Arguments:
#   start_lat, start_lon - coordinates used for the "origin" parameter
#   end_lat, end_lon     - coordinates used for the "destination" parameter
#   (presumably decimal degrees; confirm against the data source)
#
# Returns a character vector of URLs; the inputs are pasted element-wise, so
# vectors of coordinates produce one URL per element.
#
# The base URL previously carried a stray proxy host in front of the real
# address, so every generated link pointed at the wrong site.
make_gmap_url <- function(start_lat, start_lon, end_lat, end_lon) {
  paste0(
    "https://www.google.com/maps/dir/?api=1",
    "&origin=", start_lat, ",", start_lon,
    "&destination=", end_lat, ",", end_lon,
    "&travelmode=bicycling"
  )
}

# Per route (start_station_code, end_station_code), add:
#   tot_rides    - sum of n over the route's rows
#   weekday_diff - mean n on "workweek" rows minus mean n on "weekend" rows
#   map_url      - Google Maps cycling-directions link for the route
# The grouping is only needed for the per-route aggregates, so it is dropped
# again with ungroup().
route_hod_updated <- route_hod %>%
  group_by(start_station_code, end_station_code) %>%
  mutate(
    tot_rides = sum(n),
    weekday_diff = mean(n[weekday == "workweek"]) -
      mean(n[weekday == "weekend"]),
    map_url = make_gmap_url(start_lat, start_lon, end_lat, end_lon)
  ) %>%
  ungroup()

Visualizing the Data: Counts by Hour-of-Day


library(trelliscopejs)
library(ggplot2)

# Ride count against start_hod, coloured by weekday, as a Trelliscope
# display with one panel per start/end station pair (2 x 4 panels per page).
# The legend is hidden.
ggplot(
  data = route_hod,
  mapping = aes(x = start_hod, y = n, color = weekday)
) +
  geom_point(size = 3) +
  facet_trelliscope(
    facets = ~ start_station_name + end_station_name,
    nrow = 2,
    ncol = 4
  ) +
  theme(legend.position = "none")

//3

You might also like