Scalable Data Processing in R

Download as txt, pdf, or txt
Download as txt, pdf, or txt
You are on page 1 of 8

//4

How does processing time vary by data size?


# Attach microbenchmark, which supplies the timing harness used below
library(microbenchmark)

# Time generate-then-sort for normal vectors of increasing length.
# Each labelled expression (rnorm() plus sort()) is evaluated 10 times.
mb <- microbenchmark(
  "1e5"   = sort(rnorm(1e5)),
  "2.5e5" = sort(rnorm(2.5e5)),
  "5e5"   = sort(rnorm(5e5)),
  "7.5e5" = sort(rnorm(7.5e5)),
  "1e6"   = sort(rnorm(1e6)),
  times = 10
)

# Draw the timings recorded for each vector length
plot(mb)

Reading a big.matrix object


# Attach bigmemory for the big.matrix class
library(bigmemory)

# Import the CSV (first line is a header) as an integer big.matrix called x,
# naming the backing and descriptor files explicitly
x <- read.big.matrix(
  "mortgage-sample.csv",
  header = TRUE,
  type = "integer",
  backingfile = "mortgage-sample.bin",
  descriptorfile = "mortgage-sample.desc"
)

# Report the number of rows and columns of x
dim(x)

Attaching a big.matrix object


# Re-open the big.matrix described by mortgage-sample.desc as mort
mort <- attach.big.matrix("mortgage-sample.desc")

# Number of rows and columns of mort
dim(mort)

# Peek at the leading rows of mort
head(mort)

Creating tables with big.matrix objects


# Attach the big.matrix described by mortgage-sample.desc
mort <- attach.big.matrix("mortgage-sample.desc")

# Show rows 1 through 3 with every column
mort[1:3, ]

# Count how many mortgages fall in each value of the year column
table(mort[, "year"])

Data summary using bigsummary


# Attach biganalytics, which provides colmean() and the summary() call below
library(biganalytics)

# Mean of every column of mort
colmean(mort)

# Per-column summary of mort
summary(mort)

Copying matrices and big matrices


# Copy columns 1 to 3 of mort into a new big.matrix with its own
# backing and descriptor files
first_three <- deepcopy(
  mort,
  cols = 1:3,
  backingfile = "first_three.bin",
  descriptorfile = "first_three.desc"
)

# Plain assignment of first_three to a second name
first_three_2 <- first_three

# Overwrite the top-left cell of first_three with NA
first_three[1, 1] <- NA

# The exercise expects the NA to be visible through first_three_2 ...
first_three_2[1, 1]

# ... while the same cell of mort is expected to be unchanged
mort[1, 1]

Tabulating using bigtable


# Attach bigtabulate for bigtable()
library(bigtabulate)

# Tabulate the borrower_race column of mort
race_table <- bigtable(mort, "borrower_race")

# Label the counts with race_cat (defined outside this snippet), then print
names(race_table) <- race_cat
race_table

Borrower Race and Ethnicity by Year (I)


# Cross-tabulate borrower_race against year
race_year_table <- bigtable(mort, c("borrower_race", "year"))

# Convert the cross-tabulation to a data frame called rydf
rydf <- as.data.frame(race_year_table)

# Add a Race column holding the race_cat labels (defined outside this snippet)
rydf$Race <- race_cat

# Print the result
rydf

Female Proportion Borrowing


# Proportion of female borrowers among urban and among rural mortgages.
#
# x:    matrix-like object (base matrix or big.matrix) with columns named
#       "borrower_gender" (2 is treated as female) and "msa" (1 is treated as
#       urban, 0 as rural).
# rows: row selector for x (indices or a logical vector).
#
# Returns a length-2 numeric vector: c(urban proportion, rural proportion).
# A group with no matching rows yields NaN (0 / 0).
female_residence_prop <- function(x, rows) {
  # Pull only the two columns that are used. drop = FALSE keeps the subset a
  # matrix even when `rows` selects a single row, so the column lookups below
  # do not fail with "incorrect number of dimensions".
  x_subset <- x[rows, c("borrower_gender", "msa"), drop = FALSE]
  is_female <- x_subset[, "borrower_gender"] == 2
  is_urban <- x_subset[, "msa"] == 1
  is_rural <- x_subset[, "msa"] == 0
  # Find the proportion of female borrowers in urban areas
  prop_female_urban <- sum(is_female & is_urban) / sum(is_urban)
  # Find the proportion of female borrowers in rural areas
  prop_female_rural <- sum(is_female & is_rural) / sum(is_rural)
  c(prop_female_urban, prop_female_rural)
}

# Find the urban and rural proportions of female borrowers in 2015: the
# logical vector mort[, "year"] == 2015 selects the rows passed to the function


female_residence_prop(mort, mort[, "year"] == 2015)

Split
# Split the row numbers of the mortgage data by year.
# seq_len() is used instead of 1:nrow(mort) so that a zero-row matrix yields
# an empty index vector rather than c(1, 0).
spl <- split(seq_len(nrow(mort)), mort[, "year"])

# Inspect the structure of the resulting list of row-index vectors
str(spl)

Apply
# Compute the female residence proportions separately for every group of rows
# in spl; the result keeps the names of spl
all_years <- lapply(spl, function(rows) {
  female_residence_prop(mort, rows)
})

# Inspect the structure of the per-group results
str(all_years)

Combine
# Collect the per-year results as the rows of a matrix.
# do.call(rbind, ...) always returns a matrix, even when all_years has a
# single element (Reduce(rbind, ...) would return the bare vector, and the
# dimnames assignment below would then fail).
prop_female <- do.call(rbind, unname(all_years))

# Name the rows after the years and the columns after the two proportions
# (the second label previously read "prop_femal_rural")
dimnames(prop_female) <- list(
  names(all_years),
  c("prop_female_urban", "prop_female_rural")
)

# View the matrix
prop_female

Visualizing Female Proportion Borrowing


# Attach tidyr (reshaping) and ggplot2 (plotting)
library(tidyr)
library(ggplot2)

# Turn the proportion matrix into a data frame
prop_female_df <- as.data.frame(prop_female)

# Copy the row names into an explicit Year column
prop_female_df$Year <- row.names(prop_female_df)

# Stack every column except Year into Region/Prop pairs
prop_female_long <- gather(prop_female_df, Region, Prop, -Year)

# Draw one line per Region showing Prop across Year
ggplot(
  prop_female_long,
  aes(x = Year, y = Prop, group = Region, color = Region)
) +
  geom_line()

The Borrower Income Ratio


# Attach biganalytics and dplyr (the latter supplies %>% and mutate())
library(biganalytics)
library(dplyr)

# Per-column summary of mort
summary(mort)

# Cross-tabulate borrower_income_ratio by year, convert the table to a
# data.frame, and label its rows with the income-ratio categories in BIR
bir_df_wide <- bigtable(mort, c("borrower_income_ratio", "year")) %>%
  as.data.frame() %>%
  mutate(BIR = c(">=0,<=50%", ">50, <=80%", ">80%"))

# Print the wide table
bir_df_wide

Tidy Big Tables


# Attach tidyr (reshaping) and ggplot2 (plotting)
library(tidyr)
library(ggplot2)

# Reshape the wide table to long form (Year/Count pairs, BIR kept as is),
# then draw one line per BIR category
bir_df_wide %>%
  gather(Year, Count, -BIR) %>%
  ggplot(aes(x = Year, y = Count, group = BIR, color = BIR)) +
  geom_line()

//2
//4

Foldable operations (I)


# Range of a vector, or of a list of vectors (e.g. per-chunk ranges).
#
# x: a vector, or a list whose elements are vectors.
#
# Returns c(min, max) over all values. An empty list returns NULL.
foldable_range <- function(x) {
  if (is.list(x)) {
    if (length(x) == 0L) {
      return(NULL)
    }
    # If x is a list then reduce it by the min and max of each element.
    # Reduce() returns a one-element list's element untouched, so the outer
    # min()/max() make sure the result is still a single value in that case.
    c(min(Reduce(min, x)), max(Reduce(max, x)))
  } else {
    # Otherwise, assume it's a vector and find its range
    range(x)
  }
}

# Verify that foldable_range() works on the record_number column; a single
# column extracted from mort is passed, so the vector branch is the one used


foldable_range(mort[, "record_number"])

Foldable operations (II)


# Split the row numbers of the mortgage data by year.
# seq_len() is used instead of 1:nrow(mort) so that a zero-row matrix yields
# an empty index vector rather than c(1, 0).
spl <- split(seq_len(nrow(mort)), mort[, "year"])

# Compute the record_number range within each year, then fold those per-year
# ranges into one overall range
foldable_range(
  Map(function(s) foldable_range(mort[s, "record_number"]), spl)
)

Compare read.delim() and read.delim.raw()


# Attach iotools (read.delim.raw) and microbenchmark (timing)
library(iotools)
library(microbenchmark)

# Time both readers on the same comma-separated file, five runs each
microbenchmark(
  # Base reader
  read.delim("mortgage-sample.csv", header = FALSE, sep = ","),
  # iotools reader
  read.delim.raw("mortgage-sample.csv", header = FALSE, sep = ","),
  times = 5
)
Reading raw data and turning it into a data structure
# Load the whole of mortgage-sample.csv into a raw vector
raw_file_content <- readAsRaw("mortgage-sample.csv")

# Parse the raw bytes into an integer matrix, skipping the first line
mort_mat <- mstrsplit(
  raw_file_content,
  sep = ",",
  type = "integer",
  skip = 1
)

# Peek at the leading rows of the matrix
head(mort_mat)

# Parse the same bytes into a data.frame of 16 integer columns,
# again skipping the first line
mort_df <- dstrsplit(
  raw_file_content,
  sep = ",",
  col_types = rep("integer", 16),
  skip = 1
)

# Peek at the leading rows of the data.frame
head(mort_df)

Reading chunks in as a matrix


# Define the function to apply to each chunk
make_table <- function(chunk) {
# Read each chunk as a matrix
x <- mstrsplit(chunk, type = "integer", sep = ",")
# Create a table of the number of borrowers (column 3) for each chunk
table(x[, 3])
}

Reading chunks in as a data.frame


# Define the function to apply to each chunk
make_msa_table <- function(chunk) {
# Read each chunk as a data frame
x <- dstrsplit(chunk, col_types = rep("integer", length(col_names)), sep
= ",")
# Set the column names of the data frame that's been read
colnames(x) <- col_names
# Create new column, msa_pretty, with a string description of where the
borrower lives
x$msa_pretty <- msa_map[x$msa + 1]
# Create a table from the msa_pretty column
table(x$msa_pretty)
}

# Open mortgage-sample.csv as a binary read connection
fc <- file("mortgage-sample.csv", "rb")

# Consume the first line so the header is not part of any chunk
readLines(fc, n = 1)

# Run make_msa_table over the rest of the file, chunk by chunk
counts <- chunk.apply(fc, make_msa_table, CH.MAX.SIZE = 1e5)

# Release the connection
close(fc)

# Add the per-chunk counts together, column by column
colSums(counts)

Parallelizing calls to chunk.apply


# Read mortgage-sample.csv in chunks and tabulate each chunk with
# make_msa_table (defined outside this function).
#
# parallel: passed straight through to chunk.apply()'s `parallel` argument.
#
# Returns the value of chunk.apply(). The original version returned the
# result of close(fc) instead, discarding the computed counts.
iotools_read_fun <- function(parallel) {
  fc <- file("mortgage-sample.csv", "rb")
  # Close the connection on every exit path, including errors in chunk.apply
  on.exit(close(fc), add = TRUE)
  # Consume the header line so it is not part of any chunk
  readLines(fc, n = 1)
  chunk.apply(fc, make_msa_table,
              CH.MAX.SIZE = 1e5, parallel = parallel)
}

# Time the chunked reader with different `parallel` settings, 20 runs each
microbenchmark(
  # parallel = 1
  iotools_read_fun(parallel = 1),
  # parallel = 3
  iotools_read_fun(parallel = 3),
  times = 20
)

Race and Ethnic Representation in the Mortgage Data


# Tabulate the borrower_race column of mort
race_table <- bigtable(mort, "borrower_race")

# Replace the numeric codes in the names with the matching race_cat labels
# (race_cat is defined outside this snippet)
names(race_table) <- race_cat[as.numeric(names(race_table))]

# Share of each of the first seven categories among those seven
race_table[1:7] / sum(race_table[1:7])

Comparing the Borrower Race/Ethnicity and their Proportions


# Tabulate borrower_race chunk by chunk, reading straight from the file.
# mort_names (defined outside this snippet) supplies the column names.
race_table_chunks <- chunk.apply(
  "mortgage-sample.csv",
  function(chunk) {
    x <- mstrsplit(chunk, sep = ",", type = "integer")
    colnames(x) <- mort_names
    table(x[, "borrower_race"])
  },
  CH.MAX.SIZE = 1e5
)

# Sum the per-chunk counts, column by column
race_table <- colSums(race_table_chunks)

# Share of each of the first seven categories among those seven
borrower_proportion <- race_table[1:7] / sum(race_table[1:7])

# Two-row matrix: population proportions (pop_proportion, defined outside this
# snippet) above borrower proportions, one column per race_cat label
matrix(
  c(pop_proportion, borrower_proportion),
  byrow = TRUE,
  nrow = 2,
  dimnames = list(
    c("Population Proportion", "Borrower Proportion"),
    race_cat[1:7]
  )
)

Looking for Predictable Missingness


# Create a variable indicating if borrower_race is missing in the mortgage
# data (code 9 is the value treated as missing here)
borrower_race_ind <- mort[, "borrower_race"] == 9

# Create a factor variable indicating the affordability
affordability_factor <- factor(mort[, "affordability"])

# Perform a logistic regression of the missingness indicator on affordability
summary(glm(borrower_race_ind ~ affordability_factor, family = binomial))

//3
Borrower Race and Ethnicity by Year (II)
# Open the CSV as a binary connection and consume the header line
fc <- file("mortgage-sample.csv", "rb")
readLines(fc, n = 1)

# Per-chunk worker: parse one chunk into an integer matrix, name its columns
# with mort_names (defined outside this snippet), and cross-tabulate
# borrower_race by year
make_table <- function(chunk) {
  chunk_mat <- mstrsplit(chunk, sep = ",", type = "integer")
  colnames(chunk_mat) <- mort_names
  bigtable(chunk_mat, c("borrower_race", "year"))
}

# Apply the worker to every chunk read from the connection
race_year_table <- chunk.apply(fc, make_table)

# Release the connection
close(fc)

# Convert the combined table to a data frame
rydf <- as.data.frame(race_year_table)

# Add a Race column holding the race_cat labels (defined outside this snippet)
rydf$Race <- race_cat

Visualizing the Adjusted Demographic Trends


# View rydf
rydf

# View pop_proportion
pop_proportion

# Gather on all variables except Race
rydfl <- gather(rydf, Year, Count, -Race)

# Create a new adjusted count variable: each count divided by the
# pop_proportion entry looked up by that row's Race value
rydfl$Adjusted_Count <- rydfl$Count / pop_proportion[rydfl$Race]

# Plot. The `+` must end the line: on a line of its own it would start a new
# statement, and the unary `+ geom_line()` would fail.
ggplot(
  rydfl,
  aes(x = Year, y = Adjusted_Count, group = Race, color = Race)
) +
  geom_line()

Relative change in demographic trend


# View rydf
rydf

# Get the first column of rydf; it is saved before normalising because the
# first column itself is overwritten below
column1 <- rydf[, 1]

# Normalize the first 8 columns by the first column. Dividing a data frame
# by a vector of length nrow() divides every column element-wise by it.
rydf[, 1:8] <- rydf[, 1:8] / column1

# Convert the data to long format
rydf_long <- gather(rydf, Year, Proportion, -Race)

# Plot. The `+` must end the line: on a line of its own it would start a new
# statement, and the unary `+ geom_line()` would fail.
ggplot(
  rydf_long,
  aes(x = Year, y = Proportion, group = Race, color = Race)
) +
  geom_line()

Borrower Region by Year


# Open the CSV as a binary connection and consume the header line
fc <- file("mortgage-sample.csv", "rb")
readLines(fc, n = 1)

# Per-chunk worker: parse one chunk into an integer matrix, name its columns
# with mort_names (defined outside this snippet), and cross-tabulate msa by
# year
make_table <- function(chunk) {
  chunk_mat <- mstrsplit(chunk, sep = ",", type = "integer")
  colnames(chunk_mat) <- mort_names
  bigtable(chunk_mat, c("msa", "year"))
}

# Apply the worker to every chunk read from the connection
msa_year_table <- chunk.apply(fc, make_table)

# Release the connection
close(fc)

# Convert the combined table to a data frame
df_msa <- as.data.frame(msa_year_table)

# Add an MSA column labelling the two rows
df_msa$MSA <- c("rural", "city")

# Reshape to long form: every column except MSA becomes a Year/Count pair
df_msa_long <- gather(df_msa, Year, Count, -MSA)

# Draw one line per MSA value showing Count across Year
ggplot(
  df_msa_long,
  aes(x = Year, y = Count, group = MSA, color = MSA)
) +
  geom_line()

Who is securing federally guaranteed loans?


# Cross-tabulate borrower_income_ratio against federal_guarantee
ir_by_fg <- bigtable(mort, c("borrower_income_ratio", "federal_guarantee"))

# Label rows with income_cat and columns with guarantee_cat (both defined
# outside this snippet)
dimnames(ir_by_fg) <- list(income_cat, guarantee_cat)

# Turn counts into row-wise proportions: the row sums are recycled down the
# columns, so every entry is divided by the total of its own row
ir_by_fg <- ir_by_fg / rowSums(ir_by_fg)

# Print the table of proportions
ir_by_fg

You might also like