0% found this document useful (0 votes)
137 views

Scalable Data Processing in R

This document discusses various techniques for working with large datasets in R, including using big data packages to efficiently store, access, and analyze data that does not fit into memory. It shows how to load large CSV files as big matrix objects, perform operations like sorting, tabulating, and summarizing data in a distributed manner. It also demonstrates using parallel processing to further improve performance of analytic tasks on large datasets.

Uploaded by

Octavio Flores
Copyright
© All Rights Reserved
Available Formats
Download as TXT or PDF, or read online from Scribd
0% found this document useful (0 votes)
137 views

Scalable Data Processing in R

This document discusses various techniques for working with large datasets in R, including using big data packages to efficiently store, access, and analyze data that does not fit into memory. It shows how to load large CSV files as big matrix objects, perform operations like sorting, tabulating, and summarizing data in a distributed manner. It also demonstrates using parallel processing to further improve performance of analytic tasks on large datasets.

Uploaded by

Octavio Flores
Copyright
© All Rights Reserved
Available Formats
Download as TXT or PDF, or read online from Scribd
You are on page 1/ 8

//4

How does processing time vary by data size?


# Load the microbenchmark package
library(microbenchmark)

# Time how sorting scales as the vector length grows from 1e5 to 1e6;
# each expression is run 10 times to get a timing distribution.
mb <- microbenchmark(
  "1e5"   = sort(rnorm(1e5)),
  "2.5e5" = sort(rnorm(2.5e5)),
  "5e5"   = sort(rnorm(5e5)),
  "7.5e5" = sort(rnorm(7.5e5)),
  "1e6"   = sort(rnorm(1e6)),
  times = 10
)

# Visualize the timing distribution for each vector size
plot(mb)

Reading a big.matrix object


# Load the bigmemory package
library(bigmemory)

# Read the CSV into a file-backed big.matrix: the data lives on disk in
# the .bin backing file, and the .desc descriptor lets us re-attach it
# later without re-parsing the CSV.
x <- read.big.matrix(
  "mortgage-sample.csv",
  header = TRUE,
  type = "integer",
  backingfile = "mortgage-sample.bin",
  descriptorfile = "mortgage-sample.desc"
)

# Check the number of rows and columns
dim(x)

Attaching a big.matrix object


# Re-attach the file-backed matrix via its descriptor file
mort <- attach.big.matrix("mortgage-sample.desc")

# Dimensions of the attached matrix
dim(mort)

# Preview the first six rows
head(mort)

Creating tables with big.matrix objects


# Attach the mortgage data as a big.matrix
mort <- attach.big.matrix("mortgage-sample.desc")

# Inspect the first three rows
mort[1:3, ]

# Count the number of mortgages for each year in the data set
table(mort[, "year"])

Data summary using bigsummary


# Load the biganalytics package
library(biganalytics)

# Column means of the big.matrix, computed without loading it into RAM
colmean(mort)

# Per-column summary of the data via biganalytics' summary method
summary(mort)

Copying matrices and big matrices


# deepcopy() makes an independent, file-backed copy of columns 1-3
first_three <- deepcopy(mort, cols = 1:3,
                        backingfile = "first_three.bin",
                        descriptorfile = "first_three.desc")

# Plain assignment copies only the handle, not the data on disk
first_three_2 <- first_three

# Mutate the data through one handle...
first_three[1, 1] <- NA

# ...and the change is visible through the other handle too
first_three_2[1, 1]

# mort is unaffected, because deepcopy() produced a detached copy
mort[1, 1]

Tabulating using bigtable


# Load the bigtabulate package
library(bigtabulate)

# Tabulate the borrower_race column of the big.matrix
race_table <- bigtable(mort, "borrower_race")

# Label the counts with the category names
# (race_cat is assumed to be defined earlier in the session)
names(race_table) <- race_cat
race_table

Borrower Race and Ethnicity by Year (I)


# Cross-tabulate borrower race against year
race_year_table <- bigtable(mort, c("borrower_race", "year"))

# Convert race_year_table to a data frame
rydf <- as.data.frame(race_year_table)

# Attach the human-readable race/ethnicity labels as a new column
rydf$Race <- race_cat

# Inspect the result
rydf

Female Proportion Borrowing


# Proportion of borrowers who are female (gender code 2), computed
# separately for urban (msa == 1) and rural (msa == 0) areas.
#
# x:    a matrix-like object with "borrower_gender" and "msa" columns
# rows: row indices (or a logical mask) selecting the subset to analyze
#
# Returns a length-2 numeric vector: c(urban proportion, rural proportion).
female_residence_prop <- function(x, rows) {
  sub <- x[rows, ]
  is_female <- sub[, "borrower_gender"] == 2
  in_urban <- sub[, "msa"] == 1
  in_rural <- sub[, "msa"] == 0
  # Share of urban borrowers who are female
  p_urban <- sum(is_female & in_urban) / sum(in_urban)
  # Share of rural borrowers who are female
  p_rural <- sum(is_female & in_rural) / sum(in_rural)
  c(p_urban, p_rural)
}

# Find the proportion of female borrowers in 2015.
# The logical mask over the year column selects the 2015 rows;
# the result is c(urban proportion, rural proportion).


female_residence_prop(mort, mort[, "year"] == 2015)

Split
# Split the row numbers of the mortgage data by year.
# seq_len() is used instead of 1:nrow(mort), which would yield the
# bogus sequence c(1, 0) if the matrix ever had zero rows.
spl <- split(seq_len(nrow(mort)), mort[, "year"])

# Inspect the structure: a named list of row-index vectors, one per year
str(spl)

Apply
# For each year's row indices, compute the female residence proportions;
# lapply preserves the year names from spl on the result list
all_years <- lapply(spl, function(rows) female_residence_prop(mort, rows))

# Inspect the structure of the per-year results
str(all_years)

Combine
# Stack the per-year results into a matrix, one row per year.
# do.call(rbind, ...) binds the whole list in one call rather than
# growing the matrix pairwise as Reduce(rbind, ...) does.
prop_female <- do.call(rbind, all_years)

# Label rows by year and columns by region.
# (Fixes the "prop_femal_rural" typo in the original column name.)
dimnames(prop_female) <- list(names(all_years),
                              c("prop_female_urban", "prop_female_rural"))

# View the matrix
prop_female

Visualizing Female Proportion Borrowing


# Load the tidyr and ggplot2 packages
library(tidyr)
library(ggplot2)

# Convert the proportion matrix to a data frame
prop_female_df <- as.data.frame(prop_female)

# Add a Year column from the row names
prop_female_df$Year <- row.names(prop_female_df)

# Reshape to long format: one row per (Year, Region) pair.
# pivot_longer() supersedes gather() in modern tidyr.
prop_female_long <- pivot_longer(prop_female_df, cols = -Year,
                                 names_to = "Region", values_to = "Prop")

# Line plot of the proportion over time, one line per region
ggplot(prop_female_long,
       aes(x = Year, y = Prop, group = Region, color = Region)) +
  geom_line()

The Borrower Income Ratio


# Load biganalytics and dplyr packages
library(biganalytics)
library(dplyr)

# Summarize the big.matrix
summary(mort)

# Cross-tabulate income ratio by year, then label the three categories
bir_df_wide <- bigtable(mort, c("borrower_income_ratio", "year")) %>%
  # Convert the table to a data.frame
  as.data.frame() %>%
  # BIR holds the label for each income-ratio row
  mutate(BIR = c(">=0,<=50%", ">50, <=80%", ">80%"))

bir_df_wide

Tidy Big Tables


# Load the tidyr and ggplot2 packages
library(tidyr)
library(ggplot2)

bir_df_wide %>%
  # Wide -> long: one row per (BIR, Year).
  # pivot_longer() supersedes gather() in modern tidyr.
  pivot_longer(-BIR, names_to = "Year", values_to = "Count") %>%
  # Line plot of counts over time, one line per income-ratio category
  ggplot(aes(x = Year, y = Count, group = BIR, color = BIR)) +
  geom_line()

//2
//4

Foldable operations (I)


# Range of x, written as a "foldable" operation: the same function can
# combine a list of partial ranges or compute the range of a raw vector.
#
# x: either a numeric vector, or a list of numeric vectors / partial
#    range results to fold together
# Returns c(min, max).
foldable_range <- function(x) {
  if (is.list(x)) {
    # Fold a list of partial results into the overall min and max.
    # (The original body had a stray `list` token left over from a
    # wrapped comment; that dead expression has been removed.)
    c(Reduce(min, x), Reduce(max, x))
  } else {
    # Otherwise assume x is a vector and take its range directly
    range(x)
  }
}

# Verify that foldable_range() works on the record_number column.
# (This reads the whole column into memory and takes its range.)


foldable_range(mort[, "record_number"])

Foldable operations (II)


# Split the mortgage row indices by year.
# seq_len() avoids the 1:nrow() footgun on zero-row input.
spl <- split(seq_len(nrow(mort)), mort[, "year"])

# Compute the range of record numbers per year-chunk, then fold the
# partial ranges into the overall range
foldable_range(Map(function(s) foldable_range(mort[s, "record_number"]),
                   spl))

Compare read.delim() and read.delim.raw()


# Load the iotools and microbenchmark packages
library(iotools)
library(microbenchmark)

# Compare base read.delim() against iotools' read.delim.raw(),
# timing each file read five times
microbenchmark(
  read.delim("mortgage-sample.csv", header = FALSE, sep = ","),
  read.delim.raw("mortgage-sample.csv", header = FALSE, sep = ","),
  times = 5
)
Reading raw data and turning it into a data structure
# Slurp the whole CSV into a raw (byte) vector
raw_file_content <- readAsRaw("mortgage-sample.csv")

# Parse the raw bytes into an integer matrix, skipping the header row
mort_mat <- mstrsplit(raw_file_content, sep = ",", type = "integer",
                      skip = 1)

# Preview the first six rows
head(mort_mat)

# Parse the same bytes into a data.frame with 16 integer columns
mort_df <- dstrsplit(raw_file_content, sep = ",",
                     col_types = rep("integer", 16), skip = 1)

# Preview the first six rows
head(mort_df)

Reading chunks in as a matrix


# Chunk processor: parse a raw chunk as an integer matrix and
# tabulate column 3 (the number of borrowers) for that chunk
make_table <- function(chunk) {
  parsed <- mstrsplit(chunk, type = "integer", sep = ",")
  table(parsed[, 3])
}

Reading chunks in as a data.frame


# Chunk processor: parse a raw chunk as a data frame and tabulate where
# each borrower lives. (In the original, a wrapped comment left the
# unquoted fragment `borrower lives` on its own line inside the body —
# a parse error — which is repaired here.)
# Relies on col_names and msa_map being defined earlier in the session.
make_msa_table <- function(chunk) {
  # Read the chunk as a data frame of integer columns
  x <- dstrsplit(chunk, col_types = rep("integer", length(col_names)),
                 sep = ",")
  # Name the columns so they can be referenced by name below
  colnames(x) <- col_names
  # Map each msa code to its string description; +1 because the codes
  # start at 0 while R indexing is 1-based
  x$msa_pretty <- msa_map[x$msa + 1]
  # Tabulate the readable location labels
  table(x$msa_pretty)
}

# Open a binary-mode connection to the CSV
fc <- file("mortgage-sample.csv", "rb")

# Consume the header line so the chunks contain only data rows
readLines(fc, n = 1)

# Stream the file in chunks of at most 1e5 bytes, tabulating each chunk
counts <- chunk.apply(fc, make_msa_table, CH.MAX.SIZE = 1e5)

# Release the file handle
close(fc)

# Sum the per-chunk tables into overall counts
colSums(counts)

Parallelizing calls to chunk.apply


# Read and tabulate the mortgage CSV with a configurable number of
# parallel chunk processors.
#
# parallel: number of worker processes passed through to chunk.apply()
#
# Fixes two defects in the original: the connection leaked if
# chunk.apply() errored (now closed via on.exit()), and the chunk.apply()
# result was discarded because close(fc) was the last expression (the
# per-chunk tables are now the return value; benchmarking callers that
# ignored the old return value are unaffected).
iotools_read_fun <- function(parallel) {
  fc <- file("mortgage-sample.csv", "rb")
  on.exit(close(fc), add = TRUE)
  # Skip the header row
  readLines(fc, n = 1)
  chunk.apply(fc, make_msa_table,
              CH.MAX.SIZE = 1e5, parallel = parallel)
}

# Benchmark the new function


# Benchmark serial vs. parallel chunked reading, 20 repetitions each
microbenchmark(
  iotools_read_fun(parallel = 1),  # a single process
  iotools_read_fun(parallel = 3),  # three worker processes
  times = 20
)

Race and Ethnic Representation in the Mortgage Data


# Tabulate the borrower_race column
race_table <- bigtable(mort, "borrower_race")

# Attach labels: the table's names are the numeric race codes, so use
# them to index into the race_cat lookup vector
names(race_table) <- race_cat[as.numeric(names(race_table))]

# Proportion of each of the first seven (labeled) categories
race_table[1:7] / sum(race_table[1:7])

Comparing the Borrower Race/Ethnicity and their Proportions


# Build the borrower_race table chunk-by-chunk straight from the CSV
race_table_chunks <- chunk.apply(
  "mortgage-sample.csv",
  function(chunk) {
    m <- mstrsplit(chunk, sep = ",", type = "integer")
    colnames(m) <- mort_names
    table(m[, "borrower_race"])
  },
  CH.MAX.SIZE = 1e5
)

# Sum the per-chunk tables into overall counts
race_table <- colSums(race_table_chunks)

# Proportion of each of the seven labeled categories among borrowers
borrower_proportion <- race_table[1:7] / sum(race_table[1:7])

# Compare population proportions with borrower proportions side by side
matrix(c(pop_proportion, borrower_proportion), byrow = TRUE, nrow = 2,
       dimnames = list(c("Population Proportion", "Borrower Proportion"),
                       race_cat[1:7]))

Looking for Predictable Missingness


# Indicator: is borrower_race missing (coded as 9)?
# (In the original, a wrapped comment left the stray token `data` on its
# own line here; that dead expression has been folded back into this
# comment.)
borrower_race_ind <- mort[, "borrower_race"] == 9

# Treat the affordability code as a categorical predictor
affordability_factor <- factor(mort[, "affordability"])

# Logistic regression: does affordability predict missingness of race?
summary(glm(borrower_race_ind ~ affordability_factor, family = binomial))

//3
Borrower Race and Ethnicity by Year (II)
# Open a binary connection and discard the header line
fc <- file("mortgage-sample.csv", "rb")
readLines(fc, n = 1)

# Chunk processor: parse a chunk as an integer matrix and
# cross-tabulate borrower race by year
make_table <- function(chunk) {
  m <- mstrsplit(chunk, sep = ",", type = "integer")
  colnames(m) <- mort_names
  bigtable(m, c("borrower_race", "year"))
}

# Stream the file, combining the per-chunk race-by-year tables
race_year_table <- chunk.apply(fc, make_table)

# Release the file handle
close(fc)

# Convert the combined table to a data frame
rydf <- as.data.frame(race_year_table)

# Attach the human-readable race/ethnicity labels
rydf$Race <- race_cat

Visualizing the Adjusted Demographic Trends


# View rydf
rydf

# View pop_proportion
pop_proportion

# Long format: one row per (Race, Year)
rydfl <- gather(rydf, Year, Count, -Race)

# Adjust each count by that race's share of the overall population
rydfl$Adjusted_Count <- rydfl$Count / pop_proportion[rydfl$Race]

# Plot the adjusted trend. In the original, the `+` started a new line
# AFTER the complete ggplot() call, so R evaluated geom_line() as a
# separate expression and the layer was silently dropped; keeping `+`
# at the end of the line fixes the plot.
ggplot(rydfl, aes(x = Year, y = Adjusted_Count, group = Race,
                  color = Race)) +
  geom_line()

Relative change in demographic trend


# View rydf
rydf

# Baseline: the counts in the first year (column 1)
column1 <- rydf[, 1]

# Normalize the eight year columns by the first year's counts
for (this_column in 1:8) {
  rydf[, this_column] <- rydf[, this_column] / column1
}

# Convert the data to long format for plotting
rydf_long <- gather(rydf, Year, Proportion, -Race)

# Plot the relative change. The `+` must end the ggplot() line; in the
# original it began the next line, so geom_line() was evaluated as a
# separate expression and silently dropped from the plot.
ggplot(rydf_long, aes(x = Year, y = Proportion, group = Race,
                      color = Race)) +
  geom_line()

Borrower Region by Year


# Open a binary connection and skip the header row
fc <- file("mortgage-sample.csv", "rb")
readLines(fc, n = 1)

# Chunk processor: cross-tabulate msa by year for each chunk
make_table <- function(chunk) {
  m <- mstrsplit(chunk, sep = ",", type = "integer")
  colnames(m) <- mort_names
  bigtable(m, c("msa", "year"))
}

# Stream the file and combine the per-chunk tables
msa_year_table <- chunk.apply(fc, make_table)

# Release the file handle
close(fc)

# Convert the combined table to a data frame
df_msa <- as.data.frame(msa_year_table)

# Add a label column for the two msa codes (rows are in code order)
df_msa$MSA <- c("rural", "city")

# Long format: one row per (MSA, Year)
df_msa_long <- gather(df_msa, Year, Count, -MSA)

# Line plot of counts over time by region type
ggplot(df_msa_long, aes(x = Year, y = Count, group = MSA, color = MSA)) +
  geom_line()

Who is securing federally guaranteed loans?


# Cross-tabulate borrower income ratio by federal guarantee status
ir_by_fg <- bigtable(mort, c("borrower_income_ratio", "federal_guarantee"))

# Attach category labels to both dimensions
# (income_cat and guarantee_cat come from earlier in the session)
dimnames(ir_by_fg) <- list(income_cat, guarantee_cat)

# Convert counts to within-row proportions: dividing a matrix by a
# vector recycles down the columns, so element [i, j] is divided by
# row i's total — the same result as the explicit row loop
ir_by_fg <- ir_by_fg / rowSums(ir_by_fg)

# Print
ir_by_fg

You might also like