DataViz Ggplot Sample
DataViz Ggplot Sample
Data Visualization
• R has several systems for making graphs
• ggplot2 - most elegant and most versatile
• ggplot2 implements the grammar of graphics, a coherent system for describing and building graphs
• Prerequisites: - install.packages("tidyverse") - library(tidyverse)
• To state explicitly which package a function (or dataset) comes from, use
– package::function()
– for example, ggplot2::ggplot()
Creating a graph
library(tidyverse)
1
# Print the structure (column names and types) of the cars data set before plotting it.
str(cars)
2
125
100
75
dist
50
25
0
5 10 15 20 25
speed
# Print the structure of the mpg data set; its displ and hwy columns are the ones plotted below.
str(mpg)
3
40
30
hwy
20
2 3 4 5 6 7
displ
4
40
30
hwy
20
2 3 4 5 6 7
displ
#A scatter plot works best for two continuous variables (x, y); it is not well suited to displaying categorical variables
Aesthetic mappings
• An aesthetic is a visual property of the objects in your plot. Aesthetics include things like the size, the
shape, or the color of your points.
#color
# List the columns of mpg that are available to map to aesthetics.
str(mpg)
5
40
class
2seater
compact
30
midsize
hwy
minivan
pickup
subcompact
suv
20
2 3 4 5 6 7
displ
#Give each manufacturer its own color
# manufacturer is a character column, so each manufacturer gets its own legend entry.
ggplot(mpg) +
  geom_point(aes(displ, hwy, color = manufacturer))
6
manufacturer
audi
40 chevrolet
dodge
ford
honda
hyundai
30
jeep
hwy
land rover
lincoln
mercury
nissan
20
pontiac
subaru
toyota
volkswagen
2 3 4 5 6 7
displ
##Problems #size
#Using size for a discrete variable is not advised.
# Demonstrates the problem case: point size is mapped to the discrete manufacturer column.
ggplot(mpg) +
  geom_point(aes(displ, hwy, size = manufacturer))
7
manufacturer
audi
40 chevrolet
dodge
ford
honda
hyundai
30
jeep
hwy
land rover
lincoln
mercury
nissan
20
pontiac
subaru
toyota
volkswagen
2 3 4 5 6 7
displ
# Here size is mapped to the numeric speed column, which is also on the x axis.
ggplot(cars) +
  geom_point(aes(speed, dist, size = speed))
8
125
100
speed
75 5
10
dist
15
50 20
25
25
0
5 10 15 20 25
speed
#use shape
# Map the vehicle class to the point shape.
# class has 7 distinct values but the default shape palette covers only 6, so the
# rows of the class left without a shape are dropped (see the warnings printed below).
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, shape = class))
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 7. Consider
## specifying shapes manually if you must have them.
## Warning: Removed 62 rows containing missing values (geom_point).
9
40
class
2seater
compact
30
midsize
hwy
minivan
pickup
subcompact
suv
20
2 3 4 5 6 7
displ
#Use numbers for the shape to specify up to 25 built-in R shapes (0 to 24)
# Reference chart of the point shapes, arranged in a 5 x 5 grid.
# x is the grid column and y the negated row, so the entries run top to
# bottom within each column.
shapes <- data.frame(
  shape = c(0:19, 22, 21, 24, 23, 20),
  x = (seq_len(25) - 1L) %/% 5,
  y = -((seq_len(25) - 1L) %% 5)
)
ggplot(data = shapes, mapping = aes(x = x, y = y)) +
  # Draw each symbol, using the value in `shape` as its shape code.
  geom_point(mapping = aes(shape = shape), size = 5, fill = "red") +
  # Print the shape code just to the right of its symbol.
  geom_text(mapping = aes(label = shape), hjust = 0, nudge_x = 0.15) +
  # Take the numbers in `shape` literally instead of mapping them to a palette.
  scale_shape_identity() +
  # Widen the x range so the labels of the last column are not clipped.
  expand_limits(x = 4.1) +
  theme_void()
10
0 5 10 15 22
1 6 11 16 21
2 7 12 17 24
3 8 13 18 23
4 9 14 19 20
#use alpha
# Point transparency (alpha) encodes the vehicle class.
ggplot(mpg) +
  geom_point(aes(displ, hwy, alpha = class))
11
40
class
2seater
compact
30
midsize
hwy
minivan
pickup
subcompact
suv
20
2 3 4 5 6 7
displ
12
Set the aesthetic properties of geom manually
40
30
hwy
20
2 3 4 5 6 7
displ
##Facets
#One way to add additional variables is with aesthetics. Another way, particularly useful for
#categorical variables, is to split your plot into facets: subplots that each display one subset of the data.
#To facet your plot by a single variable, use facet_wrap(). The first argument of facet_wrap() should be a
#formula, which you create with ~ followed by a variable name (here "formula" is the name of a data structure
#in R, not a synonym for "equation"). The variable that you pass to facet_wrap() should be discrete.
# One panel per vehicle class, laid out in two rows.
ggplot(mpg) +
  geom_point(aes(displ, hwy)) +
  facet_wrap(facets = ~class, nrow = 2)
13
2seater compact midsize minivan
40
30
20
hwy
2 3 4 5 6 7
pickup subcompact suv
40
30
20
2 3 4 5 6 7 2 3 4 5 6 7 2 3 4 5 6 7
displ
#To facet your plot on the combination of two variables, add facet_grid() to your plot call. The formula
#should contain two variable names separated by a ~.
# Grid of panels: one row per drv value and one column per cyl value.
ggplot(mpg) +
  geom_point(aes(displ, hwy)) +
  facet_grid(drv ~ cyl)
14
4 5 6 8
40
30
4
20
40
hwy
30
f
20
40
30
r
20
2 3 4 5 6 7 2 3 4 5 6 7 2 3 4 5 6 7 2 3 4 5 6 7
displ
#Statistical Transformation
# Print the structure of the diamonds data set.
# NOTE(review): the column listing printed further down (Sepal.Length, ..., Species)
# is for iris, not diamonds; the call that produced it is not shown here. Confirm.
str(diamonds)
15
20000
15000
count
10000
5000
## Rows: 150
## Columns: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9, 5.4,...
## $ Sepal.Width <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5,...
## $ Petal.Width <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2,...
## $ Species <fct> setosa, setosa, setosa, setosa, setosa, setosa, setosa...
# Bar chart of the number of rows per Species (the y axis is the computed count).
ggplot(iris) +
  geom_bar(aes(Species))
16
50
40
30
count
20
10
On flights dataset
library(nycflights13)
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013...
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 55...
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 60...
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2,...
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 8...
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 8...
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7,...
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6"...
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301...
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N...
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LG...
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IA...
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149...
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 73...
17
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6...
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59...
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-0...
# Number of flights per carrier.
ggplot(flights) +
  geom_bar(aes(carrier))
60000
40000
count
20000
9E AA AS B6 DL EV F9 FL HA MQ OO UA US VX WN YV
carrier
# Number of flights per origin airport.
ggplot(flights) +
  geom_bar(aes(origin))
18
125000
100000
75000
count
50000
25000
19
1000
750
count
500
250
## # A tibble: 100 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 10 12 1758 1359 239 1910 1511
## 2 2013 10 28 1152 1200 -8 1343 1334
## 3 2013 4 6 1508 1515 -7 1731 1740
## 4 2013 6 18 604 610 -6 1000 924
## 5 2013 6 10 1522 1500 22 1633 1646
## 6 2013 9 15 NA 1159 NA NA 1309
## 7 2013 3 10 2051 2045 6 2349 2357
## 8 2013 8 4 1815 1627 108 1958 1813
## 9 2013 7 26 646 647 -1 806 809
## 10 2013 9 24 1258 1305 -7 1424 1438
## # ... with 90 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights per carrier in fl100, the 100-row flights table printed above.
ggplot(fl100) +
  geom_bar(aes(carrier))
20
20
15
count
10
9E AA B6 DL EV F9 MQ UA US VX WN YV
carrier
# Flights per origin airport in fl100.
ggplot(fl100) +
  geom_bar(aes(origin))
21
40
30
count
20
10
## # A tibble: 100 x 5
## flight origin dest air_time carrier
## <int> <chr> <chr> <dbl> <chr>
## 1 1703 EWR BOS 35 UA
## 2 667 LGA ORD 122 UA
## 3 4309 JFK IND 106 MQ
## 4 303 JFK SFO 369 UA
## 5 1498 LGA ORD 105 UA
## 6 3386 JFK DCA NA MQ
## 7 677 JFK LAX 318 B6
## 8 5940 EWR RDU 68 EV
## 9 905 JFK ORD 111 B6
## 10 5378 LGA PIT 54 EV
## # ... with 90 more rows
#Additional
#use alpha
# Print one line per column of mpg with its type and first values (output below).
glimpse(mpg)
## Rows: 234
## Columns: 11
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi"...
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro"...
22
## $ displ <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0,...
## $ year <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, ...
## $ cyl <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, ...
## $ trans <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "a...
## $ drv <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4",...
## $ cty <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17...
## $ hwy <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25...
## $ fl <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p",...
## $ class <chr> "compact", "compact", "compact", "compact", "compact",...
# year is an integer column, so alpha is drawn on a continuous scale.
ggplot(mpg) +
  geom_point(aes(displ, hwy, alpha = year))
40
year
2000
30
2002
hwy
2004
2006
2008
20
2 3 4 5 6 7
displ
23