10-Visualization of Streaming Data and Class R Code-10!03!2023

This document contains code for various data visualization and analysis techniques, including: 1. Contour plots of 2D and 3D surfaces from sample data using base R plotting functions. 2. Scatter plots, density plots, and contour plots of sample data overlaid using ggplot2. 3. Kernel density estimation and a 3D surface plot of sample data using Plotly. 4. Node-link diagrams and network graphs of sample data using the igraph, ggraph, and networkD3 packages in R.

# Data

x <- -10:10
y <- -10:10
z <- sqrt(outer(x ^ 2, y ^ 2, "+"))

contour(x, y, z)

# You can also type, the following


# but the axes will be between 0 and 1
contour(z)

#=====
x <- -10:10
y <- -10:10
z <- sqrt(outer(x ^ 2, y ^ 2, "+"))

contour(x, y, z,
nlevels = 20)
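# A filled variant of the same surface (a small optional sketch using base
# R's filled.contour(); x, y, and z are the objects defined above):
filled.contour(x, y, z, nlevels = 20, color.palette = terrain.colors)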

############################################

# Libraries
library(tidyverse)
#install.packages('hrbrthemes')
library(hrbrthemes)
library(viridis)
#install.packages('patchwork')
library(patchwork)

# Dataset:
a <- data.frame( x=rnorm(20000, 10, 1.2),
y=rnorm(20000, 10, 1.2),
group=rep("A",20000))
b <- data.frame( x=rnorm(20000, 14.5, 1.2), y=rnorm(20000, 14.5, 1.2),
group=rep("B",20000))
c <- data.frame( x=rnorm(20000, 9.5, 1.5), y=rnorm(20000, 15.5, 1.5),
group=rep("C",20000))
data <- do.call(rbind, list(a,b,c))
View(data)
p1 <- data %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(color = "#69b3a2", size = 2) +
  theme_ipsum() +
  theme(legend.position = "none")
p2 <- ggplot(data, aes(x = x, y = y)) +
  stat_density_2d(aes(fill = ..density..), geom = "raster", contour = FALSE) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_viridis() +
  theme(legend.position = "none")

p1 + p2

library(plotly)
library(MASS)

# Compute kde2d
kd <- with(data, MASS::kde2d(x, y, n = 50))

# Plot with plotly


plot_ly(x = kd$x, y = kd$y, z = kd$z) %>% add_surface()
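# The same kernel density estimate can also be shown as a flat 2D contour
# (an optional sketch using plotly's add_contour(); kd is computed above):
plot_ly(x = kd$x, y = kd$y, z = kd$z) %>% add_contour()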

#Node-Link Diagram
#https://krisrs1128.github.io/stat479/

library("dplyr")
library("knitr")
library("ggplot2")
#install.packages('ggraph')
library("ggraph")
library("gridExtra")
#install.packages("networkD3")
library("networkD3")
library("tidygraph")
theme_set(theme_graph())

#A node-link diagram
G_school <- as_tbl_graph(highschool) %>%
activate(edges) %>%
mutate(year = factor(year))

ggraph(G_school) +
geom_edge_link(aes(col = year), width = 0.1) +
geom_node_point()

#For Tree
G_flare <- tbl_graph(flare$vertices, flare$edges)
p1 <- ggraph(G_flare, 'tree') +
geom_edge_link() +
geom_node_label(aes(label = shortName), size = 3)

p2 <- ggraph(G_flare, 'tree', circular = TRUE) +


geom_edge_link() +
geom_node_label(aes(label = shortName), size = 3)

grid.arrange(p1, p2, ncol = 2)

############################################

#3D Network
school_edges <- G_school %>%
activate(edges) %>%
as.data.frame()
simpleNetwork(school_edges)

from <- match(flare$imports$from, flare$vertices$name)


to <- match(flare$imports$to, flare$vertices$name)
ggraph(G_flare, layout = 'dendrogram', circular = TRUE) +
geom_conn_bundle(data = get_con(from = from, to = to), alpha = 0.1) +
geom_node_label(aes(label = shortName), size = 2) +
coord_fixed()

############################################
#Shiny
#ui.R
library(shiny)
library(shinydashboard)

shinyUI(pageWithSidebar(
  headerPanel("My First App"),
  sidebarPanel(
    selectInput("Distribution",
                "Please Select Distribution Type",
                choices = c("Normal", "Exponential")),
    sliderInput("sampleSize",
                "Please Select Sample Size",
                min = 100, max = 5000, value = 1000, step = 100),
    conditionalPanel(condition = "input.Distribution == 'Normal'",
                     textInput("mean", "Please Select the Mean:", 10),
                     textInput("sd", "Please Select Std. Deviation", 3)),
    conditionalPanel(condition = "input.Distribution == 'Exponential'",
                     textInput("lambda", "Please Select Exponential Lambda", 1))
  ),
  mainPanel(plotOutput("myPlot"))
))

#server.R
shinyServer(
function(input, output, session){
output$myPlot <- renderPlot({
distType <- input$Distribution
size <- input$sampleSize
if(distType == "Normal"){
randomVec <- rnorm(size, mean = as.numeric(input$mean),
sd = as.numeric(input$sd))
}
else{
randomVec <- rexp(size, rate = 1/as.numeric(input$lambda))
}
hist(randomVec, col="blue")
})
}
)
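
# To run the two-file app above, save the UI code as ui.R and the server code
# as server.R inside one app directory (the directory name below is only an
# example) and launch it with shiny::runApp():
#runApp("my_first_app")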

############################################
#igraph + visNetwork
#install.packages('igraph')
#install.packages('visNetwork')
library(igraph)
library(visNetwork)
nnodes <- 100
nnedges <- 200

nodes <- data.frame(id = 1:nnodes)


edges <- data.frame(from = sample(1:nnodes, nnedges, replace = T),
to = sample(1:nnodes, nnedges, replace = T))

# with default layout


visNetwork(nodes, edges, height = "500px") %>%
visIgraphLayout() %>%
visNodes(size = 10)

# in circle ?
visNetwork(nodes, edges, height = "500px") %>%
visIgraphLayout(layout = "layout_in_circle") %>%
visNodes(size = 10) %>%
visOptions(highlightNearest = list(enabled = T, hover = T),
nodesIdSelection = T)

library("igraph", quietly = TRUE, warn.conflicts = FALSE, verbose = FALSE)


igraph_network <- graph.famous("Walther")
igraph_network
plot(igraph_network)

#================

my_ui <- fluidPage(

sidebarLayout(
sidebarPanel(

# Input: Selector for choosing a variable


selectInput(inputId = "my_variable", # internal ID name
label = "Choose a variable:", # your label
choices = c("Petal Length", "Petal Width",
            "Sepal Length", "Sepal Width")),

# Input: Slider for choosing number of bins


sliderInput(inputId = "my_slider", # internal ID name
label = "Select number of bins:", # your label
min = 1, # Min value available
max = 30, # Max value available
value = 20)), # the starting value

mainPanel(
plotOutput(outputId = "hist_plot"))
)
)

# Define logic (functionality)


my_server <- function(input, output) {

# Return the requested variable


variableInput <- reactive({
switch(input$my_variable,
"Petal Length" = iris$Petal.Length,
"Petal Width" = iris$Petal.Width,
"Sepal Length" = iris$Sepal.Length,
"Sepal Width" = iris$Sepal.Width)
})

output$hist_plot <- renderPlot( {


x <- variableInput()
bins <- seq(min(x), max(x), length.out = input$my_slider + 1)
hist(x, breaks = bins, col = "lightgreen", border = "white",
xlab = "cm",
main = paste("Histogram of", input$my_variable,
             "for the flowers of the iris family"))
})
}

# Run the app


shinyApp(ui = my_ui, server = my_server)

############################################

#Heat Map

# Load the library


library("lattice")

# Dummy data
data <- matrix(runif(100, 0, 5) , 10 , 10)
colnames(data) <- letters[c(1:10)]
rownames(data) <- paste( rep("row",10) , c(1:10) , sep=" ")

# plot it flipping the axis


levelplot( t(data[c(nrow(data):1) , ]),
col.regions=heat.colors(100))

#Heat Map
# Library
library(ggplot2)

# Dummy data
x <- LETTERS[1:20]
y <- paste0("var", seq(1,20))
data <- expand.grid(X=x, Y=y)
data$Z <- runif(400, 0, 5)
library(ggplot2)
ggplot(data, aes(X, Y, fill= Z)) +
geom_tile()

#Heatmap - Viridis
# Lattice package
require(lattice)

# The volcano dataset is provided with R; it looks like this:


#head(volcano)

# 1: native palette from R


levelplot(volcano, col.regions = terrain.colors(100)) # try cm.colors() or terrain.colors()

# 2: Rcolorbrewer palette
library(RColorBrewer)
coul <- colorRampPalette(brewer.pal(8, "PiYG"))(25)
levelplot(volcano, col.regions = coul) # try cm.colors() or terrain.colors()

# 3: Viridis
library(viridisLite)
coul <- viridis(100)
levelplot(volcano, col.regions = coul)
#levelplot(volcano, col.regions = magma(100))

#Heatmap - mtcars
# The mtcars dataset:
data <- as.matrix(mtcars)

# Default Heatmap
heatmap(data)
#mtcars - Normalization
# Use 'scale' to normalize
heatmap(data, scale="column")
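# For comparison, the same normalization can be applied across rows instead
# (scale = "row" is a standard argument of base R's heatmap()):
heatmap(data, scale = "row")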

############################################

#Data Cleaning
data <- data.frame(x1 = c(1:4, 99999, 1, NA, 1, 1, NA), # Create example data frame
                   x2 = c(1:5, 1, "NA", 1, 1, "NA"),
                   x3 = c(letters[1:3], "x x", "x", " y y y", "x", "a", "a", NA),
                   x4 = "",
                   x5 = NA)
data # Print example data frame
#Example 1: Modify Column Names
colnames(data) # Print column names
colnames(data) <- paste0("col", 1:ncol(data)) # Modify all column names
data # Print updated data frame

#Format missing cell


data[data == ""] # Print blank data cells
data[data == ""] <- NA # Replace blanks by NA
data$col2 # Print column
data$col2[data$col2 == "NA"] <- NA # Replace character "NA"

#Example 3: Remove Empty Rows & Columns


data <- data[rowSums(is.na(data)) != ncol(data), ] # Drop empty rows
data # Print updated data frame
data <- data[ , colSums(is.na(data)) != nrow(data)] # Drop empty columns
data # Print updated data frame
#Example 4: Remove Rows with Missing Values
data <- na.omit(data) # Delete rows with missing values
data # Print updated data frame
#Example 5: Remove Duplicates
data <- unique(data) # Exclude duplicates
data # Print updated data frame
#Example 6: Modify Classes of Columns
sapply(data, class) # Print classes of all columns
#Convert each column to its appropriate class
data <- type.convert(data, as.is = TRUE)
data # Print updated data frame
sapply(data, class) # Print classes of updated columns
#Example 7: Remove Outliers
data$col1[data$col1 %in% boxplot.stats(data$col1)$out] # Identify outliers in column
data <- data[! data$col1 %in% boxplot.stats(data$col1)$out, ] # Remove rows with outliers
data # Print updated data frame
#Example 8: Remove Spaces in Character Strings
data$col3 <- gsub(" ", "", data$col3) # Delete white space in character strings
data # Print updated data frame
#Example 9: Combine Categories
data$col3[data$col3 %in% c("b", "c")] <- "a" # Merge categories
data # Print updated data frame

############################################

#Time-Series
data(AirPassengers)
AirPassengers
str(AirPassengers)
class(AirPassengers)
#Check for missing values
sum(is.na(AirPassengers))
start(AirPassengers)
end(AirPassengers)
frequency(AirPassengers)
summary(AirPassengers)
par(mfrow=c(3,3))
plot(AirPassengers)
plot.ts(AirPassengers)
# This fits a regression line through the series
abline(reg=lm(AirPassengers~time(AirPassengers)))
#This will print the cycle across years
cycle(AirPassengers)

#Step 3: Make it stationary


#Stationary means the series should have a consistent mean and variance
plot(log(AirPassengers))
plot(diff(log(AirPassengers)))
#This will aggregate the cycles and display a year on year trend
plot(aggregate(AirPassengers,FUN=mean))
#Box plot across months will give us a sense on seasonal effect
boxplot(AirPassengers~cycle(AirPassengers))

plot(diff(log(AirPassengers)))
#Time Series Decomposition
#Decomposition breaks the data into trend, seasonal, and random components
plot(decompose(AirPassengers)) # time series decomposition
#The above figure shows the time series decomposition into trend, seasonal and
#random (noise) components. It is clear that the series is non-stationary
#because of the seasonal effect and the (roughly linear) trend.

#Step 5: Model Identification and Estimation


#Use the autocorrelation function (ACF) and partial autocorrelation function
#(PACF) to determine the values of p and q
#ARIMA - AutoRegressive Integrated Moving Average, with orders (p, d, q)
acf(AirPassengers)
acf(diff(log(AirPassengers)))

pacf(diff(log(AirPassengers)))
#This determines the value of p (here we get 0)

#d is the number of times the series is differenced to make it stationary
#We difference only once, so d = 1
plot(diff(log(AirPassengers)))

#Step 6: ARIMA Model Prediction

fit <- arima(log(AirPassengers), c(0, 1, 1),
             seasonal = list(order = c(0, 1, 1), period = 12))
fit
#Predict for next 10 years
pred <- predict(fit,n.ahead=10*12) #10 years * 12 months
pred
#2.718 approximates e; back-transform from the log scale and round to 0 decimals
pred1 <- round(2.718^pred$pred, 0)
pred1 # predicted values for the next 10 years
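
#Equivalently (and slightly more precisely), the back-transform from the log
#scale can use exp() rather than the 2.718 approximation:
pred1_exp <- round(exp(pred$pred), 0)
head(pred1_exp)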

#plot this model


#Line type (lty) can be specified either as text ("blank", "solid", "dashed",
#"dotted", "dotdash", "longdash", "twodash") or as a number (0, 1, 2, 3, 4, 5, 6).
#Note that lty = "solid" is identical to lty = 1.
ts.plot(AirPassengers, pred1, log = "y", lty = c(1, 3))
#Compare predicted values with original values
#Get only 1961 values
data1<-head(pred1,12)
data1
#Predicted Values
predicted_1960 <- round(data1)#head of Predicted
predicted_1960
#Original
original_1960 <- tail(AirPassengers,12) #tail of original
original_1960
#Let's test this model: fit it on data up to 1959, predict 1960, and validate
#the prediction against the actual 1960 values already in the dataset
#Recreate the model using data up to 1959
datawide <- ts(AirPassengers, frequency = 12, start=c(1949,1), end=c(1959,12))
datawide
#Create model
fit1 <- arima(log(datawide),c(0,1,1),seasonal = list(order=c(0,1,1),period=12))
pred <- predict(fit1, n.ahead = 10*12) # predict 1960 onward (10 years)
pred1 <- 2.718^pred$pred
pred1 # predictions for 1960 onward

data11=round(head(pred1,12),0) #head of Predicted


data22=round(tail(AirPassengers,12),0) #tail of original

plot(data11,col="red", type="l")
lines(data22,col="blue")
#Step 7: Check normality using Q-Q plot
#qqnorm is a generic function the default method of which produces a normal
#QQ plot of the values in y. qqline adds a line to a "theoretical", by default
#normal, quantile-quantile plot which passes through the probs quantiles, by
#default the first and third quartiles.

qqnorm(residuals(fit))
qqline(residuals(fit))

############################################

#Text Analytics - Sentiment Analysis


#install.packages("tidytext")
#install.packages("tidyverse")
#install.packages("textdata")
# Load dplyr and tidytext
library(dplyr)
library(tidytext)
library(textdata)
# Choose the bing lexicon
get_sentiments("bing")

# Choose the nrc lexicon

get_sentiments("nrc") %>%
count(sentiment) # Count words by sentiment

#Geocoded Tweets
#The geocoded_tweets dataset contains three columns:
#1. state, a state in the United States
#2. word, a word used in tweets posted on Twitter
#3. freq, the average frequency of that word in that state (per billion words)

# geocoded_tweets has been pre-defined


load("geocoded_tweets.rda")
geocoded_tweets
# Use get_sentiments() to access the "bing" lexicon and assign it to bing.
bing <- get_sentiments("bing")
bing
# Use an inner_join() to implement sentiment analysis on the geocoded tweet
# data using the bing lexicon
tweets_bing <- geocoded_tweets %>%
  inner_join(bing)

#Test
#Access nrc lexicon: nrc
nrc <- get_sentiments("nrc")

# Use data frame with text data


tweets_nrc = geocoded_tweets %>%
# With inner join, implement sentiment analysis using `nrc`
inner_join(nrc)

head(tweets_nrc)

tweets_nrc %>%
# Filter to only choose the words associated with sadness
filter(sentiment=="sadness") %>%
# Group by word
group_by(word) %>%
# Use the summarize verb to find the mean frequency
summarize(freq = mean(freq)) %>%
# Arrange to sort in order of descending frequency
arrange(desc(freq))

joy_words <- tweets_nrc %>%


# Filter to choose only words associated with joy
filter(sentiment=="joy") %>%
# Group by each word
group_by(word) %>%
# Use the summarize verb to find the mean frequency
summarize(freq = mean(freq)) %>%
# Arrange to sort in order of descending frequency
arrange(desc(freq))
joy_words
#install.packages("wordcloud")
library("wordcloud")
set.seed(100)
wordcloud(words = joy_words$word, freq = joy_words$freq,
random.order=TRUE)
wordcloud(words = joy_words$word, freq = joy_words$freq,
random.order=FALSE)
wordcloud(words = joy_words$word, freq = joy_words$freq,
min.freq = 3, max.words=250,
random.order=T,
rot.per=0.30,
colors=brewer.pal(8, "Dark2"))
# Load ggplot2
library(ggplot2)

joy_words %>%
top_n(20) %>%
mutate(word = reorder(word, freq)) %>%
# Use aes() to put words on the x-axis and frequency on the y-axis
ggplot(aes(x=word, y=freq)) +
# Make a bar chart with geom_col()
geom_col() +
coord_flip()
#install.packages('wordcloud2')
library(wordcloud2)
wordcloud2(joy_words, size = 1.5,
color='random-dark')
#circle, cardioid, diamond, triangle-forward
#triangle, pentagon, star

wordcloud2(joy_words, size = 0.7, shape = 'star')

text <- c("Because I could not stop for Death -",


"He kindly stopped for me -",
"The Carriage held but just Ourselves - * * ***",
"and Immortality?")
text
text_df <- tibble(line=1:4, text = text)
text_df %>%
unnest_tokens(word, text)
#Salaries Data - Visualization
library(ggplot2)
data(Salaries, package="carData")
View(Salaries)
# plot experience vs. salary
ggplot(Salaries,
aes(x = yrs.since.phd,
y = salary)) +
geom_point() +
labs(title = "Academic salary by years since degree")

# plot experience vs. salary (color represents rank)


ggplot(Salaries, aes(x = yrs.since.phd,
y = salary,
color=rank)) +
geom_point() +
labs(title = "Academic salary by rank and years since degree")

# (color represents rank, shape represents sex)


ggplot(Salaries,
aes(x = yrs.since.phd,
y = salary,
color = rank,
shape = sex)) +
geom_point(size = 3,
alpha = .6) +
labs(title = "Academic salary by rank, sex, and years since degree")

# plot experience vs. salary


# (color represents rank and size represents service)
ggplot(Salaries,
aes(x = yrs.since.phd,
y = salary,
color = rank,
size = yrs.service)) +
geom_point(alpha = .6) +
labs(title = "Academic salary by rank, years of service, and years since degree")

# plot salary histograms by rank


ggplot(Salaries, aes(x = salary)) +
geom_histogram(fill = "cornflowerblue",
color = "white") +
facet_wrap(~rank, ncol = 1) +
labs(title = "Salary histograms by rank")

# plot salary histograms by rank and sex


ggplot(Salaries, aes(x = salary / 1000)) +
geom_histogram(color = "white",
fill = "cornflowerblue") +
facet_grid(sex ~ rank) +
labs(title = "Salary histograms by sex and rank",
x = "Salary ($1000)")

# Calculate means and standard errors by sex, rank, and discipline

library(dplyr)
plotdata <- Salaries %>%
group_by(sex, rank, discipline) %>%
summarize(n = n(),
mean = mean(salary),
sd = sd(salary),
se = sd / sqrt(n))

# create better labels for discipline


plotdata$discipline <- factor(plotdata$discipline,
labels = c("Theoretical",
"Applied"))
# create plot
ggplot(plotdata,
aes(x = sex,
y = mean,
color = sex)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = mean - se,
ymax = mean + se),
width = .1) +
scale_y_continuous(breaks = seq(70000, 140000, 10000),
label = scales::dollar) +
facet_grid(. ~ rank + discipline) +
theme_bw() +
theme(legend.position = "none",
panel.grid.major.x = element_blank(),
panel.grid.minor.y = element_blank()) +
labs(x="",
y="",
title="Nine month academic salaries by gender, discipline, and rank",
subtitle = "(Means and standard errors)") +
scale_color_brewer(palette="Set1")

###########################################
#library(MASS)
#install.packages("ISLR")
#library(ISLR)
#fix(Boston)

names(Boston)
head(Boston)
Bostondf<-data.frame(lstat=Boston$lstat,
age=Boston$age,
medv=Boston$medv)
plot(Bostondf)
pairs(Bostondf)
#install.packages("scatterplot3d") # Install
library("scatterplot3d")
scatterplot3d(Bostondf, angle=30)

#Parallel Coordinates
# Libraries
library(tidyverse)
library(hrbrthemes)
library(patchwork)
#install.packages('GGally')
library(GGally)
library(viridis)

diamonds %>%
sample_n(10) %>%
  ggparcoord(
    columns = c(1, 5:7),
    groupColumn = 2,
    #order = "anyClass",
    showPoints = TRUE,
    title = "Diamonds features",
    alphaLines = 0.3
  ) +
scale_color_viridis(discrete=TRUE) +
theme_ipsum()+
theme(
plot.title = element_text(size=10)
)

# The iris data set used below is provided by R natively.

# A parallel coordinates plot allows us to compare the features of several
# individual observations (series) on a set of numeric variables. Each vertical
# bar represents a variable and often has its own scale (the units can even be
# different). Values are then plotted as a series of lines connected across
# each axis.

data <- iris

# Plot
data %>%
ggparcoord(
columns = 1:4, groupColumn = 5, order = "anyClass",
showPoints = TRUE,
title = "Parallel Coordinate Plot for the Iris Data",
alphaLines = 0.3
)+
scale_color_viridis(discrete=TRUE) +
theme_ipsum()+
theme(
plot.title = element_text(size=10)
)

# Plot
p1 <- data %>%
ggparcoord(
columns = 1:4, groupColumn = 5, order = "anyClass",
scale="globalminmax",
showPoints = TRUE,
title = "No scaling",
alphaLines = 0.3
)+
scale_color_viridis(discrete=TRUE) +
theme_ipsum()+
  theme(legend.position = "none",
plot.title = element_text(size=10)
)+
xlab("")

p2 <- data %>%


ggparcoord(
columns = 1:4, groupColumn = 5, order = "anyClass",
scale="uniminmax",
showPoints = TRUE,
title = "Standardize to Min = 0 and Max = 1",
alphaLines = 0.3
)+
scale_color_viridis(discrete=TRUE) +
theme_ipsum()+
  theme(legend.position = "none",
plot.title = element_text(size=10)
)+
xlab("")

p3 <- data %>%


ggparcoord(
columns = 1:4, groupColumn = 5, order = "anyClass",
scale="std",
showPoints = TRUE,
title = "Normalize univariately (substract mean & divide by sd)",
alphaLines = 0.3
)+
scale_color_viridis(discrete=TRUE) +
theme_ipsum()+
  theme(legend.position = "none",
plot.title = element_text(size=10)
)+
xlab("")

p4 <- data %>%


ggparcoord(
columns = 1:4, groupColumn = 5, order = "anyClass",
scale="center",
showPoints = TRUE,
title = "Standardize and center variables",
alphaLines = 0.3
)+
scale_color_viridis(discrete=TRUE) +
theme_ipsum()+
  theme(legend.position = "none",
plot.title = element_text(size=10)
)+
xlab("")

p1 + p2 + p3 + p4 + plot_layout(ncol = 2)

############################################
###########Tidying Shakespearean plays
#The shakespeare dataset contains 3 columns:
#1. title, the title of a Shakespearean play
#2. type, the type of play, either tragedy or comedy
#3. text, a line from that play
#(A hedged code sketch of the numbered prompts below follows the list.)

#1. Load shakespeare.rda into the R environment

#2. Pipe the shakespeare data frame to the next line


# Use count to find out how many titles/types there are

#3. Load tidytext/ tidyverse

#4. Create an object tidy_shakespeare


# Group by the titles of the plays
# Define a new column linenumber
# Transform the non-tidy text data to tidy text data

#5. Pipe the tidy Shakespeare data frame to the next line
# Use count to find out how many times each word is used

#6. Sentiment analysis of tidy_shakespeare; assign to object shakespeare_sentiment
# Implement sentiment analysis with the "bing" lexicon

#7. shakespeare_sentiment
# Find how many positive/negative words each play has

#8. Tragedy or comedy: from tidy_shakespeare, assign to sentiment_counts


# Implement sentiment analysis using the "bing" lexicon
# Count the number of words by title, type, and sentiment

#9. from sentiment_counts


# Group by the titles of the plays
# Find the total number of words in each play
# Calculate the number of words divided by the total
# Filter the results for only negative sentiment, then arrange percentage in ascending order

#10. Most common positive and negative words; assign to word_counts
# Implement sentiment analysis using the "bing" lexicon
# Count by word and sentiment
#11. Extract top 10 words from word_counts and assign to top_words
# Group by sentiment
# Take the top 10 for each sentiment and ungroup it
# Make word a factor in order of n

#12 Use aes() to put words on the x-axis and n on the y-axis
# Make a bar chart with geom_col()
# facet_wrap for sentiments and apply scales as free
#Move x to y and y to x

#13. From tidy_shakespeare, calculate a contribution score

# Count by title and word

# Implement sentiment analysis using the "afinn" lexicon


# Group by title

# Calculate a contribution for each word in each title
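
#A minimal sketch of the numbered prompts above (assumptions: shakespeare.rda
#provides a data frame named `shakespeare` with the title, type, and text
#columns described earlier, and the bing/nrc/afinn lexicons are available via
#tidytext's get_sentiments(); in current textdata releases the afinn score
#column is named `value`). Step numbers refer to the prompts above.
library(dplyr)
library(tidytext)
library(ggplot2)

load("shakespeare.rda")                              # step 1

shakespeare %>%
  count(title, type)                                 # step 2

tidy_shakespeare <- shakespeare %>%                  # step 4
  group_by(title) %>%
  mutate(linenumber = row_number()) %>%
  ungroup() %>%
  unnest_tokens(word, text)

tidy_shakespeare %>%
  count(word, sort = TRUE)                           # step 5

shakespeare_sentiment <- tidy_shakespeare %>%        # step 6
  inner_join(get_sentiments("bing"))

shakespeare_sentiment %>%
  count(title, sentiment)                            # step 7

sentiment_counts <- tidy_shakespeare %>%             # step 8
  inner_join(get_sentiments("bing")) %>%
  count(title, type, sentiment)

sentiment_counts %>%                                 # step 9
  group_by(title) %>%
  mutate(total = sum(n), percent = n / total) %>%
  filter(sentiment == "negative") %>%
  arrange(percent)

word_counts <- shakespeare_sentiment %>%             # step 10
  count(word, sentiment, sort = TRUE)

top_words <- word_counts %>%                         # step 11
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))

ggplot(top_words, aes(x = word, y = n, fill = sentiment)) +   # step 12
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free") +
  coord_flip()

tidy_shakespeare %>%                                 # step 13
  count(title, word, sort = TRUE) %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(title) %>%
  mutate(contribution = value * n / sum(n))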
