
8_Textual Analysis_2020

Lennard Schmidt
5/10/2020

require("dplyr")
require("tidytext")
require("textdata")
require("widyr")

Load Dataset
happy_df <- read.csv("./happydb.csv", sep = ";")
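
With R versions before 4.0, read.csv() turns strings into factors by default, which is why cleaned_hm is coerced back to character below. A minimal alternative that avoids the coercion (note it would also leave reflection_period and hm_category as character rather than factor):

happy_df <- read.csv("./happydb.csv", sep = ";", stringsAsFactors = FALSE)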

Prepare Dataframe and Check for NA


# Drop unused columns, coerce the text column to character, and rename
happy_df <- happy_df[, -c(1, 4, 6, 8)]
happy_df$cleaned_hm <- as.character(happy_df$cleaned_hm)
colnames(happy_df) <- c("wid", "reflection_period", "cleaned_hm", "num_sentence", "hm_category")

# Count missing values per column
sapply(happy_df, function(x) sum(is.na(x)))

##               wid reflection_period        cleaned_hm      num_sentence
##                 0                 0                 0                 0
##       hm_category
##                 0

Investigate Variables
skimr::skim(happy_df)

Data summary
Name happy_df
Number of rows 100535
Number of columns 5
_______________________
Column type frequency:
character 1
factor 2
numeric 2
________________________
Group variables None
Variable type: character

skim_variable n_missing complete_rate min  max empty n_unique whitespace
cleaned_hm            0             1   6 6532     0    96481        365

Variable type: factor

skim_variable     n_missing complete_rate ordered n_unique top_counts
reflection_period         0             1 FALSE          2 3m: 50704, 24h: 49831
hm_category               0             1 FALSE          7 aff: 34168, ach: 33993, enj: 11144, bon: 10727

Variable type: numeric

skim_variable n_missing complete_rate    mean      sd p0 p25  p50  p75  p100 hist
wid                   0             1 2746.62 3535.01  1 410 1125 3507 13839 ▇▂▁▁▁
num_sentence          0             1    1.34    1.30  1   1    1    1    69 ▇▁▁▁▁

Tokenize by Word
happy_df.tidy <- tidytext::unnest_tokens(happy_df, word, cleaned_hm)
dplyr::count(happy_df.tidy, word, sort = TRUE)
## # A tibble: 27,549 x 2
## word n
## <chr> <int>
## 1 i 104285
## 2 my 74039
## 3 a 71630
## 4 to 57289
## 5 and 56695
## 6 the 52502
## 7 was 33927
## 8 for 27126
## 9 in 26429
## 10 me 25540
## # ... with 27,539 more rows
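
By default, unnest_tokens() lowercases the text and strips punctuation; if case mattered, it could be preserved via the to_lower argument (a sketch):

tidytext::unnest_tokens(happy_df, word, cleaned_hm, to_lower = FALSE)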

Remove Stopwords
happy_df.clean <- dplyr::anti_join(happy_df.tidy, tidytext::get_stopwords())

## Joining, by = "word"

dplyr::count(happy_df.clean, word, sort = TRUE)

## # A tibble: 27,381 x 2
## word n
## <chr> <int>
## 1 happy 18732
## 2 got 13378
## 3 made 11435
## 4 went 9616
## 5 time 9328
## 6 new 8870
## 7 day 8048
## 8 work 7864
## 9 last 6391
## 10 good 5851
## # ... with 27,371 more rows
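
Generic stopwords are gone, but domain terms such as "happy" and "happiest" still dominate. Instead of filtering them manually in the steps below, they could be appended to the stopword lexicon before the anti-join (a sketch; my_stopwords and happy_df.clean2 are illustrative names):

my_stopwords <- dplyr::bind_rows(
  tidytext::get_stopwords(),
  dplyr::tibble(word = c("happy", "happiest", "happiness"), lexicon = "custom")
)
happy_df.clean2 <- dplyr::anti_join(happy_df.tidy, my_stopwords, by = "word")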

Compute and Visualize Word Counts
happy_df.count <- dplyr::count(happy_df.clean, word, sort = TRUE)
happy_df.count <- happy_df.count[which(happy_df.count$word != "happy" &
                                       happy_df.count$word != "happiest"), ]
happy_df.count$word <- reorder(happy_df.count$word, happy_df.count$n)
happy_df.count <- head(happy_df.count, 20)

ggplot2::ggplot(happy_df.count, ggplot2::aes(x = word, y = n)) +
  ggplot2::geom_col() +
  ggplot2::coord_flip() +
  ggpubr::theme_pubclean()

Compute TF-IDF
happy_df.count <- dplyr::count(happy_df.clean, wid, word, sort = TRUE)
happy_df.count <- happy_df.count[which(happy_df.count$n > 10 &
                                       happy_df.count$word != "happy" &
                                       happy_df.count$word != "happiest" &
                                       happy_df.count$word != "happiness" &
                                       nchar(happy_df.count$word) > 3), ]
head(happy_df.count)

## # A tibble: 6 x 3
## wid word n
## <int> <chr> <int>
## 1 280 life 123
## 2 120 event 92
## 3 280 good 89
## 4 954 time 84
## 5 40 went 80
## 6 5 made 79

tidytext::bind_tf_idf(happy_df.count, word, wid, n)

## # A tibble: 2,256 x 6
## wid word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 280 life 123 0.370 2.51 0.930
## 2 120 event 92 0.262 3.69 0.968
## 3 280 good 89 0.268 2.64 0.708
## 4 954 time 84 0.0617 1.80 0.111
## 5 40 went 80 0.0458 1.70 0.0779
## 6 5 made 79 0.745 1.84 1.37
## 7 455 time 76 0.0285 1.80 0.0511
## 8 40 thank 75 0.0429 6.40 0.275
## 9 954 roti 73 0.0536 6.40 0.343
## 10 55 really 72 0.270 2.74 0.738
## # ... with 2,246 more rows
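
bind_tf_idf() treats each wid as a document: tf is a word's share of its document's (filtered) word count, and idf is the natural log of the number of documents divided by the number of documents containing the word. The same quantities can be computed by hand on the filtered counts (a minimal sketch; n_docs and happy_df.tfidf are illustrative names):

n_docs <- dplyr::n_distinct(happy_df.count$wid)
happy_df.tfidf <- happy_df.count %>%
  dplyr::group_by(wid) %>%
  dplyr::mutate(tf = n / sum(n)) %>%                             # term frequency within each wid
  dplyr::group_by(word) %>%
  dplyr::mutate(idf = log(n_docs / dplyr::n_distinct(wid))) %>%  # inverse document frequency
  dplyr::ungroup() %>%
  dplyr::mutate(tf_idf = tf * idf)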

Visualize as Word Cloud
wordcloud::wordcloud(happy_df.count$word, happy_df.count$n, min.freq = 1, max.words = 100, random.order = FALSE)
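
Note that happy_df.count still holds one row per (wid, word) pair here, so common words enter the cloud multiple times; aggregating first would give a single frequency per word (a sketch; word_totals is an illustrative name):

word_totals <- aggregate(n ~ word, happy_df.count, sum)
wordcloud::wordcloud(word_totals$word, word_totals$n, max.words = 100, random.order = FALSE)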

Join Sentiment Dictionary and Visualize Sentiment Counts
happy_df.sen <- dplyr::inner_join(happy_df.clean, tidytext::get_sentiments("nrc"), by = "word")
happy_df.sen <- dplyr::inner_join(happy_df.sen, tidytext::get_sentiments("afinn"), by = "word")
head(happy_df.sen, 10)
##     wid reflection_period num_sentence hm_category       word    sentiment value
## 1  2053               24h            1   affection successful anticipation     3
## 2  2053               24h            1   affection successful          joy     3
## 3  2053               24h            1   affection successful     positive     3
## 4  2053               24h            1   affection successful        trust     3
## 5  2053               24h            1   affection   sympathy     positive     2
## 6  2053               24h            1   affection   sympathy      sadness     2
## 7     2               24h            1   affection      happy anticipation     3
## 8     2               24h            1   affection      happy          joy     3
## 9     2               24h            1   affection      happy     positive     3
## 10    2               24h            1   affection      happy        trust     3

Because NRC assigns several sentiments to a word, the inner join duplicates each word once per sentiment and repeats its AFINN value accordingly (visible for "successful" above).

happy_df.sen_count <- dplyr::count(happy_df.sen, sentiment, word, sort = TRUE)
happy_df.sen_count$word <- reorder(happy_df.sen_count$word, happy_df.sen_count$n)

# Keep the five most frequent words per sentiment
happy_df.sen_count <- by(happy_df.sen_count, happy_df.sen_count["sentiment"], head, n = 5)
happy_df.sen_count <- Reduce(rbind, happy_df.sen_count)

ggplot2::ggplot(happy_df.sen_count, ggplot2::aes(x = word, y = n, fill = sentiment)) +
  ggplot2::geom_col(show.legend = FALSE) +
  ggplot2::facet_wrap(~sentiment, scales = "free") +
  ggplot2::labs(y = "Contribution to sentiment", x = NULL) +
  ggplot2::coord_flip() +
  ggpubr::theme_pubclean()

Compute and Visualize Count by Reflection Period
happy_df.sen_count <- dplyr::count(happy_df.sen, reflection_period, sentiment, sort = TRUE)
happy_df.sen_count$sentiment <- reorder(happy_df.sen_count$sentiment, happy_df.sen_count$n)

ggplot2::ggplot(happy_df.sen_count, ggplot2::aes(x = sentiment, y = n, fill = reflection_period)) +
  ggplot2::geom_col(position = "dodge") +
  ggplot2::labs(y = "Contribution to sentiment", x = NULL) +
  ggplot2::coord_flip() +
  ggpubr::theme_pubclean()

Compute and Visualize Mean Valence by Reflection Period
happy_df.sen_count <- dplyr::count(happy_df.sen, reflection_period, word, value, sort = TRUE)
happy_df.sen_count$value <- happy_df.sen_count$value * happy_df.sen_count$n
happy_df.sen_agg <- aggregate(cbind(n, value) ~ reflection_period, happy_df.sen_count, sd)
happy_df.sen_agg$mean_value <- happy_df.sen_agg$value / happy_df.sen_agg$n
head(happy_df.sen_agg)

##   reflection_period        n    value mean_value
## 1               24h 1538.305 4560.348   2.964529
## 2                3m 1638.364 4873.171   2.974413

ggplot2::ggplot(happy_df.sen_agg, ggplot2::aes(x = reflection_period, y = mean_value, fill = reflection_period)) +
  ggplot2::geom_col() +
  ggpubr::theme_pubclean()
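
Note that aggregate(..., sd) computes the ratio of two standard deviations rather than a true mean; a direct weighted mean of the AFINN values per reflection period would aggregate with sum instead (a sketch; happy_df.sen_sum is an illustrative name, and its values will differ somewhat from the sd-based ratio above):

happy_df.sen_sum <- aggregate(cbind(n, value) ~ reflection_period, happy_df.sen_count, sum)
happy_df.sen_sum$mean_value <- happy_df.sen_sum$value / happy_df.sen_sum$n  # sum(value * n) / sum(n)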

Tokenize and Visualize by Sentence
happy_df.tidy <- tidytext::unnest_tokens(happy_df, sentence, cleaned_hm, token = "sentences")
happy_df.count <- dplyr::count(happy_df.tidy, sentence, sort = TRUE)
happy_df.count$sentence <- reorder(happy_df.count$sentence, happy_df.count$n)
happy_df.count <- head(happy_df.count, 10)

ggplot2::ggplot(happy_df.count, ggplot2::aes(x = sentence, y = n)) +
  ggplot2::geom_col() +
  ggplot2::coord_flip() +
  ggplot2::scale_x_discrete(labels = function(x) stringr::str_wrap(x, width = 60)) +
  ggpubr::theme_pubclean()

Tokenize and Visualize by N-Gram
happy_df.tidy <- tidytext::unnest_tokens(happy_df, bigram, cleaned_hm, token = "ngrams", n = 2)
happy_df.count <- dplyr::count(happy_df.tidy, bigram, sort = TRUE)
happy_df.count$bigram <- reorder(happy_df.count$bigram, happy_df.count$n)
happy_df.count <- head(happy_df.count, 20)

ggplot2::ggplot(happy_df.count, ggplot2::aes(x = bigram, y = n)) +
  ggplot2::geom_col() +
  ggplot2::coord_flip() +
  ggpubr::theme_pubclean()
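
Because the raw text was tokenized here, many of the top bigrams are stopword pairs. A common follow-up is to split each bigram and filter both positions against the stopword list (a sketch, assuming the tidyr package is installed; bigrams_sep and bigrams_clean are illustrative names):

bigrams_sep <- tidyr::separate(happy_df.tidy, bigram, c("word1", "word2"), sep = " ")
bigrams_clean <- dplyr::filter(bigrams_sep,
                               !word1 %in% tidytext::get_stopwords()$word,
                               !word2 %in% tidytext::get_stopwords()$word)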

Compute Word Pairs and Correlations
word_pair <- widyr::pairwise_count(happy_df.clean, word, wid, sort = TRUE)
head(word_pair, 10)

## # A tibble: 10 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 made happy 2987
## 2 happy made 2987
## 3 made got 2393
## 4 got made 2393
## 5 new got 2311
## 6 got new 2311
## 7 time got 2300
## 8 got time 2300
## 9 got happy 2287
## 10 happy got 2287
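
Each unordered pair appears twice above, once per direction. pairwise_count() forwards extra arguments to its internal helper, so upper = FALSE should keep only one direction per pair (a sketch):

widyr::pairwise_count(happy_df.clean, word, wid, sort = TRUE, upper = FALSE)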

# Correlating every word pair over the full dataset is expensive, so work on a
# random sample of 1,000 token rows; words that co-occur in only one wid then
# correlate perfectly, which is why the trivial correlations of 1 are dropped
word_cor <- widyr::pairwise_cor(happy_df.clean[sample(nrow(happy_df.clean), 1000), ], word, wid, sort = TRUE)

head(word_cor[which(word_cor$correlation != 1), ], 10)

## # A tibble: 10 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 client learned 1.
## 2 attended wheeler 1.
## 3 hardly human 1.
## 4 meet walt 1.
## 5 word area 1.
## 6 normally engineer 1.
## 7 creek content 1.
## 8 force die 1.
## 9 spring neighborhood 1.
## 10 know performed 1.

Visualize Correlations
word_cor <- head(word_cor[which(word_cor$correlation != 1), ], 100)
g <- igraph::graph_from_data_frame(word_cor)

ggraph::ggraph(g, layout = "fr") +
  ggraph::geom_edge_link(ggplot2::aes(edge_alpha = correlation), show.legend = FALSE) +
  ggraph::geom_node_point(color = "lightblue", size = 4) +
  ggraph::geom_node_text(ggplot2::aes(label = name), repel = TRUE) +
  ggplot2::theme_void()
