
8_Textual Analysis_2020

Lennard Schmidt
5/10/2020

require("dplyr")
require("tidytext")
require("textdata")
require("widyr")

Load Dataset
happy_df <- read.csv("./happydb.csv", sep = ";")
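
With R versions before 4.0, read.csv() turns strings into factors by default, which is why cleaned_hm is coerced back to character below. A minimal alternative that avoids the coercion (note it would also leave reflection_period and hm_category as character rather than factor):

happy_df <- read.csv("./happydb.csv", sep = ";", stringsAsFactors = FALSE)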

Prepare Dataframe and Check for NA


# Drop unused columns, coerce the text column to character, and rename
happy_df <- happy_df[, -c(1, 4, 6, 8)]
happy_df$cleaned_hm <- as.character(happy_df$cleaned_hm)
colnames(happy_df) <- c("wid", "reflection_period", "cleaned_hm", "num_sentence", "hm_category")

# Count missing values per column
sapply(happy_df, function(x) sum(is.na(x)))

##               wid reflection_period        cleaned_hm      num_sentence
##                 0                 0                 0                 0
##       hm_category
##                 0

Investigate Variables
skimr::skim(happy_df)

Data summary
Name happy_df
Number of rows 100535
Number of columns 5
_______________________
Column type frequency:
character 1
factor 2
numeric 2
________________________
Group variables None
Variable type: character

skim_variable n_missing complete_rate min  max empty n_unique whitespace
cleaned_hm            0             1   6 6532     0    96481        365

Variable type: factor

skim_variable     n_missing complete_rate ordered n_unique top_counts
reflection_period         0             1 FALSE          2 3m: 50704, 24h: 49831
hm_category               0             1 FALSE          7 aff: 34168, ach: 33993, enj: 11144, bon: 10727

Variable type: numeric

skim_variable n_missing complete_rate    mean      sd p0 p25  p50  p75  p100 hist
wid                   0             1 2746.62 3535.01  1 410 1125 3507 13839 ▇▂▁▁▁
num_sentence          0             1    1.34    1.30  1   1    1    1    69 ▇▁▁▁▁

Tokenize by Word
happy_df.tidy <- tidytext::unnest_tokens(happy_df, word, cleaned_hm)
dplyr::count(happy_df.tidy, word, sort = TRUE)
## # A tibble: 27,549 x 2
## word n
## <chr> <int>
## 1 i 104285
## 2 my 74039
## 3 a 71630
## 4 to 57289
## 5 and 56695
## 6 the 52502
## 7 was 33927
## 8 for 27126
## 9 in 26429
## 10 me 25540
## # ... with 27,539 more rows
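
By default, unnest_tokens() lowercases the text and strips punctuation; if case mattered, it could be preserved via the to_lower argument (a sketch):

tidytext::unnest_tokens(happy_df, word, cleaned_hm, to_lower = FALSE)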

Remove Stopwords
happy_df.clean <- dplyr::anti_join(happy_df.tidy, tidytext::get_stopwords())

## Joining, by = "word"

dplyr::count(happy_df.clean, word, sort = TRUE)

## # A tibble: 27,381 x 2
## word n
## <chr> <int>
## 1 happy 18732
## 2 got 13378
## 3 made 11435
## 4 went 9616
## 5 time 9328
## 6 new 8870
## 7 day 8048
## 8 work 7864
## 9 last 6391
## 10 good 5851
## # ... with 27,371 more rows
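
Generic stopwords are gone, but domain terms such as "happy" and "happiest" still dominate. Instead of filtering them manually in the steps below, they could be appended to the stopword lexicon before the anti-join (a sketch; my_stopwords and happy_df.clean2 are illustrative names):

my_stopwords <- dplyr::bind_rows(
  tidytext::get_stopwords(),
  dplyr::tibble(word = c("happy", "happiest", "happiness"), lexicon = "custom")
)
happy_df.clean2 <- dplyr::anti_join(happy_df.tidy, my_stopwords, by = "word")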

Compute and Visualize Word Counts
happy_df.count <- dplyr::count(happy_df.clean, word, sort = TRUE)
happy_df.count <- happy_df.count[which(happy_df.count$word != "happy" &
                                       happy_df.count$word != "happiest"), ]
happy_df.count$word <- reorder(happy_df.count$word, happy_df.count$n)
happy_df.count <- head(happy_df.count, 20)

ggplot2::ggplot(happy_df.count, ggplot2::aes(x = word, y = n)) +
  ggplot2::geom_col() +
  ggplot2::coord_flip() +
  ggpubr::theme_pubclean()

Compute TF-IDF
happy_df.count <- dplyr::count(happy_df.clean, wid, word, sort = TRUE)
happy_df.count <- happy_df.count[which(happy_df.count$n > 10 &
                                       happy_df.count$word != "happy" &
                                       happy_df.count$word != "happiest" &
                                       happy_df.count$word != "happiness" &
                                       nchar(happy_df.count$word) > 3), ]
head(happy_df.count)

## # A tibble: 6 x 3
## wid word n
## <int> <chr> <int>
## 1 280 life 123
## 2 120 event 92
## 3 280 good 89
## 4 954 time 84
## 5 40 went 80
## 6 5 made 79

tidytext::bind_tf_idf(happy_df.count, word, wid, n)

## # A tibble: 2,256 x 6
## wid word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 280 life 123 0.370 2.51 0.930
## 2 120 event 92 0.262 3.69 0.968
## 3 280 good 89 0.268 2.64 0.708
## 4 954 time 84 0.0617 1.80 0.111
## 5 40 went 80 0.0458 1.70 0.0779
## 6 5 made 79 0.745 1.84 1.37
## 7 455 time 76 0.0285 1.80 0.0511
## 8 40 thank 75 0.0429 6.40 0.275
## 9 954 roti 73 0.0536 6.40 0.343
## 10 55 really 72 0.270 2.74 0.738
## # ... with 2,246 more rows
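
bind_tf_idf() treats each wid as a document: tf is a word's share of its document's (filtered) word count, and idf is the natural log of the number of documents divided by the number of documents containing the word. The same quantities can be computed by hand on the filtered counts (a minimal sketch; n_docs and happy_df.tfidf are illustrative names):

n_docs <- dplyr::n_distinct(happy_df.count$wid)
happy_df.tfidf <- happy_df.count %>%
  dplyr::group_by(wid) %>%
  dplyr::mutate(tf = n / sum(n)) %>%                             # term frequency within each wid
  dplyr::group_by(word) %>%
  dplyr::mutate(idf = log(n_docs / dplyr::n_distinct(wid))) %>%  # inverse document frequency
  dplyr::ungroup() %>%
  dplyr::mutate(tf_idf = tf * idf)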

Visualize as Word Cloud
wordcloud::wordcloud(happy_df.count$word, happy_df.count$n, min.freq = 1, max.words = 100, random.order = FALSE)
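
Note that happy_df.count still holds one row per (wid, word) pair here, so common words enter the cloud multiple times; aggregating first would give a single frequency per word (a sketch; word_totals is an illustrative name):

word_totals <- aggregate(n ~ word, happy_df.count, sum)
wordcloud::wordcloud(word_totals$word, word_totals$n, max.words = 100, random.order = FALSE)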

Join Sentiment Dictionary and Visualize Sentiment Counts
happy_df.sen <- dplyr::inner_join(happy_df.clean, tidytext::get_sentiments("nrc"), by = "word")
happy_df.sen <- dplyr::inner_join(happy_df.sen, tidytext::get_sentiments("afinn"), by = "word")
head(happy_df.sen, 10)
##     wid reflection_period num_sentence hm_category       word    sentiment value
## 1  2053               24h            1   affection successful anticipation     3
## 2  2053               24h            1   affection successful          joy     3
## 3  2053               24h            1   affection successful     positive     3
## 4  2053               24h            1   affection successful        trust     3
## 5  2053               24h            1   affection   sympathy     positive     2
## 6  2053               24h            1   affection   sympathy      sadness     2
## 7     2               24h            1   affection      happy anticipation     3
## 8     2               24h            1   affection      happy          joy     3
## 9     2               24h            1   affection      happy     positive     3
## 10    2               24h            1   affection      happy        trust     3

Because NRC assigns several sentiments to a word, the inner join duplicates each word once per sentiment and repeats its AFINN value accordingly (visible for "successful" above).

happy_df.sen_count <- dplyr::count(happy_df.sen, sentiment, word, sort = TRUE)
happy_df.sen_count$word <- reorder(happy_df.sen_count$word, happy_df.sen_count$n)

# Keep the five most frequent words per sentiment
happy_df.sen_count <- by(happy_df.sen_count, happy_df.sen_count["sentiment"], head, n = 5)
happy_df.sen_count <- Reduce(rbind, happy_df.sen_count)

ggplot2::ggplot(happy_df.sen_count, ggplot2::aes(x = word, y = n, fill = sentiment)) +
  ggplot2::geom_col(show.legend = FALSE) +
  ggplot2::facet_wrap(~sentiment, scales = "free") +
  ggplot2::labs(y = "Contribution to sentiment", x = NULL) +
  ggplot2::coord_flip() +
  ggpubr::theme_pubclean()

Compute and Visualize Count by Reflection Period
happy_df.sen_count <- dplyr::count(happy_df.sen, reflection_period, sentiment, sort = TRUE)
happy_df.sen_count$sentiment <- reorder(happy_df.sen_count$sentiment, happy_df.sen_count$n)

ggplot2::ggplot(happy_df.sen_count, ggplot2::aes(x = sentiment, y = n, fill = reflection_period)) +
  ggplot2::geom_col(position = "dodge") +
  ggplot2::labs(y = "Contribution to sentiment", x = NULL) +
  ggplot2::coord_flip() +
  ggpubr::theme_pubclean()

Compute and Visualize Mean Valence by Reflection Period
happy_df.sen_count <- dplyr::count(happy_df.sen, reflection_period, word, value, sort = TRUE)
happy_df.sen_count$value <- happy_df.sen_count$value * happy_df.sen_count$n
happy_df.sen_agg <- aggregate(cbind(n, value) ~ reflection_period, happy_df.sen_count, sd)
happy_df.sen_agg$mean_value <- happy_df.sen_agg$value / happy_df.sen_agg$n
head(happy_df.sen_agg)

##   reflection_period        n    value mean_value
## 1               24h 1538.305 4560.348   2.964529
## 2                3m 1638.364 4873.171   2.974413

ggplot2::ggplot(happy_df.sen_agg, ggplot2::aes(x = reflection_period, y = mean_value, fill = reflection_period)) +
  ggplot2::geom_col() +
  ggpubr::theme_pubclean()
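
Note that aggregate(..., sd) computes the ratio of two standard deviations rather than a true mean; a direct weighted mean of the AFINN values per reflection period would aggregate with sum instead (a sketch; happy_df.sen_sum is an illustrative name, and its values will differ somewhat from the sd-based ratio above):

happy_df.sen_sum <- aggregate(cbind(n, value) ~ reflection_period, happy_df.sen_count, sum)
happy_df.sen_sum$mean_value <- happy_df.sen_sum$value / happy_df.sen_sum$n  # sum(value * n) / sum(n)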

Tokenize and Visualize by Sentence
happy_df.tidy <- tidytext::unnest_tokens(happy_df, sentence, cleaned_hm, token = "sentences")
happy_df.count <- dplyr::count(happy_df.tidy, sentence, sort = TRUE)
happy_df.count$sentence <- reorder(happy_df.count$sentence, happy_df.count$n)
happy_df.count <- head(happy_df.count, 10)

ggplot2::ggplot(happy_df.count, ggplot2::aes(x = sentence, y = n)) +
  ggplot2::geom_col() +
  ggplot2::coord_flip() +
  ggplot2::scale_x_discrete(labels = function(x) stringr::str_wrap(x, width = 60)) +
  ggpubr::theme_pubclean()

Tokenize and Visualize by N-Gram
happy_df.tidy <- tidytext::unnest_tokens(happy_df, bigram, cleaned_hm, token = "ngrams", n = 2)
happy_df.count <- dplyr::count(happy_df.tidy, bigram, sort = TRUE)
happy_df.count$bigram <- reorder(happy_df.count$bigram, happy_df.count$n)
happy_df.count <- head(happy_df.count, 20)

ggplot2::ggplot(happy_df.count, ggplot2::aes(x = bigram, y = n)) +
  ggplot2::geom_col() +
  ggplot2::coord_flip() +
  ggpubr::theme_pubclean()
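
Because the raw text was tokenized here, many of the top bigrams are stopword pairs. A common follow-up is to split each bigram and filter both positions against the stopword list (a sketch, assuming the tidyr package is installed; bigrams_sep and bigrams_clean are illustrative names):

bigrams_sep <- tidyr::separate(happy_df.tidy, bigram, c("word1", "word2"), sep = " ")
bigrams_clean <- dplyr::filter(bigrams_sep,
                               !word1 %in% tidytext::get_stopwords()$word,
                               !word2 %in% tidytext::get_stopwords()$word)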

Compute Word Pairs and Correlations
word_pair <- widyr::pairwise_count(happy_df.clean, word, wid, sort = TRUE)
head(word_pair, 10)

## # A tibble: 10 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 made happy 2987
## 2 happy made 2987
## 3 made got 2393
## 4 got made 2393
## 5 new got 2311
## 6 got new 2311
## 7 time got 2300
## 8 got time 2300
## 9 got happy 2287
## 10 happy got 2287
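
Each unordered pair appears twice above, once per direction. pairwise_count() forwards extra arguments to its internal helper, so upper = FALSE should keep only one direction per pair (a sketch):

widyr::pairwise_count(happy_df.clean, word, wid, sort = TRUE, upper = FALSE)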

# Correlating every word pair over the full dataset is expensive, so work on a
# random sample of 1,000 token rows; words that co-occur in only one wid then
# correlate perfectly, which is why the trivial correlations of 1 are dropped
word_cor <- widyr::pairwise_cor(happy_df.clean[sample(nrow(happy_df.clean), 1000), ], word, wid, sort = TRUE)

head(word_cor[which(word_cor$correlation != 1), ], 10)

## # A tibble: 10 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 client learned 1.
## 2 attended wheeler 1.
## 3 hardly human 1.
## 4 meet walt 1.
## 5 word area 1.
## 6 normally engineer 1.
## 7 creek content 1.
## 8 force die 1.
## 9 spring neighborhood 1.
## 10 know performed 1.

Visualize Correlations
word_cor <- head(word_cor[which(word_cor$correlation != 1), ], 100)
g <- igraph::graph_from_data_frame(word_cor)

ggraph::ggraph(g, layout = "fr") +
  ggraph::geom_edge_link(ggplot2::aes(edge_alpha = correlation), show.legend = FALSE) +
  ggraph::geom_node_point(color = "lightblue", size = 4) +
  ggraph::geom_node_text(ggplot2::aes(label = name), repel = TRUE) +
  ggplot2::theme_void()
