DATA CLEANING:
Data cleaning is the process of transforming raw data into consistent data that can be analysed.
Data Cleaning using the airquality Dataset:
Install and load the required packages:
library(tidyverse)
library(grid)
library(gridExtra)
library(forcats)
library(modelr)
library(caret)
library(kknn)
1. Load the Data:
airquality            # built-in R dataset of daily New York air-quality measurements
summary(airquality)
Outcome:
     Ozone           Solar.R           Wind             Temp           Month
 Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00   Min.   :5.000
 1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00   1st Qu.:6.000
 Median : 31.50   Median :205.0   Median : 9.700   Median :79.00   Median :7.000
 Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88   Mean   :6.993
 3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00   3rd Qu.:8.000
 Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00   Max.   :9.000
 NA's   :37       NA's   :7
2. Impute the NA's (replace missing values with the column median):
air = airquality
summary(air)
# replace the 37 missing Ozone values with the median of the observed values
air$Ozone = ifelse(is.na(air$Ozone), median(air$Ozone, na.rm = TRUE), air$Ozone)
summary(air)
# replace the 7 missing Solar.R values with the median of the observed values
air$Solar.R = ifelse(is.na(air$Solar.R), median(air$Solar.R, na.rm = TRUE), air$Solar.R)
summary(air)
Outcome:
     Ozone           Solar.R           Wind             Temp           Month
 Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00   Min.   :5.000
 1st Qu.: 21.00   1st Qu.:120.0   1st Qu.: 7.400   1st Qu.:72.00   1st Qu.:6.000
 Median : 31.50   Median :205.0   Median : 9.700   Median :79.00   Median :7.000
 Mean   : 39.56   Mean   :186.8   Mean   : 9.958   Mean   :77.88   Mean   :6.993
 3rd Qu.: 46.00   3rd Qu.:256.0   3rd Qu.:11.500   3rd Qu.:85.00   3rd Qu.:8.000
 Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00   Max.   :9.000
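Since tidyverse is already loaded above, the same median imputation can also be written with dplyr. A minimal sketch, equivalent to the ifelse() calls above (not part of the original code):
library(dplyr)
# Median-impute Ozone and Solar.R in one step; across() applies the same rule to both columns
air_clean <- airquality %>%
  mutate(across(c(Ozone, Solar.R), ~ ifelse(is.na(.x), median(.x, na.rm = TRUE), .x)))
summary(air_clean)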
Reference: https://data.world/cdc/air-quality-measures
WORD CLOUD
DATA: Text of Finance Minister Nirmala Sitharaman's Union Budget 2019 speech
CODE:
library("wordcloud")
library("tm")
library(RColorBrewer)
abcd=readLines("C:/Users/hp/Desktop/rs.txt")
abcd
corpus = Corpus(VectorSource(abcd))
inspect(corpus)
data = tm_map(corpus, content_transformer(tolower))   # lower-case all text
data = tm_map(data, removeNumbers)
data = tm_map(data, removePunctuation)
data = tm_map(data, removeWords, stopwords("english"))
data = tm_map(data, stripWhitespace)                  # continue from data, not corpus
inspect(data)
dtm <- TermDocumentMatrix(data)
dtm
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)              # term frequencies, most frequent first
d <- data.frame(word = names(v), freq = v)
head(d)
wordcloud(d$word, freq = d$freq, random.order = FALSE, colors = brewer.pal(8, "Dark2"))
OUTCOME: word cloud of the most frequent terms in the budget speech (figure not reproduced here).
TOPIC MODELLING:
Topic modelling is a quantitative approach to discovering the abstract topics in a collection of text documents, based on the statistics of the words they contain. Simply put, it is the process of examining a large collection of documents, identifying clusters of words that tend to occur together, grouping them by similarity, and using these clusters to identify the themes that run through the corpus.
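Before the Twitter example below, a minimal sketch of the idea using the topicmodels package on a tiny hand-made corpus (the three documents and k = 2 are illustrative assumptions, not part of the original data):
library(tm)
library(topicmodels)
# Tiny illustrative corpus: two loose themes (finance vs. weather)
docs <- c("budget tax revenue growth economy",
          "rain wind temperature humidity forecast",
          "tax economy budget deficit revenue")
dtm <- DocumentTermMatrix(Corpus(VectorSource(docs)))
# Fit an LDA model with 2 topics and inspect it
lda <- LDA(dtm, k = 2, control = list(seed = 1234))
terms(lda, 3)   # most probable words in each topic
topics(lda)     # most likely topic for each document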
Twitter Data Analysis
SOURCE CODE:
1. Install and load packages:
library("twitteR")
install.packages("tm")
library("tm")
install.packages("wordcloud")
library("wordcloud")
install.packages("RColorBrewer")
library("RColorBrewer")
install.packages("slam")
library("slam")
install.packages("topicmodels")
library("topicmodels")
2. Load the data and clean the data in the R environment
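The object tweee used below is assumed to already hold the raw tweet text as a character vector. A minimal sketch of one way to populate it with twitteR (the search term and count are illustrative assumptions, and a Twitter API token must already be registered with setup_twitter_oauth()):
raw_tweets <- searchTwitter("Trump", n = 500, lang = "en")   # hypothetical query
tweee <- sapply(raw_tweets, function(x) x$getText())         # keep only the tweet text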
tweee
# remove retweet/via markers, links, @mentions, extra whitespace and punctuation
tweee = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", tweee)
tweee = gsub("http[^[:blank:]]+", "", tweee)
tweee = gsub("@\\w+", "", tweee)
tweee = gsub("[ \t]{2,}", " ", tweee)
tweee = gsub("^\\s+|\\s+$", "", tweee)
tweee = gsub("[[:punct:]]", "", tweee)
corpus = Corpus(VectorSource(tweee))
corpus = tm_map(corpus, removePunctuation)
corpus = tm_map(corpus, stripWhitespace)
corpus = tm_map(corpus, content_transformer(tolower))
corpus = tm_map(corpus, removeWords, stopwords("english"))
Create a Document-Term Matrix and Calculate TF-IDF
tdm = DocumentTermMatrix(corpus)   # one row per tweet, one column per term
term_tfidf <- tapply(tdm$v / row_sums(tdm)[tdm$i], tdm$j, mean) * log2(nDocs(tdm) / col_sums(tdm > 0))
summary(term_tfidf)
tdm <- tdm[, term_tfidf >= 0.1]    # keep only reasonably informative terms
tdm <- tdm[row_sums(tdm) > 0, ]    # drop tweets left with no terms
summary(col_sums(tdm))
best.model <- lapply(seq(2, 50, by = 1), function(d){LDA(tdm, d)})         # fit LDA for k = 2..50
best.model.logLik <- as.data.frame(as.matrix(lapply(best.model, logLik)))  # log-likelihood of each fit
doc.lengths <- rowSums(as.matrix(DocumentTermMatrix(corpus)))
dtm <- DocumentTermMatrix(corpus[doc.lengths > 0])                         # rebuild DTM from non-empty tweets
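The code above stores the log-likelihood of each fit in best.model.logLik but does not show how the number of topics is then chosen. A minimal sketch of one common way to inspect it (the plotting code is an assumption, not from the original):
# Pair each candidate number of topics with its log-likelihood and look for the maximum
logLik.df <- data.frame(topics = 2:50, LL = as.numeric(unlist(best.model.logLik)))
plot(logLik.df$topics, logLik.df$LL, type = "b",
     xlab = "number of topics (k)", ylab = "log-likelihood")
logLik.df$topics[which.max(logLik.df$LL)]   # candidate value for k (the outputs below use k = 10)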
3. Calculate the optimal number of topics (k) in the corpus and apply the LDA method using the topicmodels package
k <- 10        # number of topics; 10 matches the model output shown below
SEED <- 2019   # any fixed seed for reproducibility (value assumed, not given in the original)
models <- list(
  CTM       = CTM(dtm, k = k, control = list(seed = SEED, var = list(tol = 10^-4), em = list(tol = 10^-3))),
  VEM       = LDA(dtm, k = k, control = list(seed = SEED)),
  VEM_Fixed = LDA(dtm, k = k, control = list(estimate.alpha = FALSE, seed = SEED)),
  Gibbs     = LDA(dtm, k = k, method = "Gibbs", control = list(seed = SEED, burnin = 1000, thin = 100, iter = 1000)))
lapply(models, terms, 10)              # top 10 terms per topic for each model
assignments <- sapply(models, topics)  # most likely topic for each tweet
head(assignments, n = 10)
OUTCOME:
$CTM
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8
[1,] "true" "46562444" "false" "1990" "46562444" "false" "82278" "82278"
[2,] "ctrue" "46562447" "cfalse" "6973" "46562534" "cfalse" "82286" "82277"
[3,] "82281" "46562434" "c45" "586" "46562449" "586" "82285" "82286"
[4,] "82278" "46562443" "6973" "c24" "46562489" "c45" "82290" "82291"
[5,] "82277" "46562441" "1990" "3270" "46562443" "1203" "82275" "82280"
[6,] "82291" "46562449" "586" "7945" "46562432" "46562463" "82288" "82290"
[7,] "82288" "46562445" "1203" "1203" "46562441" "46562444" "82291" "82281"
[8,] "82290" "46562463" "3270" "c45" "46562483" "46562500" "82281" "82285"
[9,] "82275" "46562573" "c24" "36760" "46562560" "46562545" "82280" "82269"
[10,] "82269" "46562473" "7945" "46562556" "46562545" "1990" "82287" "82287"
Topic 9 Topic 10
[1,] "46562444" "36760"
[2,] "46562449" "11022"
[3,] "46562550" "5244"
[4,] "46562443" "20966"
[5,] "46562434" "c36760"
[6,] "46562547" "10403"
[7,] "46562473" "46562441"
[8,] "46562560" "46562534"
[9,] "46562463" "46562550"
[10,] "46562447" "46562443"
$VEM
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7
[1,] "586" "82278" "586" "46562444" "false" "1990" "46562502"
[2,] "c45" "82286" "1203" "46562443" "36760" "6973" "46562444"
[3,] "1203" "82291" "36760" "46562449" "11022" "7945" "46562534"
[4,] "1990" "82281" "1990" "46562534" "5244" "c24" "46562443"
[5,] "82278" "82277" "82278" "46562485" "cfalse" "3270" "46562531"
[6,] "82286" "82290" "46562502" "46562560" "10403" "586" "46562447"
[7,] "82277" "82287" "82291" "46562473" "20966" "c45" "46562464"
[8,] "46562444" "82275" "c45" "46562459" "c36760" "1203" "46562449"
[9,] "3270" "82288" "46562444" "46562477" "586" "46562531" "46562550"
[10,] "false" "82280" "82286" "46562433" "46562502" "46562502" "46562560"
Topic 8 Topic 9 Topic 10
[1,] "true" "46562434" "82278"
[2,] "ctrue" "46562447" "82285"
[3,] "586" "46562444" "82286"
[4,] "c45" "46562550" "82290"
[5,] "1203" "46562449" "82275"
[6,] "36760" "46562441" "82288"
[7,] "1990" "46562463" "82280"
[8,] "false" "46562483" "82281"
[9,] "3270" "46562432" "82277"
[10,] "c24" "46562445" "82292"
$VEM_Fixed
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8
[1,] "82278" "82278" "46562502" "46562444" "false" "1990" "46562502" "true"
[2,] "82286" "82286" "46562444" "46562443" "36760" "6973" "46562444" "ctrue"
[3,] "1990" "82291" "46562450" "46562449" "11022" "586" "46562534" "586"
[4,] "586" "82281" "46562447" "46562534" "5244" "7945" "46562443" "c45"
[5,] "c45" "82277" "46562434" "46562485" "cfalse" "c24" "46562531" "1990"
[6,] "1203" "82290" "46562449" "46562560" "10403" "3270" "46562447" "1203"
[7,] "82277" "82287" "46562464" "46562473" "20966" "c45" "46562464" "36760"
[8,] "82288" "82275" "46562499" "46562459" "c36760" "1203" "46562449" "3270"
[9,] "82285" "82288" "46562534" "46562477" "586" "46562531" "46562560" "c24"
[10,] "82287" "82280" "46562556" "46562433" "1990" "46562502" "46562550" "6973"
Topic 9 Topic 10
[1,] "46562434" "82278"
[2,] "46562447" "82285"
[3,] "46562444" "82286"
[4,] "46562550" "82290"
[5,] "46562449" "82275"
[6,] "46562441" "82288"
[7,] "46562463" "82280"
[8,] "46562483" "82281"
[9,] "46562432" "82277"
[10,] "46562445" "82292"
$Gibbs
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7
[1,] "36760" "82278" "false" "46562449" "true" "46562441" "1990"
[2,] "11022" "82286" "cfalse" "46562447" "cfalse" "46562452" "586"
[3,] "5244" "82290" "1742667" "46562483" "false" "46551946" "6973"
[4,] "20966" "82285" "2449959" "46562550" "1742667" "1742667" "1203"
[5,] "c36760" "82291" "465128" "46562534" "2449959" "46562466" "c45"
[6,] "cfalse" "82275" "46551916" "46562547" "465128" "46562502" "2185"
[7,] "false" "82281" "46551946" "46562432" "46551916" "46562445" "36760"
[8,] "1742667" "82288" "46551959" "46562502" "46551946" "46562449" "3270"
[9,] "2449959" "82280" "46562401" "46562579" "46551959" "46562484" "7945"
[10,] "465128" "82277" "46562405" "46562472" "46562401" "46562485" "c24"
Topic 8 Topic 9 Topic 10
[1,] "46562447" "46562445" "46562444"
[2,] "46562472" "46562556" "46562443"
[3,] "46562531" "46562563" "46562434"
[4,] "46562539" "46562477" "46562441"
[5,] "46562477" "46562450" "46562473"
[6,] "46562528" "46562545" "46562560"
[7,] "46562450" "46562454" "46562463"
[8,] "46562483" "46562460" "46562489"
[9,] "46562521" "46562501" "46562485"
[10,] "46562548" "46562535" "46562555
CTM VEM VEM_Fixed Gibbs
1 6 5 5 3
2 9 9 9 10
3 4 1 1 7
4 7 10 10 2
5 10 5 5 1
6 4 6 6 7
7 1 8 8 5
REFERENCE: https://github.com/mkearney/trumptweets
SENTIMENT ANALYSIS
Sentiment Analysis is a method for extracting views of different polarities from text; by polarities we mean positive, negative or neutral. It is also known as opinion mining or polarity detection. With the aid of sentiment analysis you can find out the kind of opinion expressed in reports, blogs, social media feeds, and so on. Sentiment Analysis is a classification method in which the data are categorised into various classes. These classes may be binary (positive or negative) or multiple (happy, sad, angry, etc.).
SOURCE CODE:
Required packages:
library(twitteR)
library(sentiment)      # provides classify_emotion() and classify_polarity(); no longer on CRAN
library(plyr)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
some_tweets = searchTwitter("Trump", n=1500, lang="en")
some_txt = sapply(some_tweets, function(x) x$getText())
Clean data
> some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
> some_txt = gsub("@\\w+", "", some_txt)
> # remove punctuation
> some_txt = gsub("[[:punct:]]", "", some_txt)
> # remove numbers
> some_txt = gsub("[[:digit:]]", "", some_txt)
> # remove html links
> some_txt = gsub("http\\w+", "", some_txt)
> # remove unnecessary spaces
> some_txt = gsub("[ \t]{2,}", "", some_txt)
> some_txt = gsub("^\\s+|\\s+$", "", some_txt)
Classify emotion
> class_emo = classify_emotion(some_txt, algorithm="bayes", prior=1.0)
> emotion = class_emo[,7]
> emotion[is.na(emotion)] = "unknown"
> # classify polarity
> class_pol = classify_polarity(some_txt, algorithm="bayes")
> # get polarity best fit
> polarity = class_pol[,4]
> # data frame with results
> sent_df = data.frame(text=some_txt, emotion=emotion,
+ polarity=polarity, stringsAsFactors=FALSE)
> sent_df = within(sent_df,
+ emotion <- factor(emotion, levels=names(sort(table(emotion), decreasing=TRUE))))
> head(sent_df)
Plot distribution of emotions
> ggplot(sent_df, aes(x=emotion)) +
+ geom_bar(aes(y=..count.., fill=emotion)) +
+ scale_fill_brewer(palette="Dark2") +
+ labs(x="emotion categories", y="number of tweets") +
+ ggtitle("Sentiment Analysis of Tweets about Trump\n(classification by emotion)")
Plot distribution of polarity
> ggplot(sent_df, aes(x=polarity)) +
+ geom_bar(aes(y=..count.., fill=polarity)) +
+ scale_fill_brewer(palette="RdGy") +
+ labs(x="polarity categories", y="number of tweets") +
+ ggtitle("Sentiment Analysis of Tweets about Starbucks\n(classification by polarity)")
# separating text by emotion
> emos = levels(factor(sent_df$emotion))
> nemo = length(emos)
> emo.docs = rep("", nemo)
> for (i in 1:nemo)
+{
+ tmp = some_txt[emotion == emos[i]]
+ emo.docs[i] = paste(tmp, collapse=" ")
+}
>
> # remove stopwords
> emo.docs = removeWords(emo.docs, stopwords("english"))
> # create corpus
> corpus = Corpus(VectorSource(emo.docs))
> tdm = TermDocumentMatrix(corpus)
> tdm = as.matrix(tdm)
> colnames(tdm) = emos
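The term-document matrix built above has one column per emotion, which is typically visualised with a comparison cloud. A minimal sketch of that final step (not shown in the original code):
> # comparison word cloud: a panel of characteristic words per emotion category
> comparison.cloud(tdm, colors = brewer.pal(max(3, nemo), "Dark2"),
+                  scale = c(3, 0.5), random.order = FALSE, title.size = 1.5)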
OUTCOME:
The bar graphs above depict the Twitter users' sentiment towards Trump: a negative polarity score, denoted by the (-) symbol, indicates unhappiness with Trump's statements, a positive score indicates that users are quite happy, and zero represents a neutral stance.