R Language Basics: Introductory Commands (Tips)

This document provides code for web scraping articles from China Daily about poverty in China, analyzing the text content, and generating network graphs of character connections in the novel "A Tale of Two Cities". It includes code for scraping RSS feeds from Google News, cleaning text corpora, analyzing word frequencies, and generating co-occurrence networks. Examples of network centrality measures like degree and betweenness centrality are also calculated on patent inventor data and character networks.


Reading: R for Data Science

Using R programs to scrape keyword data from web pages

Areas and ways in which the R language is used

(These points can be used in the background section of a paper.)

Instructor's in-class code notes (original notes):
for (n in 1:5) { #This is my first loop
  print(n)
  print(paste0("This is sentence number ", n))
}

3+3

install.packages("tidyRSS")
install.packages("rvest")

options(timeout=600)
download.file('https://pstek.nl/psfiles/two-cities.zip', 'two-cities.zip')
unzip('two-cities.zip')

setwd("~/aei-test")

data <- read.csv("two-cities-characters.csv")
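A quick inspection of the character table might look like this (a minimal check; the file is assumed to contain the name and label columns used later in Session 3):

head(data) #show the first few character names and their labels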

<html>
<body>
<h1>This is a heading</h1>
<p>This is a paragraph</p>
<a href= "https://asb.edu.my">ASB website</a>

</body>
</html>

# Session 1 & 2: Google News Article Scraping (UM AEI 16 Dec 2023)

## SESSION 1: WEB SCRAPING

# Setting things up...


setwd('~/aei-test') #this is the working directory

#Write a query...
query <- 'milei site:batimes.com.ar'

#Then we 'rephrase it' so that we can extract the articles via RSS
url <- paste0('https://news.google.com/rss/search?q=', URLencode(query, reserved= T), '&hl=en-MY&gl=MY&ceid=MY:en')

library(tidyRSS) #load package to clean up RSS feeds


articles <- tidyfeed(url) #download RSS feed from Google News (maximum 100 articles per request)
#If you want more than 100 articles, do multiple queries and then use rbind() to combine them
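For instance, a second query could be downloaded and stacked onto the first (a minimal sketch; the second query string below is only an illustrative assumption):

query2 <- 'inflation site:batimes.com.ar' #hypothetical second query
url2 <- paste0('https://news.google.com/rss/search?q=', URLencode(query2, reserved= T), '&hl=en-MY&gl=MY&ceid=MY:en')
articles2 <- tidyfeed(url2) #download the second RSS feed
articles <- rbind(articles, articles2) #stack both result sets into one data frame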

#STOP: check your articles...
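A quick check might look like this (a small sketch using the item_title column that tidyRSS returns):

nrow(articles) #number of articles returned (at most 100)
head(articles$item_title) #peek at the first few headlines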


titles <- articles$item_title #save titles for further analysis

#NEXT: Scrape the paragraphs of the first 5 articles...


library(rvest) #load package to scrape HTML
library(httr) #alternative package using the 'GET' command

paragraphs <- c() #this is an empty vector in which we will store our paragraphs
for (n in 1:5) { #this is a for-loop
  #html <- read_html(articles$item_link[n]) #load the website and 'decode' it (time-out after 10 seconds)
  html <- read_html(GET(articles$item_link[n], timeout(30))) #alternative to the line above, allowing for a longer time-out
  url <- html_text(html_nodes(html, 'a')) #obtain the URL that Google News forwards to (marked by <a> in the HTML)
  html <- read_html(GET(url, timeout(30))) #alternative code as above, allowing for a longer time-out
  para <- html_text(html_nodes(html, 'p')) #select the paragraphs (marked by <p> in the HTML)
  paragraphs <- c(paragraphs, para) #add the newly scraped paragraphs to the vector of paragraphs
  Sys.sleep(10) #take a break (10 seconds)
}

#Now, do some cleaning of the data


para_clean <- as.data.frame(table(paragraphs)); para_clean <- subset(para_clean, Freq < 2) #remove repeated paragraphs
write.csv(para_clean, 'news-corpus.csv', row.names= F) #write your paragraphs to a file (CSV)
#manually remove lines at the top of the CSV (containing dates, key words, advertisements, etc.)
para_clean <- read.csv('news-corpus.csv') #re-load the corpus after cleaning it manually
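As a complement to the manual step above, very short paragraphs (often menus, bylines or cookie notices) can be dropped programmatically first; the 100-character cut-off below is an arbitrary assumption and may need tuning for each website:

para_clean <- subset(para_clean, nchar(as.character(paragraphs)) > 100) #keep only paragraphs longer than 100 characters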

Other example queries:

cabinet reshuffle location:malaysia site:malaymail.com
location:malaysia site:chinadaily.com.cn
poverty location:kenya site:nation.africa
blackpink site:allkpop.com

## SESSION 2: CONTENT/TEXT ANALYSIS

#Load the corpus


library(quanteda)
para_clean$paragraphs <- as.character(para_clean$paragraphs) #make sure paragraphs are 'characters'
#mycorpus <- corpus(para_clean$paragraphs) #load paragraphs
mycorpus <- corpus(titles) #or load titles

#Clean and standardize the corpus (generate tokens)


mytokens <- tokens(mycorpus, remove_punct = TRUE, remove_numbers = TRUE) #remove numbers and punctuation
mytokens <- tokens_select(mytokens, stopwords('en', source='stopwords-iso'), selection='remove') #remove stopwords
mytokens <- tokens_wordstem(mytokens) #condense words to wordstems
mytokens <- tokens_tolower(mytokens) #make everything lower-case
#mytokens <- tokens_select(mytokens, c('milei', 'milei\'s', 'javier', 'buenos', 'aires', 'times', 'argentina'), selection='remove') #remove other words
mydfm <- dfm(mytokens) #generate a frequency matrix

#Analysis - Word Frequency


wfreq <-topfeatures(mydfm, 100) %>% as.data.frame() #100 most frequently occurring words (topics)

library(ggplot2) #generate a word frequency graph


wfreq <-topfeatures(mydfm, 10) %>% as.data.frame() #10 most frequently occurring words (topics)
wfreq$n <- as.numeric(wfreq$.)
wfreq$word <- row.names(wfreq)
ggplot(wfreq, aes(x = reorder(word, n, function(n) -n), y = n)) + geom_bar(stat = "identity") + xlab('')

library(quanteda.textplots) #generate a word frequency cloud


set.seed(123); textplot_wordcloud(mydfm, max_words = 100)

#Analysis - Co-word Analysis


myfcm <- fcm(mydfm) #generate a co-word matrix
dim(myfcm)

feat <- names(topfeatures(myfcm, 50)) #reduce the size of the co-word matrix
myselect <- fcm_select(myfcm, pattern = feat, selection = "keep")
dim(myselect)

size <- log(colSums(dfm_select(mydfm, feat, selection = "keep"))) #calculate weight of co-word connections
set.seed(112)
textplot_network(myselect, min_freq = 0.8, vertex_size = size / max(size) * 3) #generate a co-word network

jpeg('cowords.jpg', width = 500, height = 500) #save the picture (open the file)
textplot_network(myselect, min_freq = 0.8, vertex_size = size / max(size) * 3) #generate a co-word network
dev.off() #close the file

## Session 3: Network Analysis on the Characters of "A Tale of Two Cities" (UM AEI 16 Dec 2023)

download.file('https://pstek.nl/psfiles/two-cities.zip', 'two-cities.zip') #Original text from https://www.gutenberg.org/cache/epub/98/pg98.txt
unzip('two-cities.zip')

## STEP 1: GENERATING THE CORPUS

# Loading the file as a single vector (maybe not a good idea with a very large text...)
filepath <- 'a_tale_of_two_cities.txt'
all_lines <- c()
con <- file(filepath, "r")
while (TRUE) {
  line <- readLines(con, n = 1)
  if (length(line) == 0) {
    break
  }
  all_lines <- c(all_lines, line)
}
close(con)
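For a text of this size, the same vector can also be built in a single call instead of the line-by-line loop (a simpler sketch with the same result):

all_lines <- readLines(filepath) #read the whole file at once into a character vector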

# Combining chapters and their paragraphs into a single line


chap <- c()
para <- c()
for (n in all_lines) {
  if (grepl('CHAPTER', n, ignore.case= F)) {
    print(n)
    add_para <- paste(para, collapse = ' ')
    chap <- c(chap, add_para)
    para <- c()
  }
  para <- c(para, n)
}

df <- data.frame('chap'= chap[47:90]) #select only the chapters (not the introductions, etc.)

## STEP 2: IDENTIFY THE CHARACTERS IN THE CORPUS


#Replace names with character labels
characters <- read.csv('two-cities-characters.csv') #Based on: https://www.sparknotes.com/lit/a-tale-of-two-cities/characters/
for (n in 1:nrow(characters)) {
  df$chap <- gsub(characters$name[n], characters$label[n], df$chap)
}

#Isolate the characters: the regular expression keeps only the character labels and deletes every other character in the chapter text
df$characters <- gsub(paste0('(', paste0(characters$label, collapse= '|'), ')|.'), "\\1", df$chap)

library(quanteda)
mycorpus <- corpus(df$characters) #load as corpus
mytokens <- tokens(mycorpus) #generate tokens
mydfm <- dfm(mytokens) #generate a frequency matrix

library(ggplot2) #generate a word frequency graph (now only for characters)


wfreq <-topfeatures(mydfm, 10) %>% as.data.frame() #10 most frequently occurring characters
wfreq$n <- as.numeric(wfreq$.)
wfreq$word <- row.names(wfreq)
ggplot(wfreq, aes(x = reorder(word, n, function(n) -n), y = n)) + geom_bar(stat = "identity") + xlab('')

#Analysis - Co-character Analysis


library(quanteda.textplots)
myfcm <- fcm(mydfm) #generate a co-word matrix
dim(myfcm)

feat <- names(topfeatures(myfcm, 50)) #reduce the size of the co-word matrix
myselect <- fcm_select(myfcm, pattern = feat, selection = "keep")
dim(myselect)

size <- log(colSums(dfm_select(mydfm, feat, selection = "keep"))) #calculate weight of co-word connections
set.seed(112)
textplot_network(myselect, vertex_size = size / max(size) * 3) #generate a co-word network

netw <- as.igraph(myfcm) #"export" to igraph

library(igraph)
degr <- as.data.frame(degree(netw)) #degree centrality
btwn <- as.data.frame(betweenness(netw)) #betweenness centrality
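To see which characters are most central, the two one-column tables can be sorted (a small sketch based on the objects created above):

head(degr[order(-degr[,1]), , drop = FALSE], 10) #ten characters with the highest degree centrality
head(btwn[order(-btwn[,1]), , drop = FALSE], 10) #ten characters with the highest betweenness centrality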

## Session 4: Network Analysis with Patent Inventor Data (UM AEI 16 Dec 2023)

library(igraph)
download.file('https://pstek.nl/psfiles/patent-inv-example.zip', 'patent-inv-example.zip')
unzip('patent-inv-example.zip')
data <- read.csv('ipab20210107-inv.csv')
# Identify edges
net1 <- c(); net2 <- c()
for (n in 1:length(data[,1])) {
  pn <- data[n,1] #get patent/paper number
  cn <- data[n,6] #get region code
  if (n == length(data[,1])) {break}
  for (m in n:length(data[,1])) {
    m <- m + 1 #check if the next item(s) has/have the same patent number
    if (m > nrow(data)) {break}
    pm <- data[m,1]
    cm <- data[m,6]
    if (pm == pn) {
      net1 <- c(net1, cn)
      net2 <- c(net2, cm)
    } else {break}
  }
}
df <- data.frame(net1,net2)
df <- df[net1 != net2,]
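Before turning the edge list into a graph, a quick frequency count of the region pairs can be useful (a small sketch using the df built above):

head(sort(table(paste(df$net1, df$net2)), decreasing = TRUE), 10) #most frequent co-invention pairs of regions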

#dfmy <- subset(df, net1 == "MY" | net2 == "MY")


dfnlmy <- subset(df, net1 == "NL" | net2 == "NL" | net1 == "MY" | net2 == "MY") #keep only ties involving the Netherlands (NL) or Malaysia (MY)
df <- dfnlmy

netw <- graph.data.frame(df, directed= F) #load as a network

E(netw)$weight <- 1 #assign each tie a weight of 1 so that repeated ties can be summed in the next step
netw <- simplify(netw, edge.attr.comb="sum") #merge parallel edges, summing their weights

# Network analysis
degr <- as.data.frame(degree(netw)) #degree centrality
btwn <- as.data.frame(betweenness(netw)) #betweenness centrality
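The two measures can be combined into a single ranking table (a sketch; both data frames come from the same graph, so their row order matches):

centrality <- data.frame(region = rownames(degr), degree = degr[,1], betweenness = btwn[,1])
centrality <- centrality[order(-centrality$degree),] #sort regions by degree centrality
head(centrality, 10)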

netw <- delete.vertices(netw, which(degree(netw) < 10))


plot(netw)
plot(netw, vertex.label.color = 'black', vertex.label.cex=0.6, vertex.label.dist=1, vertex.color='white', vertex.size=degree(netw), edge.width=E(netw)$weight, layout=layout.fruchterman.reingold)
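The network picture can also be written to a file with the same jpeg()/dev.off() pattern used for the co-word network earlier (the file name is just an example):

jpeg('patent-network.jpg', width = 500, height = 500) #save the picture (open the file)
plot(netw, vertex.label.color = 'black', vertex.label.cex=0.6, vertex.label.dist=1, vertex.color='white', vertex.size=degree(netw), edge.width=E(netw)$weight, layout=layout.fruchterman.reingold)
dev.off() #close the file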
[My own exercise] Program for scraping China Daily web pages for the keyword "poverty":

# Session 1 & 2: Google News Article Scraping (UM AEI 16 Dec 2023)

## SESSION 1: WEB SCRAPING

#Write a query...
query <- 'Poverty,location:China site:Chinadaily.com.cn'

#Then we 'rephrase it' so that we can extract the articles via RSS
url <- paste0('https://news.google.com/rss/search?q=', URLencode(query, reserved= T), '&hl=en-MY&gl=MY&ceid=MY:en')

library(tidyRSS) #load package to clean up RSS feeds


articles <- tidyfeed(url) #download RSS feed from Google News (maximum 100 articles per request)
#If you want more than 100 articles, do multiple queries and then use rbind() to combine them

#STOP: check your articles...


titles <- articles$item_title #save titles for further analysis

## SESSION 2: CONTENT/TEXT ANALYSIS

#Load the corpus


library(quanteda)
mycorpus <- corpus(titles) #or load titles

#Clean and standardize the corpus (generate tokens)


mytokens <- tokens(mycorpus, remove_punct = TRUE, remove_numbers = TRUE) #remove numbers and punctuation
mytokens <- tokens_select(mytokens, stopwords('en', source='stopwords-iso'), selection='remove') #remove stopwords
mytokens <- tokens_wordstem(mytokens) #condense words to wordstems
mytokens <- tokens_tolower(mytokens) #make everything lower-case
mytokens <- tokens_select(mytokens, c('chinadaily.com.cn', 'china', 'daili', 'mountain'), selection='remove') #remove other words (irrelevant terms to drop after scraping) [Note: do not run too many queries at once, otherwise the charts may end up with no words]
mydfm <- dfm(mytokens) #generate a frequency matrix

#Analysis - Word Frequency


wfreq <-topfeatures(mydfm, 100) %>% as.data.frame() #100 most frequently occurring words (topics)

library(ggplot2) #generate a word frequency graph


wfreq <-topfeatures(mydfm, 10) %>% as.data.frame() #10 most frequently occurring words (topics)
wfreq$n <- as.numeric(wfreq$.)
wfreq$word <- row.names(wfreq)
ggplot(wfreq, aes(x = reorder(word, n, function(n) -n), y = n)) + geom_bar(stat = "identity") + xlab('')

library(quanteda.textplots) #generate a word frequency cloud


set.seed(123); textplot_wordcloud(mydfm, max_words = 100)

#Analysis - Co-word Analysis


myfcm <- fcm(mydfm) #generate a co-word matrix
dim(myfcm)

feat <- names(topfeatures(myfcm, 50)) #reduce the size of the co-word matrix
myselect <- fcm_select(myfcm, pattern = feat, selection = "keep")
dim(myselect)

size <- log(colSums(dfm_select(mydfm, feat, selection = "keep"))) #calculate weight of co-word connections
set.seed(112)
textplot_network(myselect, min_freq = 0.8, vertex_size = size / max(size) * 3) #generate a co-word network

jpeg('cowords.jpg', width = 500, height = 500) #save the picture (open the file)
textplot_network(myselect, min_freq = 0.8, vertex_size = size / max(size) * 3) #generate a co-word network
dev.off() #close the file
Basic introductory notes on the R language:
