R Language Basics: Introductory Commands (tips)
Using an R program to scrape keyword data from web pages
Fields and ways in which R is used
(can go in the background section of the paper)
Instructor's in-class code notes (original notes):
for (n in 1:5) { #This is my first loop
  print(n)
  print(paste0("This is sentence number ", n))
}
3+3
install.packages("tidyRSS")
install.packages("rvest")
options(timeout=600)
download.file('https://pstek.nl/psfiles/two-cities.zip', 'two-cities.zip')
unzip('two-cities.zip')
setwd("~/aei-test")
<html>
<body>
<h1>This is a heading</h1>
<p>This is a paragraph</p>
<a href= "https://asb.edu.my">ASB website</a>
</body>
</html>
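The HTML snippet above is what the scraping code below works with: rvest reads a page and html_nodes() picks out tags by name. A minimal sketch using that same snippet as an in-memory string (the tags 'p' and 'a' are the ones used later in the notes):
library(rvest)
page <- read_html('<html><body><h1>This is a heading</h1><p>This is a paragraph</p><a href="https://asb.edu.my">ASB website</a></body></html>')
html_text(html_nodes(page, 'p')) #returns "This is a paragraph"
html_attr(html_nodes(page, 'a'), 'href') #returns "https://asb.edu.my"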
# Session 1 & 2: Google News Article Scraping (UM AEI 16 Dec 2023)
#Write a query...
query <- 'milei site:batimes.com.ar'
#Then we 'rephrase it' so that we can extract the articles via RSS
url <- paste0('https://news.google.com/rss/search?q=', URLencode(query, reserved = T), '&hl=en-MY&gl=MY&ceid=MY:en')
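The loop below reads article links from an object called articles, which is not created in these notes; a minimal sketch, assuming the feed is parsed with tidyRSS (whose tidyfeed() normally returns an item_link column) and that rvest and httr are loaded for read_html() and GET():
library(tidyRSS)
library(rvest)
library(httr)
articles <- tidyfeed(url) #parse the Google News RSS feed into a data frame, one row per article
nrow(articles) #check how many articles were found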
paragraphs <- c() #this is an empty vector in which we will store our paragraphs
for(n in 1:5){ #this is a for-loop
  #html <- read_html(articles$item_link[n]) #load the website and 'decode' it (times out after 10 seconds)
  html <- read_html(GET(articles$item_link[n], timeout(30))) #alternative to the line above, allowing a longer time-out
  url <- html_text(html_nodes(html, 'a')) #obtain the URL that Google News forwards to (marked by <a> in the HTML)
  html <- read_html(GET(url, timeout(30))) #load the forwarded article page, again with a longer time-out
  para <- html_text(html_nodes(html, 'p')) #select the paragraphs (marked by <p> in the HTML)
  paragraphs <- c(paragraphs, para) #add newly scraped paragraphs to the vector of paragraphs
  Sys.sleep(10) #take a break (10 seconds)
}
#Other example queries:
#location:malaysia site:chinadaily.com.cn
#poverty location:kenya site:nation.africa
#blackpink site:allkpop.com
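The co-word code below uses mydfm and myfcm, which the notes above never build; a minimal quanteda sketch, assuming the scraped paragraphs are the input text and a 5-word co-occurrence window (both are assumptions, not the instructor's exact settings):
library(quanteda)
library(quanteda.textplots) #provides textplot_network() in recent quanteda versions
mycorpus <- corpus(paragraphs) #load the scraped paragraphs as a corpus
mytokens <- tokens(mycorpus, remove_punct = TRUE) #tokenise, dropping punctuation
mytokens <- tokens_remove(mytokens, stopwords('en')) #drop common English stopwords
mydfm <- dfm(mytokens) #document-feature (frequency) matrix
myfcm <- fcm(mytokens, context = 'window', window = 5) #feature co-occurrence (co-word) matrix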
feat <- names(topfeatures(myfcm, 50)) #reduce the size of the co-word matrix
myselect <- fcm_select(myfcm, pattern = feat, selection = "keep")
dim(myselect)
size <- log(colSums(dfm_select(mydfm, feat, selection = "keep"))) #calculate weight of co-word connections
set.seed(112)
textplot_network(myselect, min_freq = 0.8, vertex_size = size / max(size) * 3) #generate a co-word network
jpeg('cowords.jpg', width = 500, height = 500) #save the picture (open the file)
textplot_network(myselect, min_freq = 0.8, vertex_size = size / max(size) * 3) #generate a co-word network
dev.off() #close the file
## Session 3: Network Analysis on the Characters of "A Tale of Two Cities" (UM AEI 16 Dec 2023)
# Loading the file as a single vector (maybe not a good idea with a very large text...)
filepath <- 'a_tale_of_two_cities.txt'
all_lines <- c()
con <- file(filepath, "r")
while (TRUE) {
  line <- readLines(con, n = 1) #read one line at a time
  if (length(line) == 0) { #an empty result means the end of the file
    break
  }
  all_lines <- c(all_lines, line)
}
close(con)
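The next lines use chap (one string per section of the book) and a characters data frame with a label column, neither of which is defined in these notes. A purely hypothetical sketch of how they could be built; the split marker, the character names, and the 47:90 index range in the original all depend on how the text was actually segmented:
chap <- strsplit(paste(all_lines, collapse = ' '), 'CHAPTER')[[1]] #split the full text on an assumed chapter marker
characters <- data.frame(label = c('Darnay', 'Carton', 'Lucie', 'Manette', 'Defarge')) #hypothetical subset of character names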
df <- data.frame('chap'= chap[47:90]) #select only the chapters (not the introductions, etc.)
#Isolate characters
df$characters <- gsub(paste0('(', paste0(characters$label, collapse = '|'), ')|.'), "\\1", df$chap) #keep the matched name (group 1); any other single character is deleted
library(quanteda)
mycorpus <- corpus(df$characters) #load as corpus
mytokens <- tokens(mycorpus) #generate tokens
mydfm <- dfm(mytokens) #generate a frequency matrix
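As in the earlier session, myfcm is used below but never created; a minimal sketch, assuming co-occurrence is counted within each chapter (document):
myfcm <- fcm(mytokens, context = 'document') #character co-occurrence matrix, counted per chapter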
feat <- names(topfeatures(myfcm, 50)) #reduce the size of the co-word matrix
myselect <- fcm_select(myfcm, pattern = feat, selection = "keep")
dim(myselect)
size <- log(colSums(dfm_select(mydfm, feat, selection = "keep"))) #calculate weight of co-word connections
set.seed(112)
textplot_network(myselect, vertex_size = size / max(size) * 3) #generate a co-word network
library(igraph)
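degree() and betweenness() below expect an igraph object netw, which these notes never construct; a minimal sketch, assuming the network is built from the reduced co-word matrix myselect:
netw <- graph_from_adjacency_matrix(as.matrix(myselect), mode = 'undirected', weighted = TRUE, diag = FALSE) #convert the co-word matrix into an igraph network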
degr <- as.data.frame(degree(netw)) #degree centrality
btwn <- as.data.frame(betweenness(netw)) #betweenness centrality
## Session 4: Network Analysis with Patent Inventor Data (UM AEI 16 Dec 2023)
library(igraph)
download.file('https://pstek.nl/psfiles/patent-inv-example.zip', 'patent-inv-example.zip')
unzip('patent-inv-example.zip')
data <- read.csv('ipab20210107-inv.csv')
# Identify edges
net1 <- c(); net2 <- c()
for(n in 1:(nrow(data) - 1)){
  pn <- data[n,1] #get patent/paper number
  cn <- data[n,6] #get region code
  for(m in (n+1):nrow(data)){ #check whether the following row(s) have the same patent number
    pm <- data[m,1]
    cm <- data[m,6]
    if( pm == pn ){ #same patent: record a co-inventor edge between the two regions
      net1 <- c(net1, cn)
      net2 <- c(net2, cm)
    } else {break} #stop at the first row with a different patent number
  }
}
df <- data.frame(net1,net2)
df <- df[net1 != net2,] #drop self-loops (edges where both ends are the same region)
# Network analysis
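Here too netw is assumed rather than built; a minimal sketch turning the edge list df into an undirected igraph network:
netw <- graph_from_data_frame(df, directed = FALSE) #build the co-inventor region network from the edge list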
degr <- as.data.frame(degree(netw)) #degree centrality
btwn <- as.data.frame(betweenness(netw)) #betweenness centrality
# Session 1 & 2: Google News Article Scraping (UM AEI 16 Dec 2023)
#Write a query...
query <- 'Poverty,location:China site:Chinadaily.com.cn'
#Then we 'rephrase it' so that we can extract the articles via RSS
url <- paste0('https://news.google.com/rss/search?q=', URLencode(query, reserved= T), '&hl=en-
MY&gl=MY&ceid=MY:en')
R language basics notes: