R Language Basics: Introductory Commands (Tips)

This document provides code for web scraping articles from China Daily about poverty in China, analyzing the text content, and generating network graphs of character connections in the novel "A Tale of Two Cities". It includes code for scraping RSS feeds from Google News, cleaning text corpora, analyzing word frequencies, and generating co-occurrence networks. Examples of network centrality measures like degree and betweenness centrality are also calculated on patent inventor data and character networks.


Reading: R for Data Science

Using R programs to scrape keyword data from web pages

Areas and ways in which the R language is used

(These points can be used in the background section of a paper.)

Instructor's in-class code notes (original notes):
for (n in 1:5) { #This is my first loop
  print(n)
  print(paste0("This is sentence number ", n))
}

3+3

install.packages("tidyRSS")
install.packages("rvest")

options(timeout=600)
download.file('https://pstek.nl/psfiles/two-cities.zip', 'two-cities.zip')
unzip('two-cities.zip')

setwd("~/aei-test")

data <- read.csv("two-cities-characters.csv")
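A quick inspection of the character table might look like this (a minimal check; the file is assumed to contain the name and label columns used later in Session 3):

head(data) #show the first few character names and their labels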

<html>
<body>
<h1>This is a heading</h1>
<p>This is a paragraph</p>
<a href= "https://asb.edu.my">ASB website</a>

</body>
</html>

# Session 1 & 2: Google News Article Scraping (UM AEI 16 Dec 2023)

## SESSION 1: WEB SCRAPING

# Setting things up...


setwd('~/aei-test') #this is the working directory

#Write a query...
query <- 'milei site:batimes.com.ar'

#Then we 'rephrase it' so that we can extract the articles via RSS
url <- paste0('https://news.google.com/rss/search?q=', URLencode(query, reserved= T), '&hl=en-MY&gl=MY&ceid=MY:en')

library(tidyRSS) #load package to clean up RSS feeds


articles <- tidyfeed(url) #download RSS feed from Google News (maximum 100 articles per request)
#If you want more than 100 articles, do multiple queries and then use rbind() to combine them
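For instance, a second query could be downloaded and stacked onto the first (a minimal sketch; the second query string below is only an illustrative assumption):

query2 <- 'inflation site:batimes.com.ar' #hypothetical second query
url2 <- paste0('https://news.google.com/rss/search?q=', URLencode(query2, reserved= T), '&hl=en-MY&gl=MY&ceid=MY:en')
articles2 <- tidyfeed(url2) #download the second RSS feed
articles <- rbind(articles, articles2) #stack both result sets into one data frame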

#STOP: check your articles...
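A quick check might look like this (a small sketch using the item_title column that tidyRSS returns):

nrow(articles) #number of articles returned (at most 100)
head(articles$item_title) #peek at the first few headlines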


titles <- articles$item_title #save titles for further analysis

#NEXT: Scrape the paragraphs of the first 5 articles...


library(rvest) #load package to scrape HTML
library(httr) #alternative package using the 'GET' command

paragraphs <- c() #this is an empty vector in which we will store our paragraphs
for (n in 1:5) { #this is a for-loop
  #html <- read_html(articles$item_link[n]) #load the website and 'decode' it (time-out after 10 seconds)
  html <- read_html(GET(articles$item_link[n], timeout(30))) #alternative to the line above, allowing for a longer time-out
  url <- html_text(html_nodes(html, 'a')) #obtain the URL that Google News forwards to (marked by <a> in the HTML)
  html <- read_html(GET(url, timeout(30))) #alternative code as above, allowing for a longer time-out
  para <- html_text(html_nodes(html, 'p')) #select the paragraphs (marked by <p> in the HTML)
  paragraphs <- c(paragraphs, para) #add the newly scraped paragraphs to the vector of paragraphs
  Sys.sleep(10) #take a break (10 seconds)
}

#Now, do some cleaning of the data


para_clean <- as.data.frame(table(paragraphs)); para_clean <- subset(para_clean, Freq < 2) #remove repeated paragraphs
write.csv(para_clean, 'news-corpus.csv', row.names= F) #write your paragraphs to a file (CSV)
#manually remove lines at the top of the CSV (containing dates, key words, advertisements, etc.)
para_clean <- read.csv('news-corpus.csv') #re-load the corpus after cleaning it manually
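As a complement to the manual step above, very short paragraphs (often menus, bylines or cookie notices) can be dropped programmatically first; the 100-character cut-off below is an arbitrary assumption and may need tuning for each website:

para_clean <- subset(para_clean, nchar(as.character(paragraphs)) > 100) #keep only paragraphs longer than 100 characters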

Other example queries:

cabinet reshuffle location:malaysia site:malaymail.com
location:malaysia site:chinadaily.com.cn
poverty location:kenya site:nation.africa
blackpink site:allkpop.com

## SESSION 2: CONTENT/TEXT ANALYSIS

#Load the corpus


library(quanteda)
para_clean$paragraphs <- as.character(para_clean$paragraphs) #make sure paragraphs are 'characters'
#mycorpus <- corpus(para_clean$paragraphs) #load paragraphs
mycorpus <- corpus(titles) #or load titles

#Clean and standardize the corpus (generate tokens)


mytokens <- tokens(mycorpus, remove_punct = TRUE, remove_numbers = TRUE) #remove numbers and punctuation
mytokens <- tokens_select(mytokens, stopwords('en', source='stopwords-iso'), selection='remove') #remove stopwords
mytokens <- tokens_wordstem(mytokens) #condense words to wordstems
mytokens <- tokens_tolower(mytokens) #make everything lower-case
#mytokens <- tokens_select(mytokens, c('milei', 'milei\'s', 'javier', 'buenos', 'aires', 'times', 'argentina'), selection='remove') #remove other words
mydfm <- dfm(mytokens) #generate a frequency matrix

#Analysis - Word Frequency


wfreq <-topfeatures(mydfm, 100) %>% as.data.frame() #100 most frequently occurring words (topics)

library(ggplot2) #generate a word frequency graph


wfreq <-topfeatures(mydfm, 10) %>% as.data.frame() #10 most frequently occurring words (topics)
wfreq$n <- as.numeric(wfreq$.)
wfreq$word <- row.names(wfreq)
ggplot(wfreq, aes(x = reorder(word, n, function(n) -n), y = n)) + geom_bar(stat = "identity") + xlab('')

library(quanteda.textplots) #generate a word frequency cloud


set.seed(123); textplot_wordcloud(mydfm, max_words = 100)

#Analysis - Co-word Analysis


myfcm <- fcm(mydfm) #generate a co-word matrix
dim(myfcm)

feat <- names(topfeatures(myfcm, 50)) #reduce the size of the co-word matrix
myselect <- fcm_select(myfcm, pattern = feat, selection = "keep")
dim(myselect)

size <- log(colSums(dfm_select(mydfm, feat, selection = "keep"))) #calculate weight of co-word connections
set.seed(112)
textplot_network(myselect, min_freq = 0.8, vertex_size = size / max(size) * 3) #generate a co-word network

jpeg('cowords.jpg', width = 500, height = 500) #save the picture (open the file)
textplot_network(myselect, min_freq = 0.8, vertex_size = size / max(size) * 3) #generate a co-word network
dev.off() #close the file

## Session 3: Network Analysis on the Characters of "A Tale of Two Cities" (UM AEI 16 Dec 2023)

download.file('https://pstek.nl/psfiles/two-cities.zip', 'two-cities.zip') #Original text from https://www.gutenberg.org/cache/epub/98/pg98.txt
unzip('two-cities.zip')

## STEP 1: GENERATING THE CORPUS

# Loading the file as a single vector (maybe not a good idea with a very large text...)
filepath <- 'a_tale_of_two_cities.txt'
all_lines <- c()
con <- file(filepath, "r")
while (TRUE) {
  line <- readLines(con, n = 1)
  if (length(line) == 0) {
    break
  }
  all_lines <- c(all_lines, line)
}
close(con)
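For a text of this size, the same vector can also be built in a single call instead of the line-by-line loop (a simpler sketch with the same result):

all_lines <- readLines(filepath) #read the whole file at once into a character vector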

# Combining chapters and their paragraphs into a single line


chap <- c()
para <- c()
for (n in all_lines) {
  if (grepl('CHAPTER', n, ignore.case= F)) {
    print(n)
    add_para <- paste(para, collapse = ' ')
    chap <- c(chap, add_para)
    para <- c()
  }
  para <- c(para, n)
}

df <- data.frame('chap'= chap[47:90]) #select only the chapters (not the introductions, etc.)

## STEP 2: IDENTIFY THE CHARACTERS IN THE CORPUS


#Replace names with character labels
characters <- read.csv('two-cities-characters.csv') #Based on: https://www.sparknotes.com/lit/a-tale-of-two-cities/characters/
for (n in 1:nrow(characters)) {
  df$chap <- gsub(characters$name[n], characters$label[n], df$chap)
}

#Isolate the characters: the regular expression keeps only the character labels and deletes every other character in the chapter text
df$characters <- gsub(paste0('(', paste0(characters$label, collapse= '|'), ')|.'), "\\1", df$chap)

library(quanteda)
mycorpus <- corpus(df$characters) #load as corpus
mytokens <- tokens(mycorpus) #generate tokens
mydfm <- dfm(mytokens) #generate a frequency matrix

library(ggplot2) #generate a word frequency graph (now only for characters)


wfreq <-topfeatures(mydfm, 10) %>% as.data.frame() #10 most frequently occurring characters
wfreq$n <- as.numeric(wfreq$.)
wfreq$word <- row.names(wfreq)
ggplot(wfreq, aes(x = reorder(word, n, function(n) -n), y = n)) + geom_bar(stat = "identity") + xlab('')

#Analysis - Co-character Analysis


library(quanteda.textplots)
myfcm <- fcm(mydfm) #generate a co-word matrix
dim(myfcm)

feat <- names(topfeatures(myfcm, 50)) #reduce the size of the co-word matrix
myselect <- fcm_select(myfcm, pattern = feat, selection = "keep")
dim(myselect)

size <- log(colSums(dfm_select(mydfm, feat, selection = "keep"))) #calculate weight of co-word connections
set.seed(112)
textplot_network(myselect, vertex_size = size / max(size) * 3) #generate a co-word network

netw <- as.igraph(myfcm) #"export" to igraph

library(igraph)
degr <- as.data.frame(degree(netw)) #degree centrality
btwn <- as.data.frame(betweenness(netw)) #betweenness centrality
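To see which characters are most central, the two one-column tables can be sorted (a small sketch based on the objects created above):

head(degr[order(-degr[,1]), , drop = FALSE], 10) #ten characters with the highest degree centrality
head(btwn[order(-btwn[,1]), , drop = FALSE], 10) #ten characters with the highest betweenness centrality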

## Session 4: Network Analysis with Patent Inventor Data (UM AEI 16 Dec 2023)

library(igraph)
download.file('https://pstek.nl/psfiles/patent-inv-example.zip', 'patent-inv-example.zip')
unzip('patent-inv-example.zip')
data <- read.csv('ipab20210107-inv.csv')
# Identify edges
net1 <- c(); net2 <- c()
for (n in 1:length(data[,1])) {
  pn <- data[n,1] #get patent/paper number
  cn <- data[n,6] #get region code
  if (n == length(data[,1])) {break}
  for (m in n:length(data[,1])) {
    m <- m + 1 #check if the next item(s) has/have the same patent number
    if (m > nrow(data)) {break}
    pm <- data[m,1]
    cm <- data[m,6]
    if (pm == pn) {
      net1 <- c(net1, cn)
      net2 <- c(net2, cm)
    } else {break}
  }
}
df <- data.frame(net1,net2)
df <- df[net1 != net2,]
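Before turning the edge list into a graph, a quick frequency count of the region pairs can be useful (a small sketch using the df built above):

head(sort(table(paste(df$net1, df$net2)), decreasing = TRUE), 10) #most frequent co-invention pairs of regions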

#dfmy <- subset(df, net1 == "MY" | net2 == "MY")


dfnlmy <- subset(df, net1 == "NL" | net2 == "NL" | net1 == "MY" | net2 == "MY") #keep only ties involving the Netherlands (NL) or Malaysia (MY)
df <- dfnlmy

netw <- graph.data.frame(df, directed= F) #load as a network

E(netw)$weight <- 1 #assign each tie a weight of 1 so that repeated ties can be summed in the next step
netw <- simplify(netw, edge.attr.comb="sum") #merge parallel edges, summing their weights

# Network analysis
degr <- as.data.frame(degree(netw)) #degree centrality
btwn <- as.data.frame(betweenness(netw)) #betweenness centrality
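The two measures can be combined into a single ranking table (a sketch; both data frames come from the same graph, so their row order matches):

centrality <- data.frame(region = rownames(degr), degree = degr[,1], betweenness = btwn[,1])
centrality <- centrality[order(-centrality$degree),] #sort regions by degree centrality
head(centrality, 10)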

netw <- delete.vertices(netw, which(degree(netw) < 10))


plot(netw)
plot(netw, vertex.label.color = 'black', vertex.label.cex=0.6, vertex.label.dist=1, vertex.color='white', vertex.size=degree(netw), edge.width=E(netw)$weight, layout=layout.fruchterman.reingold)
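The network picture can also be written to a file with the same jpeg()/dev.off() pattern used for the co-word network earlier (the file name is just an example):

jpeg('patent-network.jpg', width = 500, height = 500) #save the picture (open the file)
plot(netw, vertex.label.color = 'black', vertex.label.cex=0.6, vertex.label.dist=1, vertex.color='white', vertex.size=degree(netw), edge.width=E(netw)$weight, layout=layout.fruchterman.reingold)
dev.off() #close the file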
[My own exercise] Program for scraping China Daily web pages for the keyword "poverty":

# Session 1 & 2: Google News Article Scraping (UM AEI 16 Dec 2023)

## SESSION 1: WEB SCRAPING

#Write a query...
query <- 'Poverty,location:China site:Chinadaily.com.cn'

#Then we 'rephrase it' so that we can extract the articles via RSS
url <- paste0('https://news.google.com/rss/search?q=', URLencode(query, reserved= T), '&hl=en-MY&gl=MY&ceid=MY:en')

library(tidyRSS) #load package to clean up RSS feeds


articles <- tidyfeed(url) #download RSS feed from Google News (maximum 100 articles per request)
#If you want more than 100 articles, do multiple queries and then use rbind() to combine them

#STOP: check your articles...


titles <- articles$item_title #save titles for further analysis

## SESSION 2: CONTENT/TEXT ANALYSIS

#Load the corpus


library(quanteda)
mycorpus <- corpus(titles) #or load titles

#Clean and standardize the corpus (generate tokens)


mytokens <- tokens(mycorpus, remove_punct = TRUE, remove_numbers = TRUE) #remove numbers and punctuation
mytokens <- tokens_select(mytokens, stopwords('en', source='stopwords-iso'), selection='remove') #remove stopwords
mytokens <- tokens_wordstem(mytokens) #condense words to wordstems
mytokens <- tokens_tolower(mytokens) #make everything lower-case
mytokens <- tokens_select(mytokens, c('chinadaily.com.cn', 'china', 'daili', 'mountain'), selection='remove') #remove other words (irrelevant terms to drop after scraping) [Note: do not run too many queries at once, otherwise the charts may end up with no words]
mydfm <- dfm(mytokens) #generate a frequency matrix

#Analysis - Word Frequency


wfreq <-topfeatures(mydfm, 100) %>% as.data.frame() #100 most frequently occurring words (topics)

library(ggplot2) #generate a word frequency graph


wfreq <-topfeatures(mydfm, 10) %>% as.data.frame() #10 most frequently occurring words (topics)
wfreq$n <- as.numeric(wfreq$.)
wfreq$word <- row.names(wfreq)
ggplot(wfreq, aes(x = reorder(word, n, function(n) -n), y = n)) + geom_bar(stat = "identity") + xlab('')

library(quanteda.textplots) #generate a word frequency cloud


set.seed(123); textplot_wordcloud(mydfm, max_words = 100)

#Analysis - Co-word Analysis


myfcm <- fcm(mydfm) #generate a co-word matrix
dim(myfcm)

feat <- names(topfeatures(myfcm, 50)) #reduce the size of the co-word matrix
myselect <- fcm_select(myfcm, pattern = feat, selection = "keep")
dim(myselect)

size <- log(colSums(dfm_select(mydfm, feat, selection = "keep"))) #calculate weight of co-word connections
set.seed(112)
textplot_network(myselect, min_freq = 0.8, vertex_size = size / max(size) * 3) #generate a co-word network

jpeg('cowords.jpg', width = 500, height = 500) #save the picture (open the file)
textplot_network(myselect, min_freq = 0.8, vertex_size = size / max(size) * 3) #generate a co-word network
dev.off() #close the file
Basic introductory notes on the R language:
