R自动数据收集第一章概述——《List of World Heritage in Danger》
library(stringr)
library(XML)
library(maps)
heritage_parsed <- htmlParse("http://en.wikipedia.org/wiki/List_of_World_Heritage_in_Danger",encoding = "UTF-8")
Error: failed to load external entity "
page 2
Credit: Suryapratim Sarkar (2015-06-25)
Wikipedia changed its server communication from HTTP to HTTPS. As a result, the following lines on page 2 return an error:(的确,如果你直接粘贴原http协议的地址,浏览器的地址栏会自动帮你转成https)
heritage_parsed <- htmlParse("http://en.wikipedia.org/wiki/List_of_World_Heritage_in_Danger",
encoding = "UTF-8")
## Error: failed to load external entity "http://en.wikipedia.org/wiki/List_of_World_Heritage_in_Danger"
There are at least two solutions to the problem:
Use
getURL()
and specify the location of CA signatures (see Section 9.1.7 of our book).Use Hadley Wickham’s
rvest
package, which came out after our book was published. It facilitates scraping with R considerably, in particular in such scenarios(情形、脚本). In this specific example, use the following code instead:
library(rvest) # the new package, version 0.3.0
heritage_parsed <- read_html("http://en.wikipedia.org/wiki/List_of_World_Heritage_in_Danger", encoding = "UTF-8") # read_html() from the rvest package is the new htmlParse() from the XML package
tables <- html_table(heritage_parsed, fill = TRUE) # html_table() from the rvest package, which replaces readHTMLTable() from the XML package
From thereon, the rest of the chapter code should work. If you want to learn more about the rvest package, have a look here. We are planning to cover it extensively in the next edition of our book.
heritage_parsed <- read_html(
"http://en.wikipedia.org/wiki/List_of_World_Heritage_in_Danger", encoding = "UTF-8")
# str(heritage_parsed)
#我曾经尝试将heritage_parsed写到本地文件,但是失败了
#无法将list通过cat()输入到本地,查阅了sof,但是还是没有解决
#从heritage_parsed中取出所有的表格
tables <- html_table(heritage_parsed, fill = TRUE)
#所以,我怎么知道这是第二个表格呢?
danger_table <- tables[[2]]
#查看表格的列名
names(danger_table)
# [1] "Name" "Image" "Location"
# [4] "Criteria" "Area\nha (acre)" "Year (WHS)"
# [7] "Endangered" "Reason" "Refs"
#感兴趣的只有表格中的1, 3, 4, 6, 7列
#名字、位置、标准分类、申遗成功时间、何时定为处于危险的时间
#覆盖掉原来的表格
danger_table <- danger_table[, c(1, 3, 4, 6, 7)]
#为了更方便,对列重命名,
colnames(danger_table) <- c("name", "locn", "crit", "yins", "yend")
#取几个遗产的名字查看下
danger_table$name[1:3]
#str_detect是stringr包中的函数
#用于在前一个参数中按照后一个参数的规则查找是否存在特定字符
#返回值是逻辑值向量
#对crit列按照自然和人文遗产重新编码
#Natural景观遗产编码为nat,否则为culture,即cul
danger_table$crit <- ifelse(
str_detect(danger_table$crit, "Natural") ==TRUE,
"nat",
"cult"
)
fruit <- c("apple", "banana", "pear", "pinapple")
str_detect(fruit, "a")
# [1] TRUE TRUE TRUE TRUE
# 将申遗成功时间转为数值类型
danger_table$yins <- as.numeric(danger_table$yins)
#按照正则表达式取出(我们只取出濒危时间,去掉横杠,并转化为数值型)
yend_clean <- unlist(
str_extract_all(danger_table$yend, "[[:digit:]]4$")
)
yend_clean1 <- unlist(str_extract_all(danger_table$yend, "\\d{4}-"))
yend_clean <- unlist(str_extract_all(yend_clean1, "\\d{4}"))
length(yend_clean)
length(danger_table$yend)
danger_table$yend <- as.numeric(yend_clean)
danger_table$yend[1:3]
#[1] 2001 1992 2013
shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
str_extract(shopping_list, "\\d")
# [1] "4" NA NA "2"
str_extract(shopping_list, "[a-z]+")
# [1] "apples" "bag" "bag" "milk"
str_extract_all(shopping_list, "[a-z]+")
# [[1]]
# [1] "apples" "x"
#
# [[2]]
# [1] "bag" "of" "flour"
#
# [[3]]
# [1] "bag" "of" "sugar"
#
# [[4]]
# [1] "milk" "x"
danger_table$locn[c(1, 3, 5)]
[1] "EgyAbusir, Egypt30°50′30″N 29°39′50″E / 30.84167°N 29.66389°E / 30.84167; 29.66389 (Abu Mena)"
[2] "Syria !Aleppo Governorate, Syria36°14′0″N 37°10′0″E / 36.23333°N 37.16667°E / 36.23333; 37.16667 (Ancient City of Aleppo)"
[3] "Syria !Damascus Governorate, Syria33°30′41″N 36°18′23″E / 33.51139°N 36.30639°E / 33.51139; 36.30639 (Ancient City of Damascus)"
reg_y <- "[/][ -]*[[:digit:]]*[.]*[[:digit:]]*[;]"
reg_x <- "[;][ -]*[[:digit:]]*[.]*[[:digit:]]*"
y_coords <- str_extract(danger_table$locn, reg_y)
y_coords
[1] "/ 30.84167;" "/ 18.283;" "/ 36.23333;"
[4] "/ 32.51806;" "/ 33.51139;" "/ 36.33417;"
[7] "/ 32.82500;" "/ 32.63833;" "/ 32.80528;"
[10] "/ 35.45667;" "/ 42.26222;" "/ 17.317;"
[13] "/ -8.11111;" "/ 31.70444;" "/ 9.167;"
[16] "/ 11.417;" "/ 34.78167;" "/ 34.83194;"
[19] "/ -11.68306;" "/ 25.317;" "/ 9.55389;"
[22] "/ 4.000;" "/ 35.58806000;" "/ 39.05000;"
[25] "/ 14.200;" "/ -20.20833;" "/ -2.500;"
[28] "/ 53.40667;" "/ 9.000;" "/ 34.39667;"
[31] "/ 42.66111;" "/ 7.600;" "/ 6.83972;"
[34] "/ 13.000;" "/ 2.000;" "/ 31.77667;"
[37] "/ 15.35556;" "/ 30.13333;" "/ 13.90639;"
[40] "/ 15.92694000;" "/ -14.467;" "/ 15.74444;"
[43] "/ 24.83333;" "/ -8.95778;" "/ -2.000;"
[46] "/ 34.200;" "/ 13.183;" "/ 34.55417;"
[49] "/ 16.77333;" "/ 16.2893333;" "/ 0.32917;"
[52] "/ -2.500;" "/ 0.917;" "/ 31.71972;"
y_coords <- as.numeric(str_sub(y_coords, 3, -2))
danger_table$y_coords <- y_coords
x_coords <- str_extract(danger_table$locn, reg_x)
x_coords <- as.numeric(str_sub(x_coords, 3, -1))
danger_table$x_coords <- x_coords
danger_table$locn <- NULL
names(danger_table)
# [1] "name" "crit" "yins" "yend" "y_coords"
# [6] "x_coords"
round(danger_table$y_coords, 2)[1:3]
round(danger_table$x_coords, 2)[1:3]
#查看维度
dim(danger_table)
head(danger_table)
#对人文和自然景观设置不同的点的形状
pch <- ifelse(danger_table$crit == "nat", 10, 11)
#对分属人文和自然景观的遗产设置不同的颜色参数
cols <- ifelse(danger_table$crit == "nat", 'deepskyblue4', 'brown1')
#嗯,输入到本地文件,免得RStudio的绘图窗口太小
#注意宽高的调整
png("d:\\map.png",width = 1300, height = 720)
map("world", col = "maroon",
lwd = 0.5,
mar = c(0.1, 0.1, 0.1, 0.1),
bg='seashell')
points(danger_table$x_coords,
danger_table$y_coords,
pch = pch,
col=cols,
cex=1.2,lwd=1.3
)
#添加图例
#leg.txt中设置的是图例文本
leg.txt <- c("Natural", "Cultural")
#当取x=0,y=0的时候是位于图正中央
#text.col设置图例文字颜色
#cex设置图例区域大小
legend("topright", leg.txt, horiz = TRUE,
pch=c(10, 11),
col = c('deepskyblue4', 'brown1'),
text.col=c('deepskyblue4', 'brown1'),
cex =2.3)
#box是边框,其样式lty的参数有点意思哈
box(lty='1373', lwd =5,col = 'red')
#关闭输出设备
dev.off()
table(danger_table$crit)
#
# cult nat
# 37 17
library(RColorBrewer)
cols1=brewer.pal(8,'Set3')
par(bg='lightcyan')
hist(danger_table$yend,
freq = TRUE,
xlab = "Year when site was put on the list of endangered sites",
col=cols1,border = cols1,
main = "")
cols2=brewer.pal(8,'Paired')
par(bg='lightcyan')
hist(
danger_table$yend,
freq = TRUE,density = TRUE,
xlab = "Year when site was put on the list of endangered sites",
col=cols2,main = "")