爬取 Stack Overflow 上有关 Python 的问题

给定起始页面以及爬取页数，要求得到每一个问题的标题、票数、回答数、浏览数

#' Scrape one listing page of Python-tagged Stack Overflow questions
#'
#' Requests the `python` tag listing sorted by votes with 15 questions per
#' page, then extracts four fields from the returned HTML.
#'
#' @param page A single page number of the listing to fetch.
#' @return A named list with elements `title` (character) and `vote`,
#'   `answer`, `views` (numeric). Each element has one entry per node matched
#'   by its selector; `views` is `NA` where the `title` attribute is missing
#'   or contains no digits.
stackflow <- function(page) {
  stopifnot(length(page) == 1L, !is.na(page))
  # Fail loudly when rvest is missing; require() would only return FALSE and
  # let the function die later with an unrelated "could not find function".
  if (!requireNamespace("rvest", quietly = TRUE)) {
    stop("Package 'rvest' is required by stackflow().", call. = FALSE)
  }

  url <- "http://stackoverflow.com/questions/tagged/"
  page_url <- paste0(
    url, "python?page=", as.character(page), "&sort=votes&pagesize=15"
  )
  html <- rvest::read_html(page_url)

  # Text content of every node matched by a CSS selector or an XPath query.
  node_text <- function(...) {
    rvest::html_text(rvest::html_nodes(html, ...))
  }

  # The view count is read from the `title` attribute of the third stats
  # <div>; keep the first run of digits/commas and drop the separators.
  view_nodes <- rvest::html_nodes(
    html, xpath = "//div[@class='statscontainer']/div[3]"
  )
  view_titles <- rvest::html_attr(view_nodes, "title")
  hits <- regexpr("[0-9,]+", view_titles)
  matched <- !is.na(hits) & hits > 0L
  # regmatches() returns only the successful matches, so write them back by
  # position to keep `views` aligned with the other fields.
  view_digits <- rep(NA_character_, length(view_titles))
  view_digits[matched] <- regmatches(view_titles, hits)

  list(
    title = node_text(xpath = "//div[@class='summary']/h3"),
    vote = as.numeric(node_text(".vote-count-post")),
    answer = as.numeric(
      node_text(xpath = "//div[@class='stats']/div[2]/strong")
    ),
    views = as.numeric(gsub(",", "", view_digits, fixed = TRUE))
  )
}
# Scrape listing pages 1 through 5; the result holds one list per page.
lapply(seq_len(5), stackflow)
posted @ 2018-01-18 15:58  嘻呵呵  阅读(598)  评论(0)  编辑  收藏  举报