R Big Data Analysis and Mining (4: Implementing a Crawler in R)
library("XML")
# Get all the tag links
url <- 'http://www.csdn.net/tag/'
i_url_parse <- htmlParse(url, encoding = "UTF-8")
xpath_1 <- "//div[@class='overflow']/a"
node <- getNodeSet(i_url_parse, xpath_1)
for (j in 1:length(node)) {
  value2 <- xmlGetAttr(node[[j]], name = 'href')  # href of each tag link
  print(value2)
}
Result:
The handler function in R:
handler <- function(url) {
  xpath_1 <- "//div[@class='line_list']/*"
  i_url_parse <- htmlParse(url, encoding = "UTF-8")
  node <- getNodeSet(i_url_parse, xpath_1)  # get all divs inside line_list
  for (j in 1:length(node)) {
    value <- xmlGetAttr(node[[j]], name = 'class')
    if (value == 'tit_list') {
      title <- xmlValue(node[[j]])  # the article title
    } else if (value == 'dwon_words') {
      xpath_2 <- "//div[@class='dwon_words']/span[@class='tag_source']/a"
      node1 <- getNodeSet(i_url_parse, xpath_2)
      for (k in 1:length(node1)) {
        value <- xmlGetAttr(node1[[k]], name = 'href')  # article source link
        text <- xmlValue(node1[[k]])                    # article source text
      }
    }
  }
Recursive call to handler (this block completes the function body above: it looks for the '下一页' link and recurses into the next page):
  xpath_3 <- "//div[@class='page_nav']/a[@class='pageliststy']"
  node3 <- getNodeSet(i_url_parse, xpath_3)
  for (m in 1:length(node3)) {
    value1 <- xmlValue(node3[[m]])
    if (value1 == '下一页') {  # '下一页' = "next page"
      next_url <- xmlGetAttr(node3[[m]], name = 'href')
      next_url <- paste('http://www.csdn.net', next_url, sep = "")  # make the href absolute
      print(next_url)
      Sys.sleep(25)  # throttle requests before recursing
      handler(next_url)
    }
  }
}
Standalone top-level code for crawling the news pages of one tag:
# Crawl the news pages
url <- 'http://www.csdn.net/tag/android/news'
i_url_parse <- htmlParse(url, encoding = "UTF-8")  # parse the starting page
xpath_3 <- "//div[@class='page_nav']/a[@class='pageliststy']"
node3 <- getNodeSet(i_url_parse, xpath_3)
for (m in 1:length(node3)) {
  value1 <- xmlValue(node3[[m]])
  if (value1 == '下一页') {
    next_url <- xmlGetAttr(node3[[m]], name = 'href')
    next_url <- paste('http://www.csdn.net', next_url, sep = "")  # make the href absolute
    print(next_url)
    handler(next_url)
  }
}
Code snippet 2:
rm(list = ls())
gc()
library(bitops)
library(RCurl)
library(curl)
library(XML)

# The 26 letters plus '#', the categories CSDN uses to group tags
alphabet <- c("A","B","C","D","E","F","G","H","I","J","K","L","M",
              "N","O","P","Q","R","S","T","U","V","W","X","Y","Z","#")
tags <- data.frame('tag' = NA)       # tags under one letter category
temp <- data.frame('tag_tmp' = NA)   # scratch space for one item's tag links
result <- data.frame('title' = NA, 'source' = NA, 'tags' = NA,
                     'author' = NA, 'letter' = NA, 'letter_tag' = NA)

x <- getURLContent("http://www.csdn.net/tag/", encoding = "UTF-8")
url_tag <- htmlParse(x, encoding = "UTF-8")
num <- 0
for (i in 1:27) {
  xpath_tag <- paste("/html/body/div/div/ul/li[", i, "]/div/div/a", sep = "")
  node_tag <- getNodeSet(url_tag, xpath_tag)
  m <- length(node_tag)
  print(paste("Letter category", alphabet[i], "has", m, "tags."))
  if (m == 0) {
    print(paste("No tag nodes found for letter category", alphabet[i], "!"))
  } else {
    for (j in 1:m) {
      tags[j, ] <- xmlValue(node_tag[[j]])
      for (k in 1:10000) {  # walk the news pages until a "not found" page appears
        url <- paste("http://www.csdn.net/tag/", tags[j, 1], "/news-", k, sep = "")
        y <- getURLContent(url, encoding = "UTF-8")
        url_news <- htmlParse(y, encoding = "UTF-8")
        node_not_exists <- getNodeSet(url_news, "//div[@class='not_search']")
        if (length(node_not_exists) != 0) {
          break
        } else {
          node_news <- getNodeSet(url_news, "//div[@class='line_list']")
          n <- length(node_news)
          if (n == 0) {
            print(paste("Tag", tags[j, 1], "of letter category", alphabet[i], "has no news."))
          } else {
            print(paste("Tag", tags[j, 1], "of letter category", alphabet[i],
                        "page", k, "has", n, "items."))
            for (p in 1:n) {  # loop over every news item actually on this page
              num <- num + 1
              node_title  <- getNodeSet(url_news, paste('/html/body/div/div/div[3]/div[2]/ul/li[', p, ']/div/a', sep = ""))
              node_source <- getNodeSet(url_news, paste('/html/body/div/div/div[3]/div[2]/ul/li[', p, ']/div/div/span[1]/a', sep = ""))
              node_tags   <- getNodeSet(url_news, paste('/html/body/div/div/div[3]/div[2]/ul/li[', p, ']/div/div/span[2]/a', sep = ""))
              node_author <- getNodeSet(url_news, paste('/html/body/div/div/div[3]/div[2]/ul/li[', p, ']/div/div/span[3]/a', sep = ""))
              result[num, 1] <- xmlValue(node_title[[1]])
              result[num, 2] <- xmlValue(node_source[[1]])
              for (q in 1:length(node_tags)) {
                temp[q, 1] <- xmlValue(node_tags[[q]])
              }
              # join this item's tags into one comma-separated string
              result[num, 3] <- paste(temp[1:length(node_tags), 1], collapse = ",")
              result[num, 4] <- xmlValue(node_author[[1]])
              result[num, 5] <- alphabet[i]
              result[num, 6] <- tags[j, 1]
            }
            # one output file per letter category
            if (num <= 10) {
              write.table(result[1:num, 1:6], file = paste(alphabet[i], ".r", sep = ""),
                          append = TRUE, col.names = TRUE)
            } else {
              write.table(result[(num - n + 1):num, 1:6], file = paste(alphabet[i], ".r", sep = ""),
                          append = TRUE, col.names = FALSE)
            }
            Sys.sleep(2)  # pause between pages so we do not hammer the server
            rm(result)
            Sys.sleep(1)
            gc()
            result <- data.frame('title' = NA, 'source' = NA, 'tags' = NA,
                                 'author' = NA, 'letter' = NA, 'letter_tag' = NA)
            print(paste("Finished page", k, "of tag", tags[j, 1], ":", n, "items."))
            print(paste("i:", i, "j:", j, "k:", k, "n:", n, "num:", num))
          }
        }
      }
    }
  }
}
Output from a run in the R GUI:
There are still problems and the code needs improvement; one example fix is sketched below.
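For instance, a single failed getURLContent call aborts the whole crawl. A minimal sketch of one hardening step, wrapping the request in tryCatch with retries; fetch_page is a hypothetical helper, not part of the original script:

library(RCurl)

# Hypothetical helper: fetch one page, retrying on network errors
fetch_page <- function(url, retries = 3) {
  for (attempt in 1:retries) {
    y <- tryCatch(getURLContent(url, encoding = "UTF-8"),
                  error = function(e) NULL)
    if (!is.null(y)) return(y)  # success: return the page body
    Sys.sleep(2)                # back off before the next attempt
  }
  NULL                          # all attempts failed; the caller should skip this page
}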
(2) Multithreading in R
1. Install an R parallel-computing package
2. Load the package
3. Specify the number of CPU cores
4. Call the log function and assign the output to result
5. Stop the cluster (a sketch covering all five steps follows)
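A minimal sketch of these five steps, assuming the base parallel package (bundled with R, so step 1 needs no separate install; if another package such as snow was meant, install it first). The core count of 4 and the input vector 1:1000 are illustrative:

library(parallel)                     # step 2: load the package

cl <- makeCluster(4)                  # step 3: start a cluster on 4 CPU cores (assumed count)
result <- parLapply(cl, 1:1000, log)  # step 4: apply log in parallel and assign to result
stopCluster(cl)                       # step 5: stop the cluster

head(unlist(result), 2)               # log(1) = 0, log(2) ≈ 0.6931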
Note:
(3) Proxy settings in R
library(RCurl)  # getCurlHandle and getURL come from RCurl
curl <- getCurlHandle(proxy = "10.10.10.10:8080")
getURL("http://baidu.com", curl = curl)
Blog: http://www.cnblogs.com/jackchen-Net/