佛山价格数据爬取

Posted on 2018-11-29 16:32  豪顿  阅读(122)  评论(0)  编辑  收藏  举报

library(rvest)
library(dplyr)
library(RMySQL)
# ---- Step 1: build the bulletin index (title, link, date) -------------------
# `url` is the index page listing the price bulletins; `base_url` is the
# prefix used to turn the page's relative "./..." links into absolute URLs.
url <- "http://www.fsnyj.gov.cn/zwgk/jgxq/index.html"
base_url <- "http://www.fsnyj.gov.cn/zwgk/jgxq/"
urls <- c(url)

# Scrape each index page into its own data frame and bind them once at the
# end, instead of growing `tables` with rbind() on every iteration.
index_pages <- lapply(urls, function(page_url) {
  html <- read_html(page_url)
  node <- html %>% html_nodes("div.alist ul li a")
  date <- html %>% html_nodes("div.alist ul li span") %>% html_text()
  page <- data.frame(
    title = gsub("\\n", "", html_text(node)),
    # Anchor the pattern: only a leading "./" is a relative-link prefix.
    # An unanchored "\\./" would also rewrite the tail of "../" links.
    href = sub("^\\./", base_url, html_attr(node, "href")),
    date = date,
    # Keep text columns as character at construction time. Re-wrapping an
    # existing data frame with data.frame(x, stringsAsFactors = FALSE) does
    # not convert factor columns back to character.
    stringsAsFactors = FALSE
  )
  print(page_url)
  page
})

# `tables` holds one row per bulletin: title, absolute href, and date text.
tables <- do.call(rbind, index_pages)
tables$title <- gsub(" ", "", tables$title)
# Download and stack the detail price tables of the first `yestDate` bulletins.
#
# Reads the bulletin index from the global data frame `tables` (columns `href`
# and `date`) and fetches the pages of rows 1..yestDate, pausing one second
# between requests.
#
# Args:
#   yestDate: number of leading rows of `tables` to fetch. Despite its name
#             this is a count, not a date. 0 fetches nothing.
#
# Returns:
#   A data frame with the twelve price-table columns plus `date` (the bulletin
#   date text repeated on every row), or an empty data frame when nothing was
#   fetched. Pages without a matching table are skipped with a warning.
f <- function(yestDate) {
  out_cols <- c(
    "序号", "品种", "规格等级", "产地品牌", "计量单位", "顺德区",
    "禅城区", "南海区", "三水区", "高明区", "今日平均", "备注"
  )
  # The code relies on html_table() naming header-less columns X1, X2, ...
  src_cols <- paste0("X", seq_along(out_cols))

  # One slot per bulletin; bound together once after the loop.
  pages <- vector("list", yestDate)

  # seq_len() yields an empty sequence for 0, whereas 1:0 would run j = 1, 0.
  for (j in seq_len(yestDate)) {
    bulletin_date <- as.character(tables$date[j])
    table_html <- read_html(as.character(tables$href[j]))
    parsed <- table_html %>%
      html_nodes("table[width='910']") %>%
      html_table()

    if (length(parsed) == 0) {
      # Previously `table[[1]]` aborted the whole run with a subscript error.
      warning("no price table found for ", bulletin_date, "; skipped",
              call. = FALSE)
    } else {
      detail <- as.data.frame(parsed[[1]], stringsAsFactors = FALSE)[src_cols]
      names(detail) <- out_cols
      detail$date <- rep(bulletin_date, nrow(detail))
      # The first row is the table's own header. Negative indexing is safe
      # even for a header-only table, where 2:1 would select rows 2 and 1.
      pages[[j]] <- detail[-1, , drop = FALSE]
      print(j)
      print(paste(bulletin_date, "正在爬取....完"))
    }
    Sys.sleep(1)  # be polite to the server between requests
  }

  pages <- Filter(Negate(is.null), pages)
  if (length(pages) == 0) {
    return(data.frame())
  }
  do.call(rbind, pages)
}

# ---- Step 3: scrape the bulletins of the last few days and save them --------
yestDate <- 4  # look-back window, in days

cutoff <- Sys.Date() - yestDate
bulletin_dates <- as.Date(tables$date)
# Drop NA dates explicitly: an NA in a logical index would add NA entries to
# `dates` and inflate the count passed to f().
recent <- !is.na(bulletin_dates) & bulletin_dates >= cutoff
dates <- tables$date[recent]

if (length(dates) > 0) {
  # f() always fetches the leading rows of `tables`, so move the recent
  # bulletins to the front rather than assuming the index is newest-first.
  tables <- rbind(
    tables[recent, , drop = FALSE],
    tables[!recent, , drop = FALSE]
  )
  print(paste(dates, "正在爬取...."))
  detail_tables <- f(length(dates))
  # Write only when something was scraped; otherwise `detail_tables` would be
  # undefined, or stale from an earlier session.
  write.csv(
    detail_tables,
    file = paste0("D:\\地市调研\\地市对接详情\\detail_tables", (Sys.Date() - 1), ".csv")
  )
} else {
  message("No bulletins dated on or after ", cutoff, "; nothing written.")
}

Copyright © 2024 豪顿
Powered by .NET 8.0 on Kubernetes