完整代码 -- 爬取国家粮食局历年水稻数据
"""Scrape the rice-variety tables published under https://ricedata.cn/variety/.

The script runs in three stages:

1. Read the index page and collect the link of every region listing.
2. Open each region listing, read its pagination links and expand them
   into the full list of page URLs.
3. Open every page and collect the text of each table row.

Results are left in the module-level lists ``urls_province``,
``privince_pages`` and ``data_content``.
"""
import time

import requests
from lxml import etree

BASE_URL = "https://ricedata.cn/variety/"
IDENTIFIED_URL = "https://ricedata.cn/variety/identified/"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
}
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs forever
REQUEST_DELAY = 0.2   # seconds to pause between consecutive requests


def fetch_html(url, encoding):
    """Download *url* and return it parsed as an lxml HTML tree.

    Args:
        url: Address of the page to download.
        encoding: Text encoding used to decode the response body.

    Returns:
        The root element produced by ``etree.HTML``.
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of ``requests.get`` is ``params``, which would put the User-Agent into
    # the query string instead of sending it as an HTTP header.
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.encoding = encoding
    return etree.HTML(response.text, etree.HTMLParser())


def build_page_urls(last_href):
    """Expand the last pagination link into the URLs of pages 1..N.

    The page count N is the text between the last ``_`` of *last_href* and
    the following ``.`` (for example ``nation_12.htm`` gives 12).

    Args:
        last_href: ``href`` value of the last pagination link.

    Returns:
        A list of absolute page URLs, or an empty list when no numeric
        page count can be read from *last_href*.
    """
    # Split on the separators rather than on the number itself, so a file
    # name that contains the same digits elsewhere is not cut in the wrong
    # place.
    head, underscore, tail = last_href.rpartition("_")
    count_text, dot, extension = tail.partition(".")
    try:
        page_count = int(count_text)
    except ValueError:
        return []
    prefix = head + underscore
    suffix = dot + extension
    return [
        IDENTIFIED_URL + prefix + str(page_number) + suffix
        for page_number in range(1, page_count + 1)
    ]


# Stage 1: collect the region listing links from the index page.
index_tree = fetch_html(BASE_URL, "utf-8")
index_hrefs = index_tree.xpath('/html/body//tr[4]/td/div/a/@href')
urls_province = [BASE_URL + href for href in index_hrefs]

# Stage 2: expand every region listing into its individual page URLs,
# e.g. https://ricedata.cn/variety/identified/nation_1.htm
privince_pages = []
for url_province in urls_province:
    listing_tree = fetch_html(url_province, "utf-8")
    pagination_hrefs = listing_tree.xpath('/html/body/table[2]/caption/b/a/@href')
    if pagination_hrefs:
        privince_pages.extend(build_page_urls(pagination_hrefs[-1]))
    else:
        # No pagination links found; skip this listing instead of aborting
        # the whole crawl with an IndexError.
        print("no pagination links found, skipped:", url_province)
    time.sleep(REQUEST_DELAY)
print(len(privince_pages))

# Stage 3: collect the cell texts of every table row on every page.
data_content = []
for privince_page in privince_pages:
    print(privince_page)
    # These pages are decoded as GBK, unlike the listings above.
    page_tree = fetch_html(privince_page, "gbk")
    rows = page_tree.xpath('/html/body/table[2]//tr')
    data_content.extend(row.xpath('./td/text()') for row in rows)
    time.sleep(REQUEST_DELAY)
print(len(data_content))