import requests
import parsel
import time
import pandas as pd
def get_rice_data(page=1, *, timeout=10):
    """Scrape one listing page of rice varieties from ricedata.cn.

    Downloads ``https://www.ricedata.cn/variety/identified/nation_{page}.htm``
    and reads every row after the first one of the second ``<table>`` under
    ``<body>``, taking the text of the first six cells of each row.

    Args:
        page: Page number substituted into the URL.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without it a stalled connection would block forever.

    Returns:
        A ``pandas.DataFrame`` with the columns 序号, 品种名称, 亲本来源, 类型,
        原产地/选育单位 and 审定编号. A cell whose XPath matches nothing is
        ``None``. If no rows match, the frame is empty.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.

    Note:
        The HTTP status code is not checked; an error page is parsed like
        any other response.
    """
    url = f"https://www.ricedata.cn/variety/identified/nation_{page}.htm"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    # Use the encoding detected from the body so res.text decodes correctly.
    res.encoding = res.apparent_encoding
    html = parsel.Selector(res.text)

    # Rows of the listing table; the first row is skipped below.
    rows = html.xpath('/html/body/table[2]/tr')

    rice_header = ['序号', '品种名称', '亲本来源', '类型', '原产地/选育单位', '审定编号']
    # One relative XPath per output column, in the same order as rice_header.
    # The second column's text sits inside an <a> element.
    cell_paths = (
        'td[1]/text()',
        'td[2]/a/text()',
        'td[3]/text()',
        'td[4]/text()',
        'td[5]/text()',
        'td[6]/text()',
    )
    rice_data = [
        [row.xpath(path).get() for path in cell_paths]
        for row in rows[1:]
    ]
    return pd.DataFrame(columns=rice_header, data=rice_data)
if __name__ == "__main__":
    # Scrape pages 1..99 and write all rows to a single CSV file.
    rice_datas = []
    for page in range(1, 100):
        # Pause between requests to avoid hammering the server.
        time.sleep(1)
        print(f"正在获取第{page}页数据")
        try:
            data = get_rice_data(page)
        except requests.RequestException as exc:
            # A single failed page must not discard the pages already
            # collected; report it and move on.
            print(f"第{page}页获取失败: {exc}")
            continue
        rice_datas.append(data)

    if rice_datas:
        rice_info = pd.concat(rice_datas, ignore_index=True)
        rice_info.to_csv('rice_data.csv', mode='w', index=False, sep=',')
        print("保存成功!")
    else:
        # pd.concat raises ValueError on an empty list, so skip saving.
        print("未获取到任何数据")