第5课-中国天气网爬虫案例
一、中国天气网爬虫案例
# China Weather Network (weather.com.cn) crawler
import requests
from pyecharts.charts import Bar
from bs4 import BeautifulSoup
import copy
import html5lib

# Every scraped record, appended in page order by weather_spider_dome().
datas = []

# Template for one record; each scraped row is stored as a filled-in copy.
data = {
    "city": None,
    "day": None,
    "higher_temp": None,
    "lower_temp": None
}

# Request headers sent with every page fetch.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Referer": "http://www.weather.com.cn/textFC/db.shtml"
}


def _extract_day(header_text):
    """Return the text between the first "(" and the first ")" of *header_text*."""
    start = header_text.find("(")
    end = header_text.find(")")
    return header_text[start + 1:end]


def _collect_rows(trs):
    """Parse the ``<tr>`` rows of one forecast table and append records to ``datas``.

    Row 0 supplies the day label (parenthesised text of its third cell), row 1
    is skipped, and every later row yields one record.
    """
    cur_day = None
    for i, tr in enumerate(trs):
        tds = tr.find_all("td")
        if i == 0:
            cur_day = _extract_day(tds[2].string)
            continue
        if i == 1:
            # Presumably a sub-header row; it carries no city data.
            continue
        # Row 2 has one extra leading cell (presumably the province cell that
        # spans the following rows), so every column index shifts by one.
        offset = 1 if i == 2 else 0
        record = copy.copy(data)
        record["day"] = cur_day
        record["city"] = list(tds[offset].stripped_strings)[0]
        record["higher_temp"] = tds[3 + offset].string
        record["lower_temp"] = tds[6 + offset].string
        datas.append(record)


def weather_spider_dome(url):
    """Fetch one regional forecast page and append its rows to ``datas``.

    Args:
        url: Address of a ``textFC`` region page on www.weather.com.cn.

    Returns:
        None. Records are appended to the module-level ``datas`` list, each a
        dict with the keys ``city``, ``day``, ``higher_temp`` and ``lower_temp``.

    Raises:
        requests.RequestException: If the page cannot be fetched in time.
    """
    # A timeout keeps one unresponsive page from hanging the whole crawl.
    response = requests.get(url=url, headers=HEADERS, timeout=10)
    html = response.content.decode("utf-8")
    soup = BeautifulSoup(html, "html5lib")
    conMidtabs = soup.find_all(attrs={"class": "conMidtab"})

    # str.find() returns an int, so comparing it with the string "-1" was
    # always true; a plain substring test expresses the intended check.
    is_gat_page = "gat" in url

    for conMidtab in conMidtabs:
        if is_gat_page:
            # On the "gat" page only the first conMidtab2 block is read and
            # each <table> inside it is parsed separately.
            # NOTE(review): this branch assignment follows the original
            # condition's wording; confirm it against the live page markup.
            first_block = conMidtab.find(attrs={"class": "conMidtab2"})
            if first_block is None:
                continue
            for table in first_block.find_all("table"):
                _collect_rows(table.find_all("tr"))
        else:
            # Other region pages: every conMidtab2 block is parsed in turn.
            for conMidtab2 in conMidtab.find_all(attrs={"class": "conMidtab2"}):
                _collect_rows(conMidtab2.find_all("tr"))


if __name__ == "__main__":
    urls = [
        "http://www.weather.com.cn/textFC/hb.shtml",
        "http://www.weather.com.cn/textFC/db.shtml",
        "http://www.weather.com.cn/textFC/hd.shtml",
        "http://www.weather.com.cn/textFC/hz.shtml",
        "http://www.weather.com.cn/textFC/hn.shtml",
        "http://www.weather.com.cn/textFC/xb.shtml",
        "http://www.weather.com.cn/textFC/xn.shtml",
        "http://www.weather.com.cn/textFC/gat.shtml",
    ]
    for url in urls:
        weather_spider_dome(url)
    for i in datas:
        print(i)

# Disabled example: render one day's temperatures as a pyecharts bar chart.
# cities = []
# temp = []
# for i in datas:
#     if i["day"] == "12月11日":
#         cities.append(i["city"])
#         cities.append(i["city"])
#         temp.append(i["higher_temp"])
#         temp.append(i["lower_temp"])
# print(cities)
# print(temp)
# bar = Bar()
#
#
# bar.add_xaxis(cities)
# bar.add_yaxis("12月11日", temp)
# bar.render("weather.html")