17 Scraping Weather Data from 中国天气网 (weather.com.cn)
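The script below walks the text-forecast pages of the China Weather Network: it first collects the links for each region (North, Northeast, East, Central, South, Northwest, Southwest China and Hong Kong/Macau/Taiwan) from the North China page, then parses every regional page and prints each city's minimum temperature. It needs the requests, beautifulsoup4, lxml and html5lib packages.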
1 """中国天气网爬虫""" 2 3 import requests 4 from bs4 import BeautifulSoup 5 6 HEADERS = { 7 'User-Agent': 'Mozilla/5.0', 8 } 9 10 def parse_detail_page(url, is_html5lib): 11 """爬取具体页面具体数据""" 12 13 respose = requests.get(url, headers=HEADERS) 14 text = respose.content.decode('utf-8') 15 # with open('weather.html', 'w', encoding='utf-8') as fp: 16 # fp.write(text) 17 if is_html5lib == False: 18 soup = BeautifulSoup(text, 'lxml') 19 else: 20 soup = BeautifulSoup(text, 'html5lib') 21 # 以下为具体爬取数据方法 22 conMidtab = soup.find_all('div', attrs={'class':'conMidtab'}) 23 tables = conMidtab[0].find_all('table') 24 for table in tables: 25 trs = table.find_all('tr')[2:] 26 for index,tr in enumerate(trs): 27 tds = tr.find_all('td') 28 city_td = tds[0] 29 if index == 0: 30 city_td = tds[1] 31 city = list(city_td.stripped_strings)[0] 32 temp_td = tds[-2] 33 min_temp = list(temp_td.stripped_strings)[0] 34 # 输出城市及其最低温度 35 print({'city': city, 'min_temp': min_temp}) 36 37 print("="*40) 38 39 def get_detail_urls(url, base_url): 40 """得到华北、东北、华东、华中、华南、西北、西南、港澳台的具体页面链接""" 41 42 urllists = [] # 具体的页面信息列表 43 respose = requests.get(url, headers=HEADERS) 44 text = respose.content.decode('utf-8') 45 soup = BeautifulSoup(text, 'lxml') 46 # 数据爬取 47 uls = soup.find_all('ul', class_='lq_contentboxTab2') 48 alists = uls[0].find_all('a') 49 for list in alists: 50 newurl = base_url + list['href'] 51 urllists.append(newurl) 52 53 return urllists 54 55 def spider(): 56 """""" 57 58 # 初始爬取页面 59 src_url = "http://www.weather.com.cn/textFC/hb.shtml" 60 base_url = "http://www.weather.com.cn" 61 urllists = [] 62 urllists = get_detail_urls(src_url, base_url) 63 #print(urllists) 64 is_html5lib = False # 爬取页面是否用html5lib库 65 for index,urllist in enumerate(urllists): 66 if index != len(urllists)-1: 67 parse_detail_page(urllist, is_html5lib) 68 else: 69 is_html5lib = True 70 # url = "http://www.weather.com.cn/textFC/gat.shtml"这个页面需要用html5lib库解析,不然数据有错 71 parse_detail_page(urllist, is_html5lib) 72 73 if __name__ == '__main__': 74 spider()