Python Web Scraper
1. Configure the third-party packages
# Install the third-party packages first, e.g.:
#   pip install pandas xlsxwriter beautifulsoup4

# Time handling
import datetime
# Data analysis module, used here to build date ranges
import pandas as pd
# Module for writing .xlsx files
import xlsxwriter as xlw
# Module for fetching the pages
from urllib import request
# Parser for the content of HTML/XML tags
from bs4 import BeautifulSoup as bs
2. A function to generate the date sequence
# Generate the sequence of months covered by [start, end]
def dateRange1(start, end):
    # One timestamp per day in the range, formatted down to year and month
    datelist1 = [datetime.datetime.strftime(x, '%Y%m')
                 for x in list(pd.date_range(start=start, end=end))]
    # Deduplicate the daily entries into months and sort them
    datelist = sorted(list(set(datelist1)))
    return datelist  # e.g. ['202005', '202006', '202007', '202008', '202009', '202010']
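Note that pd.date_range with its default daily frequency yields one timestamp per day; formatting with '%Y%m' and deduplicating through set() collapses those days into months. A quick check, using the example range from the comment above:

print(dateRange1('2020-05', '2020-10'))
# ['202005', '202006', '202007', '202008', '202009', '202010']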
3. Scrape the pages
# Fetch the pages, parse the HTML, filter out the daily rows,
# and return the data as a list of lists
def getCommentsById(city, start, end):
    weather_result = []
    # Get the month sequence, e.g. ['202009', '202010']
    datelist = dateRange1(start, end)
    for month in datelist:
        url = 'http://lishi.tianqi.com/' + city + '/' + month + '.html'
        # Build the request for the weather page
        opener = request.Request(url)
        # Add an HTTP User-Agent header so the request looks like a browser
        opener.add_header(
            'User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
        req = request.urlopen(opener).read()
        # Parse the HTML
        soup = bs(req, 'html.parser')
        # Filter the daily rows with the CSS selector 'div .thrui > li'
        weather_m = soup.select('div .thrui > li')
        # Loop over the rows; each holds five <div> cells
        for day in weather_m:
            tt = []
            for j in range(5):
                t = day.find_all('div')[j].string
                if t is not None:
                    tt.append(t)
                else:
                    # Replace None values, otherwise they cannot be written to Excel
                    tt.append('None')
            weather_result.append(tt)
    print(weather_result)
    return weather_result
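For reference, each <li> selected above is assumed to carry five <div> cells: date, high temperature, low temperature, weather, and wind direction. A minimal, self-contained sketch of that parsing step on a hypothetical sample row (the markup and values are illustrative, not copied from the live site):

from bs4 import BeautifulSoup as bs

# Hypothetical sample mirroring the assumed structure of one daily row
sample = ('<div><ul class="thrui"><li>'
          '<div>2020-09-01</div><div>33℃</div><div>24℃</div>'
          '<div>多云</div><div>东北风</div>'
          '</li></ul></div>')
row = bs(sample, 'html.parser').select('div .thrui > li')[0]
print([d.string for d in row.find_all('div')])
# ['2020-09-01', '33℃', '24℃', '多云', '东北风']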
4. Output the Excel file
# Write the list data to a local Excel file
def list_to_excel(weather_result, filename):
    # Create the workbook; this sets the file name and path
    workbook = xlw.Workbook('E:\\%s.xlsx' % filename)
    # Add a worksheet
    sheet = workbook.add_worksheet('weather_report')
    # Header titles for the sheet
    title = ['Date', 'High temp', 'Low temp', 'Weather', 'Wind direction']
    for i in range(len(title)):
        # Write the titles into the header row in bold
        sheet.write_string(0, i, title[i], workbook.add_format({'bold': True}))
    row, col = 1, 0
    for a, b, c, d, e in weather_result:
        # Write one row of data into the sheet
        sheet.write_string(row, col, a)
        sheet.write_string(row, col + 1, b)
        sheet.write_string(row, col + 2, c)
        sheet.write_string(row, col + 3, d)
        sheet.write_string(row, col + 4, e)
        row += 1
    # Close the workbook
    workbook.close()
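As an aside, since pandas is already imported, the same rows could also be written with DataFrame.to_excel, which can use the already-installed xlsxwriter as its engine. A minimal sketch, assuming weather_result holds the five-element rows produced above (the output path is an arbitrary example):

import pandas as pd

# Alternative sketch: dump the same five-column rows via pandas
df = pd.DataFrame(weather_result,
                  columns=['Date', 'High temp', 'Low temp', 'Weather', 'Wind direction'])
df.to_excel('E:\\weather_report.xlsx', sheet_name='weather_report', index=False)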
5. Calling the functions
# The city to query (in pinyin), the start date, and the end date
data = getCommentsById('hunan', '2020-09', '2020-10')
# The returned data and the Excel file name
list_to_excel(data, 'Hunan weather 202009-202010')
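When running this as a standalone script, note that request.urlopen raises urllib.error.URLError (or its subclass HTTPError) on network failures or an unrecognized city slug, so a small guard around the two calls is worthwhile. A minimal sketch ('changsha' is a hypothetical example slug):

from urllib import error

if __name__ == '__main__':
    try:
        data = getCommentsById('changsha', '2020-09', '2020-10')  # hypothetical slug
        list_to_excel(data, 'Changsha weather 202009-202010')
    except error.URLError as e:
        print('Request failed:', e)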
Full source code
# Time handling
import datetime
# Data analysis module, used here to build date ranges
import pandas as pd
# Module for writing .xlsx files
import xlsxwriter as xlw
# Module for fetching the pages
from urllib import request
# Parser for the content of HTML/XML tags
from bs4 import BeautifulSoup as bs


# Generate the sequence of months covered by [start, end]
def dateRange1(start, end):
    # One timestamp per day in the range, formatted down to year and month
    datelist1 = [datetime.datetime.strftime(x, '%Y%m')
                 for x in list(pd.date_range(start=start, end=end))]
    # Deduplicate the daily entries into months and sort them
    datelist = sorted(list(set(datelist1)))
    return datelist  # e.g. ['202005', '202006', '202007', '202008', '202009', '202010']


# Fetch the pages, parse the HTML, filter out the daily rows,
# and return the data as a list of lists
def getCommentsById(city, start, end):
    weather_result = []
    # Get the month sequence, e.g. ['202009', '202010']
    datelist = dateRange1(start, end)
    for month in datelist:
        url = 'http://lishi.tianqi.com/' + city + '/' + month + '.html'
        # Build the request for the weather page
        opener = request.Request(url)
        # Add an HTTP User-Agent header so the request looks like a browser
        opener.add_header(
            'User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
        req = request.urlopen(opener).read()
        # Parse the HTML
        soup = bs(req, 'html.parser')
        # Filter the daily rows with the CSS selector 'div .thrui > li'
        weather_m = soup.select('div .thrui > li')
        # Loop over the rows; each holds five <div> cells
        for day in weather_m:
            tt = []
            for j in range(5):
                t = day.find_all('div')[j].string
                if t is not None:
                    tt.append(t)
                else:
                    # Replace None values, otherwise they cannot be written to Excel
                    tt.append('None')
            weather_result.append(tt)
    print(weather_result)
    return weather_result


# Write the list data to a local Excel file
def list_to_excel(weather_result, filename):
    # Create the workbook; this sets the file name and path
    workbook = xlw.Workbook('E:\\%s.xlsx' % filename)
    # Add a worksheet
    sheet = workbook.add_worksheet('weather_report')
    # Header titles for the sheet
    title = ['Date', 'High temp', 'Low temp', 'Weather', 'Wind direction']
    for i in range(len(title)):
        # Write the titles into the header row in bold
        sheet.write_string(0, i, title[i], workbook.add_format({'bold': True}))
    row, col = 1, 0
    for a, b, c, d, e in weather_result:
        # Write one row of data into the sheet
        sheet.write_string(row, col, a)
        sheet.write_string(row, col + 1, b)
        sheet.write_string(row, col + 2, c)
        sheet.write_string(row, col + 3, d)
        sheet.write_string(row, col + 4, e)
        row += 1
    # Close the workbook
    workbook.close()


# The city to query (in pinyin), the start date, and the end date
data = getCommentsById('hunan', '2020-09', '2020-10')
# The returned data and the Excel file name
list_to_excel(data, 'Hunan weather 202009-202010')