Python 天气爬虫接口

天气预报API

功能

从中国天气网抓取数据，返回未来1-7天的天气数据，包括：日期、天气、温度、风力、风向。

 # 输入：城市名，如西安、北京；数据来自中国天气网，因此只支持中国城市
# 返回：列表，包含1-7天的天气数据，每一天对应一个列表成员；每个成员本身也是列表，元素为字符串，内容如上所述
# 若城市不在city_code_in_weather_report.txt内，则返回None
def get_weather(city:str):

检查网页源代码可知，我们需要的数据在标签为 li、且 class 为“sky skyid lv2 on”或“sky skyid lv2”的元素中。注意，其中的 lv2 在不同城市可能不同，因此需要用正则表达式去匹配，其余细节请直接看代码。

 <ul class="t clearfix">
  <li class="sky skyid lv2 on">
    <h1>14日（今天）</h1>
    <big class="png40 n00"></big>
    <p title="晴" class="wea">晴</p>
    <p class="tem">
      <i>0℃</i>
    </p>
    <p class="win">
      <em>
        <span title="北风" class="N"></span>
      </em>
      <i>3级</i>
    </p>
    <div class="slid"></div>
  </li>
  <li class="sky skyid lv2">
    <h1>15日（明天）</h1>
    <big class="png40 n00"></big>
    <p title="晴" class="wea">晴</p>
    <p class="tem">
      <span>15℃</span>/ <i>0℃</i>
    </p>
    <p class="win">
      <span title="西南风" class="SW"></span>
      <span title="西风" class="W"></span>
      <i>3级</i>
    </p>
    <div class="slid"></div>
  </li>
  <li class="sky skyid lv2">
    <h1>16日（后天）</h1>
  </li>
</ul>

 import requests
import re
from bs4 import BeautifulSoup
 
# Base URL of the forecast pages; the city code and `suffix` are appended to it.
head = 'http://www.weather.com.cn/weather/'
# Extension that terminates the forecast page URL.
suffix = '.shtml'
# Lookup file with one `code=city` pair per line (e.g. 101010100=北京).
txt = 'city_code_in_weather_report.txt'
 
 
def __load_code(path=None):  # .txt -> dict
    """Load the city-name -> city-code mapping from the lookup file.

    Each useful line has the form ``101010100=北京`` (code, ``=``, city).
    Blank lines and lines that lack a code, a ``=`` or a city are skipped,
    so they no longer create bogus entries such as an empty-string key.

    Args:
        path: File to read. Defaults to the module-level ``txt`` path.

    Returns:
        dict mapping each city name to its code string. When a city appears
        more than once, the last line wins.

    Raises:
        OSError: If the file cannot be opened.
    """
    if path is None:
        path = txt
    result = {}
    # 'utf-8-sig' reads plain UTF-8 as well, but drops a leading BOM that
    # would otherwise end up glued to the first city code.
    with open(path, 'r', encoding='utf-8-sig') as f:
        for raw in f:
            code, sep, city = raw.strip().partition('=')
            code = code.strip()
            city = city.strip()
            if not sep or not code or not city:
                continue  # blank or malformed line
            result[city] = code
    return result
 
 
def get_list(s_html):  # html -> list of string lists
    """Convert each tag into the list of its non-empty text lines.

    Args:
        s_html: Iterable of objects exposing ``get_text()``.

    Returns:
        One list per input object: its text split on newlines with every
        empty string removed. Whitespace-only pieces are kept as they are.
        A row is expected to look like
        ['14日（今天）', '多云转晴', '14/4℃', '<3级'].
    """
    rows = []
    for node in s_html:
        pieces = node.get_text().split('\n')
        rows.append([piece for piece in pieces if piece != ''])
    return rows
 
 
def wind_dir(s_html):  # wind direction is stored in an attribute, not in the text
    """Extract one wind-direction string per forecast entry.

    The direction is the ``title`` attribute of the ``span`` tags inside each
    entry. The attribute is read directly instead of regex-matching the
    rendered tag, because a greedy ``title=".*"`` also swallows any attribute
    that follows ``title``.

    An entry may hold one or two direction spans, so the result has exactly
    one item per entry and stays aligned with the entries: the titles of an
    entry are joined with '转', consecutive duplicates are collapsed, and an
    entry without any titled span yields ''.

    Args:
        s_html: Iterable of tags exposing ``find_all``.

    Returns:
        List of strings with the same length as ``s_html``.
    """
    result = []
    for item in s_html:
        directions = []
        for span in item.find_all('span'):
            title = span.get('title')
            # Spans without a title (e.g. the temperature span) are not wind data.
            if title and (not directions or directions[-1] != title):
                directions.append(title)
        result.append('转'.join(directions))
    return result
 
 
def get_html(soup, tag):  # keep only the tags that hold forecast data
    """Return the ``tag`` elements whose classes start with ``sky skyid lvN``.

    The class attribute is a list of tokens, e.g. ``sky skyid lv2 on`` becomes
    ['sky', 'skyid', 'lv2', 'on']. The digit after ``lv`` varies, so the third
    token is matched with a regular expression.

    Elements with no class attribute or with fewer than three class tokens
    are skipped instead of raising.

    Args:
        soup: Parsed document exposing ``find_all``.
        tag: Tag name to search for, e.g. 'li'.

    Returns:
        List of the matching elements, in document order.
    """
    res = []
    for node in soup.find_all(tag):
        classes = node.get('class') or []
        if len(classes) < 3:
            continue
        if (classes[0] == 'sky' and classes[1] == 'skyid'
                and re.fullmatch(r'lv\d', str(classes[2]))):
            res.append(node)
    return res
 
 
def get_weather(city: str, timeout: float = 10):
    """Fetch the forecast rows for a city from weather.com.cn.

    Args:
        city: City name as it appears in the code lookup file, e.g. '西安'.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot block the caller forever.

    Returns:
        A list with one list of strings per forecast entry: the entry's text
        lines followed by its wind direction. Returns None when ``city`` is
        not in the lookup file.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status (an error page is no longer parsed as a forecast).
    """
    dic = __load_code()
    if city not in dic:
        return None
    r = requests.get(head + dic[city] + suffix, timeout=timeout)
    r.raise_for_status()
    # Let requests guess the charset from the body before decoding the text.
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    ori_html = get_html(soup, 'li')

    w_list = get_list(ori_html)
    wind_list = wind_dir(ori_html)
    # Merge the wind direction into each row; fall back to '' rather than
    # raising IndexError when there are fewer wind entries than rows.
    for index, day in enumerate(w_list):
        day.append(wind_list[index] if index < len(wind_list) else '')
    return w_list
 
 
if __name__ == "__main__":
    # Demo run: fetch the forecast for Xi'an and print it.
    forecast = get_weather('西安')
    print(forecast)

posted @ 2022-11-17 16:19 October- 阅读(140) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· 爬取天气信息

· 数据集WebVision 1.0 (google 子集)介绍附pytorch下的简单使用

· python之天气爬虫

· Python 爬取历史天气数据

· 天气预报爬虫示例

阅读排行：
· 10年+ .NET Coder 心语 ── 封装的思维：从隐藏、稳定开始理解其本质意义
· 地球OL攻略 —— 某应届生求职总结
· 提示词工程——AI应用必不可少的技术
· Open-Sora 2.0 重磅开源！
· 周边上新：园子的第一款马克杯温暖上架

Python 天气爬虫接口

天气预报API

功能

最新随笔

我的标签

积分与排名

随笔分类 (66)

随笔档案 (66)

阅读排行榜

评论排行榜

最新评论

	# 输入：城市名，如西安、北京；数据来自中国天气网，因此只支持中国城市
	# 返回：列表，包含1-7天的天气数据，每一天对应一个列表成员；每个成员本身也是列表，元素为字符串，内容如上所述
	# 若城市不在city_code_in_weather_report.txt内，则返回None
	def get_weather(city:str):

	<ul class="t clearfix">
	<li class="sky skyid lv2 on">
	<h1>14日（今天）</h1>
	<big class="png40 n00"></big>
	<p title="晴" class="wea">晴</p>
	<p class="tem">
	<i>0℃</i>
	</p>
	<p class="win">
	<em>
	<span title="北风" class="N"></span>
	</em>
	<i>3级</i>
	</p>
	<div class="slid"></div>
	</li>
	<li class="sky skyid lv2">
	<h1>15日（明天）</h1>
	<big class="png40 n00"></big>
	<p title="晴" class="wea">晴</p>
	<p class="tem">
	<span>15℃</span>/ <i>0℃</i>
	</p>
	<p class="win">
	<span title="西南风" class="SW"></span>
	<span title="西风" class="W"></span>
	<i>3级</i>
	</p>
	<div class="slid"></div>
	</li>
	<li class="sky skyid lv2">
	<h1>16日（后天）</h1>
	</li>
	</ul>

	import requests
	import re
	from bs4 import BeautifulSoup

	# Base URL of the forecast pages; the city code and `suffix` are appended to it.
	head = 'http://www.weather.com.cn/weather/'
	# Extension that terminates the forecast page URL.
	suffix = '.shtml'
	# Lookup file with one `code=city` pair per line (e.g. 101010100=北京).
	txt = 'city_code_in_weather_report.txt'


	def __load_code(path=None):  # .txt -> dict
		"""Load the city-name -> city-code mapping from the lookup file.

		Each useful line has the form ``101010100=北京`` (code, ``=``, city).
		Blank lines and lines that lack a code, a ``=`` or a city are skipped,
		so they do not create bogus entries such as an empty-string key.

		Args:
			path: File to read. Defaults to the module-level ``txt`` path.

		Returns:
			dict mapping each city name to its code string. When a city
			appears more than once, the last line wins.

		Raises:
			OSError: If the file cannot be opened.
		"""
		if path is None:
			path = txt
		result = {}
		# 'utf-8-sig' reads plain UTF-8 as well, but drops a leading BOM that
		# would otherwise end up glued to the first city code.
		with open(path, 'r', encoding='utf-8-sig') as f:
			for raw in f:
				code, sep, city = raw.strip().partition('=')
				code = code.strip()
				city = city.strip()
				if not sep or not code or not city:
					continue  # blank or malformed line
				result[city] = code
		return result


	def get_list(s_html):  # html -> list of string lists
		"""Convert each tag into the list of its non-empty text lines.

		Args:
			s_html: Iterable of objects exposing ``get_text()``.

		Returns:
			One list per input object: its text split on newlines with every
			empty string removed. A row is expected to look like
			['14日（今天）', '多云转晴', '14/4℃', '<3级'].
		"""
		rows = []
		for node in s_html:
			pieces = node.get_text().split('\n')
			rows.append([piece for piece in pieces if piece != ''])
		return rows


	def wind_dir(s_html):  # wind direction is stored in an attribute, not in the text
		"""Extract one wind-direction string per forecast entry.

		The direction is the ``title`` attribute of the ``span`` tags inside
		each entry. The attribute is read directly instead of regex-matching
		the rendered tag, because a greedy ``title=".*"`` also swallows any
		attribute that follows ``title``.

		An entry may hold one or two direction spans, so the result has exactly
		one item per entry: its titles are joined with '转', consecutive
		duplicates are collapsed, and an entry without a titled span yields ''.

		Args:
			s_html: Iterable of tags exposing ``find_all``.

		Returns:
			List of strings with the same length as ``s_html``.
		"""
		result = []
		for item in s_html:
			directions = []
			for span in item.find_all('span'):
				title = span.get('title')
				# Spans without a title (e.g. the temperature span) are not wind data.
				if title and (not directions or directions[-1] != title):
					directions.append(title)
			result.append('转'.join(directions))
		return result


	def get_html(soup, tag):  # keep only the tags that hold forecast data
		"""Return the ``tag`` elements whose classes start with ``sky skyid lvN``.

		The class attribute is a list of tokens, e.g. ``sky skyid lv2 on``
		becomes ['sky', 'skyid', 'lv2', 'on']. The digit after ``lv`` varies,
		so the third token is matched with a regular expression. Elements with
		no class attribute or fewer than three class tokens are skipped.

		Args:
			soup: Parsed document exposing ``find_all``.
			tag: Tag name to search for, e.g. 'li'.

		Returns:
			List of the matching elements, in document order.
		"""
		res = []
		for node in soup.find_all(tag):
			classes = node.get('class') or []
			if len(classes) < 3:
				continue
			if (classes[0] == 'sky' and classes[1] == 'skyid'
					and re.fullmatch(r'lv\d', str(classes[2]))):
				res.append(node)
		return res


	def get_weather(city: str, timeout: float = 10):
		"""Fetch the forecast rows for a city from weather.com.cn.

		Args:
			city: City name as it appears in the code lookup file, e.g. '西安'.
			timeout: Seconds to wait for the HTTP request before giving up.

		Returns:
			A list with one list of strings per forecast entry: the entry's
			text lines followed by its wind direction. Returns None when
			``city`` is not in the lookup file.

		Raises:
			requests.RequestException: On network failure, timeout, or an
				HTTP error status.
		"""
		dic = __load_code()
		if city not in dic:
			return None
		r = requests.get(head + dic[city] + suffix, timeout=timeout)
		r.raise_for_status()
		# Let requests guess the charset from the body before decoding the text.
		r.encoding = r.apparent_encoding
		soup = BeautifulSoup(r.text, 'html.parser')
		ori_html = get_html(soup, 'li')
		w_list = get_list(ori_html)
		wind_list = wind_dir(ori_html)
		# Merge the wind direction into each row; fall back to '' rather than
		# raising IndexError when there are fewer wind entries than rows.
		for index, day in enumerate(w_list):
			day.append(wind_list[index] if index < len(wind_list) else '')
		return w_list


	if __name__ == "__main__":
		# Demo run: fetch the forecast for Xi'an and print it.
		print(get_weather('西安'))