Python爬虫爬取微博热搜保存为 Markdown 文件
微博热搜榜python爬虫,仅供学习交流
源码及注释:
1 # -*- coding=UTF-8 -*- 2 #!usr/bin/env python 3 4 import os 5 import time 6 import requests 7 from lxml import etree 8 9 url = "https://s.weibo.com/top/summary?cate=realtimehot" 10 headers={ 11 'Host': 's.weibo.com', 12 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 13 'Accept-Encoding': 'gzip, deflate, br', 14 'Accept-Language': 'zh-CN,zh;q=0.9', 15 'Connection': 'keep-alive', 16 'Referer': 'https://weibo.com/', 17 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36' 18 } 19 20 r = requests.get(url,headers=headers) 21 print(r.status_code) 22 23 html_xpath = etree.HTML(r.text) 24 data = html_xpath.xpath('//*[@id="pl_top_realtimehot"]/table/tbody/tr/td[2]') 25 num = -1 26 27 28 # # 解决存储路径 29 # time_path = time.strftime('%Y{y}%m{m}%d{d}',time.localtime()).format(y='年', m='月', d='日') 30 # time_name = time.strftime('%Y{y}%m{m}%d{d}%H{h}',time.localtime()).format(y='年', m='月', d='日',h='点') 31 # root = "./" + time_path + "/" 32 # path = root + time_name + '.md' 33 # if not os.path.exists(root): 34 # os.mkdir(root) 35 36 37 38 39 # 解决存储路径 40 time_path = time.strftime('%Y{y}%m{m}%d{d}',time.localtime()).format(y='年', m='月', d='日') 41 time_name = time.strftime('%Y{y}%m{m}%d{d}%H{h}',time.localtime()).format(y='年', m='月', d='日',h='点') 42 year_path = time.strftime('%Y{y}',time.localtime()).format(y='年') 43 month_path = time.strftime('%m{m}',time.localtime()).format(m='月') 44 day_month = time.strftime('%d{d}',time.localtime()).format(d='日') 45 all_path = "./" + year_path + '/'+ month_path + '/' + day_month 46 if not os.path.exists(all_path): 47 # 创建多层路径 48 os.makedirs(all_path) 49 50 51 # 最终文件存储位置 52 root = all_path + "/" 53 path = root + time_name + '.md' 54 55 56 print(path) 57 # 文件头部信息 58 with open(path,'a') as f: 59 f.write('{} {}\n\n'.format('# ',time_name+'数据')) 60 f.close() 61 62 for tr in (data): 63 title = tr.xpath('./a/text()') 64 hot_score = tr.xpath('./span/text()') 65 66 num += 1 67 68 # 过滤第 0 条 69 if num == 0: 70 pass 71 else: 72 with open(path,'a') as f: 73 74 f.write('{} {}、{}\n\n'.format('###',num,title[0])) 75 f.write('{} {}\n\n'.format('微博当时热度为:',hot_score[0])) 76 77 f.close() 78 79 print(num,title[0],'微博此时的热度为:',hot_score[0])
运行:
运行结束后会在当前文件夹下生成以时间命名的文件夹,并且会生成以具体小时为单位的具体时间命名的 Markdown 文件,如下:
查看: