【Python爬虫】查自己博客每月发帖量的小程序
【前提:安装beautifulsoup4和requests】
pip install beautifulsoup4
pip install requests
【代码】
# encoding=utf-8
"""Count how many posts a cnblogs blog published in each month.

The script walks the paginated post list, reads the text of every element
with class ``postDesc2``, extracts the first ``YYYY-MM`` it finds there,
and tallies the occurrences.  The sorted ``(year_month, count)`` tuples are
printed and also written, one per line, to ``output.txt``.
"""
import re
from collections import Counter

import requests
from bs4 import BeautifulSoup

# NOTE(review): "MEIE" looks like a typo for "MSIE"; the string is kept
# unchanged so the server sees exactly the same header as before.
USER_AGENT = 'Mozilla/4.0 (compatible;MEIE 5.5;windows NT)'
HEADERS = {'User-Agent': USER_AGENT}

# Base URL of the paginated post list; the page number is appended.
LIST_URL = 'http://www.cnblogs.com/heyang78/p/?page='

# Inclusive page range to scan (the original loop was range(1, 139)).
FIRST_PAGE = 1
LAST_PAGE = 138

# Seconds to wait for the server before giving up on a request, so a
# stalled connection cannot hang the script forever.
REQUEST_TIMEOUT = 10

OUTPUT_PATH = r'output.txt'

# Matches a year-month such as "2019-10"; compiled once, outside the loops.
YEAR_MONTH_RE = re.compile(r'\d{4}-\d{2}')


def count_posts_on_page(html_text, counts):
    """Add the posts found in one list page to ``counts``.

    Args:
        html_text: HTML source of a post-list page.
        counts: ``Counter`` keyed by ``"YYYY-MM"``; updated in place.
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    for desc_div in soup.find_all(class_="postDesc2"):
        match = YEAR_MONTH_RE.search(desc_div.text)
        if match is None:
            # No date in this element: skip it instead of crashing on
            # ``None.group()``.
            continue
        counts[match.group()] += 1


def collect_monthly_counts(first_page=FIRST_PAGE, last_page=LAST_PAGE):
    """Fetch every list page in the inclusive range and tally posts by month.

    Args:
        first_page: First page number to fetch.
        last_page: Last page number to fetch (inclusive).

    Returns:
        A ``Counter`` mapping ``"YYYY-MM"`` to the number of posts.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.  Failing loudly avoids silently
            under-counting when an error page is returned.
    """
    counts = Counter()
    for page in range(first_page, last_page + 1):
        response = requests.get(
            LIST_URL + str(page),
            headers=HEADERS,
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        count_posts_on_page(response.text, counts)
    return counts


def main():
    """Collect the counts, then print them and save them to ``output.txt``."""
    # Sort by the "YYYY-MM" key so the result is in chronological order.
    monthly = sorted(collect_monthly_counts().items(), key=lambda entry: entry[0])

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as outfile:
        for item in monthly:
            print(item)
            outfile.write(str(item) + "\n")


if __name__ == '__main__':
    main()
【输出示例】
('2016-02', 8)
('2016-03', 14)
('2016-05', 1)
('2016-06', 1)
('2016-07', 17)
('2016-08', 12)
('2016-10', 1)
('2017-01', 19)
('2017-02', 3)
('2017-03', 2)
('2017-04', 1)
('2017-05', 1)
('2017-06', 20)
('2017-07', 10)
('2017-08', 16)
('2017-09', 78)
('2017-10', 5)
('2017-11', 32)
('2017-12', 21)
('2018-01', 7)
('2018-03', 19)
('2018-04', 45)
('2018-05', 43)
('2018-06', 2)
('2018-07', 2)
('2019-03', 37)
('2019-04', 1)
('2019-05', 2)
('2019-07', 1)
('2019-08', 17)
('2019-09', 41)
('2019-10', 63)
('2019-11', 73)
('2019-12', 64)
('2020-01', 80)
('2020-02', 42)
('2020-03', 61)
('2020-04', 43)
('2020-05', 68)
('2020-06', 26)
('2020-09', 1)
('2021-08', 39)
('2021-09', 73)
('2021-10', 61)
('2021-11', 42)
('2021-12', 46)
('2022-01', 30)
('2022-02', 63)
('2022-03', 26)
END