'''
对崔庆才的个人博客上的文章基本信息的爬取 (共41页)存入mongo
https://cuiqingcai.com/page/1
标题、链接、浏览的数目、评论的数目以及喜欢的人数
分别将浏览数、评论数以及喜欢数排前十的文章统计出来并绘制出图表。
'''
import logging
import re
import aiohttp
import asyncio
from lxml import etree
import pymongo
import pyecharts as pye

# Root logger: INFO and above, each record prefixed with time and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Every crawling step in this module is implemented as a coroutine.

# Shared accumulator: parse_content() appends one dict per article here and
# main() writes the whole list to MongoDB once crawling is done.
info_list = []

# NOTE: despite its name, `db` is the MongoClient; `col` is the `blogs`
# collection inside the `abc` database on the local server.
db = pymongo.MongoClient(host='127.0.0.1', port=27017)
col = db['abc']['blogs']
# Fetch the content of a web page.
async def parse_url(url):
    """Download ``url`` asynchronously and return the response body as text.

    Args:
        url: Address of the page to request.

    Returns:
        The decoded response body (``str``).

    Both the session and the response are managed with ``async with`` so the
    underlying connection is released even if the request or the decoding
    raises; previously the session was only closed on the success path.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()

# Raw-string pattern, compiled once: the first run of digits in a text node.
_DIGITS_RE = re.compile(r'\d+')


def _first_text(node, path):
    """Return the first result of XPath ``path`` under ``node``, or None."""
    matches = node.xpath(path)
    return matches[0] if matches else None


def _first_int(text):
    """Return the first run of digits in ``text`` as an int, or None."""
    if text is None:
        return None
    match = _DIGITS_RE.search(text)
    return int(match.group()) if match else None


# Parse a listing page and collect the data we need.
async def parse_content(url):
    """Scrape one listing page and append its articles to ``info_list``.

    Args:
        url: Address of the listing page to download and parse.

    Returns:
        None. One dict per article (``title``, ``author``, ``publish_time``,
        ``page_view``, ``comment``) is appended to the module-level
        ``info_list``.

    A missing title is stored as ``None``. An article that lacks the author,
    the publish time or either counter is skipped with a warning instead of
    raising ``IndexError`` and losing the remaining articles of the page.
    """
    text = await parse_url(url)
    tree = etree.HTML(text)

    for article in tree.xpath("//div[@class='content']/article"):
        title = _first_text(article, "./header/h2/a/text()")
        author = _first_text(article, './p/span[1]/a/text()')
        publish_time = _first_text(article, "./p/span[2]/text()")
        page_view = _first_int(_first_text(article, "./p/span[3]/text()"))
        comment = _first_int(_first_text(article, "./p/span[4]/a/text()"))

        if None in (author, publish_time, page_view, comment):
            logging.warning('Skipping article with missing fields on %s: %r', url, title)
            continue

        info_list.append({'title': title, 'author': author, 'publish_time': publish_time,
                          'page_view': page_view, 'comment': comment})

# Draw a bar chart from the given data.
def plot(title, name, index_list, value_list):
    """Render a bar chart to ``<name>.html``.

    Args:
        title: Chart title passed to ``pye.Bar``.
        name: Base name of the output file; ``.html`` is appended.
        index_list: Category labels, one per bar.
        value_list: Values matching ``index_list`` position by position.
    """
    # Both sequences are reversed before plotting.
    # NOTE(review): is_convert=True presumably swaps the axes, so reversing
    # would put the first entry at the top of the chart -- confirm.
    labels, values = index_list[::-1], value_list[::-1]
    output_file = '{}.html'.format(name)

    chart = pye.Bar(title=title)
    chart.add(
        '数量',
        labels,
        values,
        is_convert=True,
        is_label_show=True,
        label_pos='right',
    )
    chart.render(output_file)

# Analyse the stored data and draw the charts.
def analy():
    """Chart the ten most-viewed and ten most-commented stored articles.

    Reads every document from the module-level ``col`` collection and calls
    ``plot`` twice, producing ``page_view.html`` and ``comment.html``.
    """
    top = 10
    rows = [(doc['title'], doc['page_view'], doc['comment']) for doc in col.find()]

    # Rank by page views (tuple index 1) and by comment count (tuple index 2).
    by_views = sorted(rows, key=lambda row: int(row[1]), reverse=True)[:top]
    by_comments = sorted(rows, key=lambda row: int(row[2]), reverse=True)[:top]

    plot('浏览量前十的文章', 'page_view',
         [row[0] for row in by_views], [row[1] for row in by_views])
    # The comment chart must show comment counts (index 2); it previously
    # reused the page-view column (index 1).
    plot('评论量前十的文章', 'comment',
         [row[0] for row in by_comments], [row[2] for row in by_comments])


async def _crawl_pages(urls):
    """Scrape all ``urls`` concurrently and log every page that failed."""
    outcomes = await asyncio.gather(
        *(parse_content(url) for url in urls), return_exceptions=True)
    for url, outcome in zip(urls, outcomes):
        if isinstance(outcome, BaseException):
            logging.error('Failed to scrape %s: %r', url, outcome)


def main():
    """Crawl every listing page, store the articles in MongoDB, draw the charts.

    The blog has 41 listing pages, so pages 1..41 inclusive are requested.
    Pages are fetched concurrently; a page that fails is logged rather than
    silently dropped, and the remaining pages are still processed.
    """
    page_count = 41
    urls = ['https://cuiqingcai.com/page/{}'.format(i)
            for i in range(1, page_count + 1)]

    # The coroutines are created inside a running loop owned by this function,
    # instead of scheduling futures on an implicit global loop.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(_crawl_pages(urls))
    finally:
        loop.close()

    # Collection.insert() is deprecated (removed in PyMongo 4); insert_many()
    # rejects an empty list, so only call it when something was scraped.
    # NOTE: each run appends to the collection, so repeated runs store
    # duplicate articles.
    if info_list:
        col.insert_many(info_list)
    else:
        logging.warning('No articles were scraped; nothing was stored.')

    # Analyse the stored data and draw the charts.
    analy()


# Run the crawl and the analysis only when executed as a script, not on import.
if __name__ == '__main__':
    main()

# Blog-page footer (not part of the script): posted on 2019-10-25 17:18 by Afrafre; views (84), comments (0); edit / favorite / report.