crawl a WeChat page

#!/usr/bin/python
# coding: utf-8
import re
from collections import Counter

import requests
import time
from bs4 import BeautifulSoup


def count_zero(text):
    # Heuristic used by get_title_url: more than one '0' in the paragraph text
    # suggests that several zero-padded title numbers (e.g. '001', '002') were
    # merged into a single <p> tag.
    zero = Counter(text)['0']
    if zero > 1:
        return zero
    return False
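# For illustration (inputs assumed): count_zero('001 A002 B') returns 4,
# while count_zero('101 Single title') returns False, since only one '0' appears.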


def get_normal_title(text):
    # The paragraph starts with a three-digit index such as '001'.
    start_index = int(text[:3])

    # Split the merged paragraph at every three-digit number to recover the bare titles.
    titles = re.split(pattern=r'\d{3}', string=text, maxsplit=20)

    titles_rm_question = [item.replace('?', ' ') for item in titles]
    titles_rm_blank = [t for t in titles_rm_question if t]

    titles_normal = []
    for index, i in enumerate(titles_rm_blank):
        # Re-attach a zero-padded, sequentially increasing number to each title.
        t = str(start_index + index).zfill(3) + i
        titles_normal.append(t)

    return titles_normal
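# For example, a merged paragraph such as '001 Title A002 Title B003 Title C'
# (titles assumed for illustration) is split and renumbered as
# ['001 Title A', '002 Title B', '003 Title C'].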


def eliminate_question(title):
    # Despite the name, this strips non-breaking spaces (\xa0) from the title text.
    return str(title).replace('\xa0', '')


def get_title_url(response):
    # Map each cleaned title to the article URL it links to.
    title_url_dict = {}
    soup = BeautifulSoup(response, 'html.parser')
    tag_p = soup.find_all('p')
    pattern = re.compile(r'^\d{3}.*$')
    for each_p in tag_p:
        urls = []
        text = each_p.get_text()
        # Only paragraphs that start with a three-digit index are article entries.
        if pattern.match(text):
            zero_num = count_zero(text)
            if zero_num:
                # Several numbered titles were merged into one paragraph:
                # normalize the titles and pair them with the links in order.
                titles = get_normal_title(text)
                for each_a in each_p.find_all('a'):
                    urls.append(each_a.get('href'))
                title_url_tuple = zip([eliminate_question(t) for t in titles], urls)
                for title, url in title_url_tuple:
                    title_url_dict.setdefault(title, url)
            else:
                # A single title: take the first link in the paragraph, if any.
                text = eliminate_question(text)
                tag_a = each_p.find('a')
                if tag_a:
                    title_url_dict.setdefault(text, tag_a.get('href'))

    return title_url_dict
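# The returned mapping (values hypothetical, for illustration only) looks like:
#   {'001 Some article title': 'https://mp.weixin.qq.com/s/xxxx', ...}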


def download_content(url, title):
    # Fetch the linked article and save its HTML under the title as the filename.
    response = requests.get(url=url).text

    with open(title + '.html', 'w', encoding='utf-8') as f:
        f.write(response)
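# Note: scraped titles may contain characters that are invalid in filenames
# (e.g. '/', '?', ':'). A minimal, assumed sanitization step before opening the
# file could be: safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)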


def main():
    # Index article whose paragraphs link to the individual posts to download.
    url_wechat_index = 'https://mp.weixin.qq.com/s/7o8QxGydMTUe4Q7Tz46Diw'
    response = requests.get(url=url_wechat_index).text
    title_url_dict = get_title_url(response)
    for title, url in title_url_dict.items():
        # Pause between requests to avoid hammering the server.
        time.sleep(5)
        download_content(url, title)


if __name__ == "__main__":
    main()
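Running the script fetches the index article, builds the title-to-URL mapping, and writes one .html file per linked article (named after its title) into the working directory, pausing five seconds between downloads.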