#!/usr/bin/python
# coding: utf-8
import re
from collections import Counter
import requests
import time
from bs4 import BeautifulSoup
def count_zero(text):
    """Return the number of '0' characters in *text* if there is more than
    one, otherwise ``False``.

    Callers use the truthiness of the result to decide whether *text*
    packs several zero-padded numbered titles (e.g. '001...002...').
    """
    # str.count is the direct way to count one character; the original
    # built a full Counter of every character just to read one key.
    zeros = text.count('0')
    return zeros if zeros > 1 else False
def get_normal_title(text):
    """Split a concatenated headline string into individual numbered titles.

    *text* looks like ``'001First title?002Second title'``: zero-padded
    three-digit indices, each followed by a title.  The text is split on
    every three-digit run, question marks are replaced with spaces, and
    each surviving piece is re-prefixed with its own three-digit,
    zero-padded sequence number counting up from the number held in the
    first three characters of *text*.

    Returns a list of ``'NNNtitle'`` strings.
    """
    start = int(text[:3])
    # Raw string avoids the invalid-escape DeprecationWarning; the
    # original's flags=1 (re.IGNORECASE) was meaningless for a
    # digits-only pattern and is dropped.  maxsplit kept as before.
    pieces = re.split(r'\d\d\d', text, maxsplit=20)
    # Replace '?' first, then drop empty pieces (the split always leaves
    # an empty leading element because the text starts with digits).
    cleaned = [p.replace('?', ' ') for p in pieces]
    cleaned = [p for p in cleaned if p]
    # zfill(3) fixes the original ad-hoc padding, which keyed off the
    # width of the *starting* number and produced wrong widths once the
    # running number crossed a power of ten (e.g. start '099' yielded
    # '0100...' for the next entry instead of '100...').
    return [str(start + i).zfill(3) + piece for i, piece in enumerate(cleaned)]
def eliminate_question(title):
    """Return *title* as a string with every non-breaking space (U+00A0)
    removed."""
    return ''.join(ch for ch in str(title) if ch != '\xa0')
def get_title_url(response):
    """Extract a ``{title: url}`` mapping from a WeChat index page.

    *response* is the raw HTML text of the index page.  Every ``<p>``
    whose text starts with a three-digit number is treated as one or
    more numbered entries: paragraphs packing several titles (detected
    via count_zero) are expanded with get_normal_title and zipped with
    the paragraph's ``<a>`` hrefs in document order; single-title
    paragraphs use their first link.

    Returns a dict; the first URL seen for a given title wins
    (setdefault never overwrites).
    """
    title_url_dict = {}
    soup = BeautifulSoup(response, 'html.parser')
    # Compile once, outside the loop (the original recompiled the same
    # pattern for every paragraph), and use a raw string for the escape.
    pattern = re.compile(r'^\d{3}.*$')
    for each_p in soup.find_all('p'):
        text = each_p.get_text()
        if not pattern.match(text):
            continue
        if count_zero(text):
            # Several numbered titles in one paragraph: split them and
            # pair each with the paragraph's links in order.
            titles = get_normal_title(text)
            urls = [a.get('href') for a in each_p.find_all('a')]
            for title, url in zip((eliminate_question(t) for t in titles), urls):
                title_url_dict.setdefault(title, url)
        else:
            anchor = each_p.find('a')
            # Guard: a numbered paragraph with no <a> crashed the
            # original with AttributeError on None.
            if anchor is not None:
                title_url_dict.setdefault(eliminate_question(text), anchor.get('href'))
    return title_url_dict
def download_content(url, title):
    """Fetch *url* and save its body to ``'<title>.html'`` as UTF-8 text.

    Note: *title* is used verbatim in the filename; characters that are
    invalid in filenames (e.g. '/') would make the open() fail.
    """
    # Timeout added: the original requests.get had none, so one stalled
    # server could hang the whole crawl indefinitely.
    response = requests.get(url=url, timeout=30)
    with open(title + '.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
def main():
    """Crawl the WeChat index page and download every article it links to."""
    index_url = 'https://mp.weixin.qq.com/s/7o8QxGydMTUe4Q7Tz46Diw'
    html = requests.get(url=index_url).text
    for title, url in get_title_url(html).items():
        time.sleep(5)  # pause between fetches to go easy on the server
        download_content(url, title)


if __name__ == "__main__":
    main()