Downloading Comments from the Breastcancer.org Community Forums
(Screenshots in the original post: the community home page, an individual forum, and a topic inside a forum.)
Goal: obtain the comment-related information for every topic on this site. The pipeline has four steps: download the community index page, parse it into a list of forums, download every page of every forum, and parse each downloaded page into per-topic records.
Python implementation
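The core of the downloader is deliberately simple: render each page in Chrome (the script drives a real browser rather than a plain HTTP client) and save the page source to disk, so that parsing happens offline and an interrupted run can resume where it left off. Before the full script, a minimal sketch of that fetch-and-save step; the save_page helper name and the example URL are illustrative, not part of the original code:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def save_page(url, file_path):
    """Render url in headless Chrome and save the page source to file_path."""
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(url)
        with open(file_path, mode='w', encoding='utf-8') as f:
            f.write(browser.page_source)
    finally:
        browser.quit()

# Hypothetical usage; the real script derives page URLs from all_forum_info.csv:
# save_page('https://community.breastcancer.org/forum/5?page=1', '1.html')

The full script: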
# -*- coding: utf-8 -*-
"""
@Datetime: 2019/3/17
@Author: Zhang Yafei
"""
import functools
import os
import re
import time
import traceback
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, wait
from urllib.parse import urljoin

import pandas as pd
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Show all columns in console output
pd.set_option('display.max_columns', None)

forum_info = pd.read_csv('all_forum_info.csv')


def timeit(fun):
    """Decorator that reports a function's running time."""
    @functools.wraps(fun)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = fun(*args, **kwargs)
        print('Elapsed time: %.6f s' % (time.time() - start_time))
        return res
    return wrapper


class Download(object):
    """Downloads every page of one forum."""

    def __init__(self, url):
        # url is a (forum_url, count_topic) tuple
        self.url = url
        self.forum_topic = '【{}】'.format(
            forum_info[forum_info.forum_url == url[0]].forum_topic.values[0])
        # Sanitize the directory name: these characters are illegal or awkward in paths
        self.dir_path = os.path.join('download', self.forum_topic).replace(
            '/', '-').replace(',', '').replace(':', '-').replace('"', '')
        if not os.path.exists(self.dir_path):
            os.makedirs(self.dir_path)
        self.chrome_options = Options()
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--disable-gpu')
        self.dispatch(pool=True)

    def dispatch(self, pool=True):
        """Download all pages of this forum (30 topics per page)."""
        topic_count = int(str(self.url[1]).replace(',', ''))
        pages = divmod(topic_count, 30)[0] + 1
        remain_pages = self.filter_url(pages)
        urls = [self.url[0] + '?page={}'.format(page) for page in remain_pages]
        if urls and pool:
            pool = ThreadPoolExecutor(max_workers=4)
            pool.map(self.forum_start, urls, timeout=60)
            pool.shutdown()
        elif urls:
            list(map(self.forum_start, urls))
        else:
            print(self.dir_path + ' needs {} pages in total'.format(pages) + '\tall downloaded')

    def filter_url(self, pages):
        """Return the page numbers that have not been downloaded yet."""
        has_pages = list(map(lambda x: int(x.strip('.html')), os.listdir(self.dir_path)))
        down_pages = list(set(range(1, pages + 1)) - set(has_pages))
        if len(down_pages):
            print(self.dir_path + ' total: {} pages'.format(pages)
                  + ' downloaded: {} pages'.format(len(has_pages))
                  + ' remaining: {} pages'.format(len(down_pages)))
        return down_pages

    def forum_start(self, url, header=False):
        """Fetch one forum page with Chrome and save its source to disk."""
        if header:
            browser = webdriver.Chrome()
        else:
            browser = webdriver.Chrome(options=self.chrome_options)
        browser.get(url=url)
        html = browser.page_source
        browser.close()
        file_path = os.path.join(self.dir_path, url.split('?page=')[-1] + '.html')
        with open(file_path, mode='w', encoding='utf-8') as f:
            f.write(html)
        print(url + ' downloaded')


class BreastCancer(object):
    """Downloads comment information from the breastcancer.org community."""

    def __init__(self):
        self.base_url = 'https://community.breastcancer.org/'
        self.all_forum_columns = ['hgroup', 'forum_topic', 'forum_url', 'count_topic', 'count_post']
        self.all_forum_file = 'all_forum_info.csv'
        self.all_topic_columns = ['forum', 'topic_url', 'founder', 'count_posts',
                                  'count_views', 'created_time', 'file_path']
        self.all_topic_file = 'all_topic_info.csv'
        if not os.path.exists(self.all_topic_file):
            self.write_to_file(columns=self.all_topic_columns, file=self.all_topic_file, mode='w')

    def download_forums_index(self):
        """Download the community index page."""
        browser = webdriver.Chrome()
        browser.get(url=self.base_url)
        html = browser.page_source
        with open('download/community.html', mode='w', encoding='utf-8') as f:
            f.write(html)
        browser.close()

    def parse_forums_index(self):
        """Parse the community index page and collect every forum's URL."""
        # Create the csv file and write the header row
        if not os.path.exists(self.all_forum_file):
            data = pd.DataFrame(columns=self.all_forum_columns)
            data.to_csv(self.all_forum_file, index=False)
        # Read the saved html, extract the useful fields, append them to the file
        with open('download/community.html', encoding='utf-8') as f:
            response = f.read()
        response = etree.HTML(response)
        rowgroups = response.xpath('//div[@class="rowgroup"]')
        list(map(self.get_all_forums_ids, rowgroups))

    def get_all_forums_ids(self, response):
        """
        Extract, for every forum:
        hgroup, forum_topic (forum title), forum_url (forum page address),
        count_topic (number of topics), count_post (number of posts)
        """
        hgroup = response.xpath('.//h2/text()')[0]
        forums = response.xpath('.//li')
        data_list = []
        for forum in forums:
            forum_topic = forum.xpath('h3/a/text()')[0]
            forum_url = urljoin('https://community.breastcancer.org/', forum.xpath('h3/a/@href')[0])
            count_topic = forum.xpath('.//span[@class="count_topic"]/strong/text()')[0]
            count_post = forum.xpath('.//span[@class="count_post"]/strong/text()')[0]
            data_dict = {'hgroup': hgroup, 'forum_topic': forum_topic, 'forum_url': forum_url,
                         'count_topic': count_topic, 'count_post': count_post}
            data_list.append(data_dict)
        df = pd.DataFrame(data=data_list)
        # Fix the column order of the output
        df = df.loc[:, self.all_forum_columns]
        df.to_csv(self.all_forum_file, index=False, header=False, mode='a')
        print(hgroup + ' parsed')

    def get_all_forum_page(self, pool=True):
        """Collect every forum URL and download all of its pages."""
        df = pd.read_csv(self.all_forum_file)
        forum_urls = df.apply(lambda x: (x.forum_url, x.count_topic), axis=1)
        if pool:
            # Multi-threaded download
            # BreastCancer.thread_pool_download(forum_urls)
            # Multi-process download
            BreastCancer.process_pool_download(forum_urls)
        else:
            # Single-threaded download
            list(map(Download, forum_urls))

    @staticmethod
    def thread_pool_download(urls):
        """Multi-threaded download"""
        pool = ThreadPoolExecutor(max_workers=4)
        tasks = [pool.submit(Download, url) for url in urls]
        wait(tasks)
        # pool.map(Download, urls)
        pool.shutdown()

    @staticmethod
    def process_pool_download(urls):
        """Multi-process download"""
        pool = ProcessPoolExecutor(max_workers=4)
        pool.map(Download, urls)
        pool.shutdown()

    def get_topic_info(self, response):
        """Extract the details of one topic."""
        topic_url = urljoin(self.base_url, response.xpath('./h3/a/@href')[0])
        count_posts = response.xpath('./p[1]/span[1]/strong/text()')[0]
        count_views = response.xpath('./p[1]/span[2]/strong/text()')[0]
        founder = response.xpath('./p[2]/a/text()')[0]
        created = response.xpath('./p[2]')[0].xpath('string(.)').replace('\n', '').replace(' ', '')
        created_time = re.search('on(.*)', created).group(1)
        data_dict = {'topic_url': topic_url,
                     'founder': founder,
                     'count_posts': count_posts,
                     'count_views': count_views,
                     'created_time': created_time,
                     }
        return data_dict

    @staticmethod
    def get_file_path_list(row):
        # Rebuild the same sanitized directory name used by Download
        forum_topic = '【{}】'.format(row.forum_topic)
        dir_path = os.path.join('download', forum_topic).replace('/', '-').replace(',', '').replace(
            ':', '-').replace('"', '')
        file_path_list = list(map(lambda file: os.path.join(dir_path, file), os.listdir(dir_path)))
        return file_path_list

    @timeit
    def start_parse_forum_page(self, pool=False):
        """Start parsing the downloaded forum pages."""
        file_path_lists = forum_info.apply(BreastCancer.get_file_path_list, axis=1).tolist()
        file_path_lists = functools.reduce(lambda x, y: x + y, file_path_lists)
        # Skip pages that have already been parsed
        file_path_lists = self.filter_file_path(file_path_lists)
        if pool:
            pool = ThreadPoolExecutor(max_workers=4)
            pool.map(self.parse_forum_page, file_path_lists)
            pool.shutdown()
        else:
            list(map(self.parse_forum_page, file_path_lists))

    def filter_file_path(self, file_path_list):
        parse_file = pd.read_csv(self.all_topic_file).file_path.unique()
        file_list = set(file_path_list) - set(parse_file)
        print('{} pages in total'.format(len(set(file_path_list)))
              + '\n{} pages already parsed'.format(len(set(parse_file))))
        return list(file_list)

    def parse_forum_page(self, file_path):
        """Parse one downloaded forum page."""
        with open(file_path, encoding='utf-8') as f:
            response = f.read()
        response = etree.HTML(response)
        try:
            forum = response.xpath('//div[@id="section-content"]/h1/text()')[0]
            forum = re.search('Forum: (.*)', forum).group(1)
            topics = response.cssselect('#section-content > ul.rowgroup.topic-list > li')
            data_list = list(map(self.get_topic_info, topics))
            for data_dict in data_list:
                data_dict['forum'] = forum
                data_dict['file_path'] = file_path
            BreastCancer.write_to_file(data=data_list, file=self.all_topic_file,
                                       columns=self.all_topic_columns, mode='a')
            print(file_path + ' parsed')
        except Exception:
            traceback.print_exc()
            print(file_path + ' failed to parse')
            return

    @staticmethod
    def write_to_file(file, data=None, columns=None, mode=None, index=False, header=False):
        if mode == 'w':
            data = pd.DataFrame(columns=columns)
            data.to_csv(file, index=False)
        elif mode == 'a':
            df = pd.DataFrame(data=data)
            df = df.loc[:, columns]
            df.to_csv(file, mode=mode, index=index, header=header)

    @staticmethod
    def all_forum_stat():
        all_hgroup_count = forum_info.hgroup.nunique()
        all_forum_count = forum_info.forum_topic.nunique()
        all_topic_count = pd.to_numeric(
            forum_info.count_topic.astype(str).str.replace(',', ''), downcast='integer').sum()
        all_post_count = pd.to_numeric(
            forum_info.count_post.astype(str).str.replace(',', ''), downcast='integer').sum()
        all_topic_pages = forum_info.apply(
            lambda x: divmod(int(str(x.count_topic).replace(',', '')), 30)[0] + 1, axis=1).sum()
        print('hgroups: {}'.format(all_hgroup_count))
        print('forums: {}'.format(all_forum_count))
        print('topics: {}'.format(all_topic_count))
        print('posts: {}'.format(all_post_count))
        print('pages to download: {}'.format(all_topic_pages))


if __name__ == '__main__':
    breastcancer = BreastCancer()
    # breastcancer.all_forum_stat()
    # 1. Download the community index page
    # breastcancer.download_forums_index()
    # 2. Parse the index and collect every forum URL
    # breastcancer.parse_forums_index()
    # 3. Download all forum pages
    # breastcancer.get_all_forum_page(pool=False)
    # 4. Parse all forum pages
    breastcancer.start_parse_forum_page()
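After all four steps have run, everything ends up in two flat CSV files: all_forum_info.csv (one row per forum) and all_topic_info.csv (one row per topic). A quick way to sanity-check the output, as a minimal sketch that assumes both files already exist in the working directory:

import pandas as pd

# Both files are produced by the script above
forums = pd.read_csv('all_forum_info.csv')
topics = pd.read_csv('all_topic_info.csv')

# Topics recovered per forum, versus the counts advertised on the index page
parsed_counts = topics.groupby('forum').topic_url.nunique()
print(parsed_counts.sort_values(ascending=False).head())
print('forums: {}, topics parsed: {}'.format(forums.shape[0], topics.shape[0]))

Because both the downloader (filter_url) and the parser (filter_file_path) diff their work against what is already on disk, any forum whose parsed count falls short can be re-crawled simply by re-running steps 3 and 4.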