电子灵魂

golang,go,C#,JAVA,PYTHON,PHP

  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理



import json  # JSON parsing
import os  # working directory lookup and file-existence checks
import re  # regular expressions
from hashlib import md5  # MD5 digest used to name downloaded images
from json.decoder import JSONDecodeError  # raised on malformed JSON
from multiprocessing import Pool  # pool of worker processes
from urllib.parse import urlencode  # query-string encoding

import pymongo  # MongoDB client
import requests  # HTTP client
from bs4 import BeautifulSoup  # HTML parsing
from requests.exceptions import RequestException  # base class of HTTP errors

from config import *  # MONGO_*, KEYWORD and GROUP_* settings

# Module-level MongoDB handles shared by the functions below. MONGO_URL and
# MONGO_DB are supplied by the star-import from config.
# NOTE(review): connect=False presumably defers the real connection until the
# first operation, so no socket is opened before Pool() starts the worker
# processes — confirm against the pymongo documentation.
client = pymongo.MongoClient(MONGO_URL,connect=False)
db = client[MONGO_DB]

# Fetch one page of the search index.
def get_page_index(offset, keyword):
    """Request one page of search results and return the raw response body.

    Args:
        offset: Paging offset of the first result to request.
        keyword: Search term to query for.

    Returns:
        The response text when the server answers with HTTP 200 (JSON is
        requested via ``format=json``); ``None`` on any other status code or
        when the request itself fails.
    """
    params = {
        'offset': offset,  # paging offset supplied by the caller
        'format': 'json',  # ask for a JSON response
        'keyword': keyword,  # the caller's search term, not a fixed string
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        response = requests.get(url)
        # Only a 200 response carries usable data.
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求索引出错')
        return None

def parse_page_index(html):
    """Yield the article URLs contained in a search-index JSON response.

    Args:
        html: Raw response body of the index request, or ``None`` when that
            request failed.

    Yields:
        Each non-empty ``article_url`` found in the ``data`` list of the
        decoded JSON object. Nothing is yielded when the input is empty, is
        not valid JSON, is not a JSON object, or has no usable ``data`` list.
    """
    # A failed index request hands us None; json.loads would raise TypeError.
    if not html:
        return
    try:
        payload = json.loads(html)
    except JSONDecodeError:
        return
    if not isinstance(payload, dict):
        return
    # "data" may be missing or null, so fall back to an empty list.
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without an article URL cannot be fetched; skip them.
        if article_url:
            yield article_url

def get_page_detail(url):
    """Download one article page and return its HTML.

    Args:
        url: Address of the article page.

    Returns:
        The response text when the server answers with HTTP 200; ``None`` on
        any other status code or when the request fails.
    """
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # The URL is already part of this message, so it is printed once only.
        print('请求详情页出错', url)
        return None


def parse_page_detail(html, url):
    """Extract the title and gallery images from an article page.

    Every image URL found is downloaded via ``download_image`` before the
    record is returned.

    Args:
        html: HTML source of the article page.
        url: Address the page was fetched from; stored in the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``images`` when the page
        embeds a ``var gallery = {...}`` object containing ``sub_images``;
        ``None`` otherwise.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # Pages without a <title> element get an empty title instead of crashing.
    title = title_tags[0].get_text() if title_tags else ''
    print(title)

    marker = re.search(r'var gallery = ', html)
    if not marker:
        return None
    try:
        # raw_decode parses the JSON value that starts right after the marker
        # and ignores whatever JavaScript follows it, so no terminator has to
        # be guessed by a regular expression.
        gallery, _ = json.JSONDecoder().raw_decode(html[marker.end():].lstrip())
    except JSONDecodeError:
        print('解析图集数据出错', url)
        return None

    if not isinstance(gallery, dict) or 'sub_images' not in gallery:
        return None
    sub_images = gallery.get('sub_images') or []
    # Keep only entries that actually carry a URL.
    images = [
        item.get('url')
        for item in sub_images
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }

def save_to_monogo(result):
    """Insert one parsed gallery record into the MONGO_TABLE collection.

    Args:
        result: Dict describing one gallery, as returned by
            ``parse_page_detail``.

    Returns:
        ``True`` when the document was inserted, ``False`` otherwise.
    """
    # insert_one replaces the deprecated Collection.insert, which newer
    # PyMongo releases no longer provide.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is not None:
        print('存储到MonogoDB成功', result)
        return True
    return False

def download_image(url):
    """Fetch an image and hand its bytes to ``save_image``.

    Args:
        url: Address of the image to download.

    Returns:
        Always ``None``. The image is saved only when the server answers
        with HTTP 200; request errors are reported and swallowed.
    """
    print('正在下载', url)
    try:
        reply = requests.get(url)
        fetched = reply.status_code == 200
        if fetched:
            save_image(reply.content)
    except RequestException:
        print('请求图片出错出错', url)
    return None

def save_image(content):
    """Write image bytes into the current working directory.

    The file is named ``<md5 of content>.jpg``, so identical images map to
    the same path and are written only once.

    Args:
        content: Raw bytes of the image.
    """
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(os.getcwd(), file_name)
    # Skip the write when this exact image has already been saved.
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Crawl one page of search results and store every gallery found.

    Fetches the index page for KEYWORD at the given offset, follows each
    article URL, parses the gallery and saves it to MongoDB.

    Args:
        offset: Paging offset passed to the index request.
    """
    index_html = get_page_index(offset, KEYWORD)
    if not index_html:
        # The index request failed or returned a non-200 status, so there is
        # nothing to parse for this offset.
        return
    for url in parse_page_index(index_html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:
            save_to_monogo(result)
        print(result)
if __name__ == '__main__':
    # One task per results page: the index request asks for 20 items at a
    # time, so consecutive pages start 20 apart.
    offsets = [page * 20 for page in range(GROUP_START, GROUP_END + 1)]
    # map() blocks until every page is processed; leaving the with-block then
    # shuts the worker processes down instead of leaking them.
    with Pool() as pool:
        pool.map(main, offsets)


posted on 2018-10-21 13:05  conncent  阅读(384)  评论(0)  编辑  收藏  举报