风飞花

博客园 首页 新随笔 联系 订阅 管理

spider.py

  1 # -*- coding:utf-8 -*-
  2 from urllib import urlencode
  3 import requests
  4 from requests.exceptions import RequestException
  5 import json
  6 import re
  7 import os
  8 from hashlib import md5
  9 from bs4 import BeautifulSoup
 10 import pymongo
 11 from multiprocessing import Pool
 12 from json.decoder import JSONDecoder
 13 from config import *
 14 
# Module-level MongoDB handles used by save_to_mongo().
# connect=False: per the PyMongo documentation the client then connects on the
# first operation instead of at construction time.
# NOTE(review): presumably chosen because main() runs inside
# multiprocessing.Pool workers, so no connection should exist before the
# worker processes are created -- confirm.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]
 17 
 18 def get_page_index(offset,keyword):
 19     data = {
 20         'offset': offset,
 21         'format': 'json',
 22         'keyword': keyword,
 23         'autoload': 'true',
 24         'count': '20',
 25         'cur_tab': 3
 26     }
 27     url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
 28     try:
 29         response = requests.get(url)
 30         if response.status_code == 200:
 31             return response.text
 32         return None
 33     except RequestException:
 34         print u'请求索引页失败', url
 35         return None
 36 
 37 def parse_page_index(html):
 38     data = json.loads(html)
 39     if data and 'data' in data.keys():
 40         for item in data.get('data'):
 41             yield item.get('article_url')
 42 
 43 def get_page_detail(url):
 44     try:
 45         response = requests.get(url)
 46         if response.status_code == 200:
 47             return response.text
 48         return None
 49     except RequestException:
 50         print u'请求详情页失败', url
 51         return None
 52 
 53 def parse_page_detail(html, url):
 54     soup = BeautifulSoup(html, 'lxml')
 55     title = soup.select('title')[0].get_text()
 56     print(title)
 57     images_pattern = re.compile('gallery: (.*?),\n', re.S)
 58     result = re.search(images_pattern, html)
 59     if result:
 60         data = json.loads(result.group(1))
 61         if data and 'sub_images' in data.keys():
 62             sub_images = data.get('sub_images')
 63             images = [item.get('url') for item in sub_images]
 64             for image in images: download_image(image)
 65             return {
 66                 'title': title,
 67                 'url': url,
 68                 'images': images
 69             }
 70 
 71 def save_to_mongo(result):
 72     if db[MONGO_TABLE].insert(result):
 73         print u'存储到MongoDB成功', result
 74         return True
 75     return False
 76 
 77 def download_image(url):
 78     print u'正在下载', url
 79     try:
 80         response = requests.get(url)
 81         if response.status_code == 200:
 82             save_image(response.content)
 83         return None
 84     except RequestException:
 85         print u'请求图片失败', url
 86         return None
 87 
 88 def save_image(content):
 89     file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
 90     if not os.path.exists(file_path):
 91         with open(file_path, 'wb') as f:
 92             f.write(content)
 93             f.close()
 94 
 95 def main(offset):
 96     html = get_page_index(offset, KEYWORD)
 97     for url in parse_page_index(html):
 98         html = get_page_detail(url)
 99         if html:
100             result = parse_page_detail(html, url)
101             if result: save_to_mongo(result)
102 
if __name__ == '__main__':
    # One task per index page: offsets GROUP_START*20 .. GROUP_END*20.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    try:
        pool.map(main, groups)
    finally:
        # Shut the worker processes down explicitly rather than leaving
        # them to be cleaned up at interpreter exit.
        pool.close()
        pool.join()
View Code

config.py

1 # -*- coding:utf-8 -*-
# Host passed to pymongo.MongoClient in spider.py.
MONGO_URL = 'localhost'
# Database name; spider.py opens it as client[MONGO_DB].
MONGO_DB = 'toutiao'
# Collection name; spider.py writes to db[MONGO_TABLE].
MONGO_TABLE = 'toutiao'

# Inclusive range of page indices. spider.py multiplies each index by 20 to
# build the search offset, so 0..20 covers offsets 0, 20, ..., 400.
GROUP_START = 0
GROUP_END = 20

# Search keyword passed to get_page_index() in spider.py.
KEYWORD = '街拍'
View Code

 

posted on 2017-10-10 11:20  风飞花  阅读(261)  评论(0)  编辑  收藏  举报