spider.py
1 # -*- coding:utf-8 -*- 2 from urllib import urlencode 3 import requests 4 from requests.exceptions import RequestException 5 import json 6 import re 7 import os 8 from hashlib import md5 9 from bs4 import BeautifulSoup 10 import pymongo 11 from multiprocessing import Pool 12 from json.decoder import JSONDecoder 13 from config import * 14 15 client = pymongo.MongoClient(MONGO_URL, connect=False) 16 db = client[MONGO_DB] 17 18 def get_page_index(offset,keyword): 19 data = { 20 'offset': offset, 21 'format': 'json', 22 'keyword': keyword, 23 'autoload': 'true', 24 'count': '20', 25 'cur_tab': 3 26 } 27 url = 'http://www.toutiao.com/search_content/?' + urlencode(data) 28 try: 29 response = requests.get(url) 30 if response.status_code == 200: 31 return response.text 32 return None 33 except RequestException: 34 print u'请求索引页失败', url 35 return None 36 37 def parse_page_index(html): 38 data = json.loads(html) 39 if data and 'data' in data.keys(): 40 for item in data.get('data'): 41 yield item.get('article_url') 42 43 def get_page_detail(url): 44 try: 45 response = requests.get(url) 46 if response.status_code == 200: 47 return response.text 48 return None 49 except RequestException: 50 print u'请求详情页失败', url 51 return None 52 53 def parse_page_detail(html, url): 54 soup = BeautifulSoup(html, 'lxml') 55 title = soup.select('title')[0].get_text() 56 print(title) 57 images_pattern = re.compile('gallery: (.*?),\n', re.S) 58 result = re.search(images_pattern, html) 59 if result: 60 data = json.loads(result.group(1)) 61 if data and 'sub_images' in data.keys(): 62 sub_images = data.get('sub_images') 63 images = [item.get('url') for item in sub_images] 64 for image in images: download_image(image) 65 return { 66 'title': title, 67 'url': url, 68 'images': images 69 } 70 71 def save_to_mongo(result): 72 if db[MONGO_TABLE].insert(result): 73 print u'存储到MongoDB成功', result 74 return True 75 return False 76 77 def download_image(url): 78 print u'正在下载', url 79 try: 80 response = 
requests.get(url) 81 if response.status_code == 200: 82 save_image(response.content) 83 return None 84 except RequestException: 85 print u'请求图片失败', url 86 return None 87 88 def save_image(content): 89 file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg') 90 if not os.path.exists(file_path): 91 with open(file_path, 'wb') as f: 92 f.write(content) 93 f.close() 94 95 def main(offset): 96 html = get_page_index(offset, KEYWORD) 97 for url in parse_page_index(html): 98 html = get_page_detail(url) 99 if html: 100 result = parse_page_detail(html, url) 101 if result: save_to_mongo(result) 102 103 if __name__ == '__main__': 104 groups = [x*20 for x in range(GROUP_START, GROUP_END+1)] 105 pool = Pool() 106 pool.map(main, groups)
config.py
# -*- coding:utf-8 -*-
# Settings read by spider.py through "from config import *".

# MongoDB host, database name and collection name.
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# Inclusive range of index pages to crawl; spider.py multiplies each value
# by 20 to obtain the search offset.
GROUP_START = 0
GROUP_END = 20

# Search keyword sent to the search endpoint.
KEYWORD = '街拍'