"""Crawl Toutiao search results for a keyword and archive the image galleries.

For every result offset the script fetches the JSON search index, visits each
article URL it lists, extracts the ``var gallery = ...`` JSON blob from the
page, downloads the gallery images into the current working directory and
stores a ``{title, url, images}`` record in MongoDB.
"""
import json  # JSON decoding of the search index and the gallery blob
import os  # working directory lookup and file existence checks
import re  # regular expressions (gallery extraction)
from hashlib import md5  # content hash used as the image file name
from json.decoder import JSONDecodeError  # raised by json.loads on bad input
from multiprocessing import Pool  # process pool (not a thread pool)
from typing import Any, Dict, Iterator, Optional
from urllib.parse import urlencode  # query-string encoding

import pymongo  # MongoDB driver
import requests  # HTTP client
from bs4 import BeautifulSoup  # HTML parsing
from requests.exceptions import RequestException  # base class of requests errors

# Supplies the settings used below: MONGO_URL, MONGO_DB, MONGO_TABLE, KEYWORD,
# GROUP_START and GROUP_END.
from config import *  # noqa: F401,F403

# connect=False postpones opening the connection until the first operation
# (see the PyMongo MongoClient documentation).
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]

# Captures everything between "var gallery = " and the next semicolon.
# NOTE(review): the lazy match stops at the first ';', so a gallery whose JSON
# contains a literal semicolon is truncated; parse_page_detail treats that as
# "no gallery" instead of crashing.
GALLERY_PATTERN = re.compile(r'var gallery = (.*?);', re.S)


def get_page_index(offset: int, keyword: str) -> Optional[str]:
    """Fetch one page of the search index.

    Args:
        offset: Result offset sent as the ``offset`` query parameter.
        keyword: Search term sent as the ``keyword`` query parameter.

    Returns:
        The response body on HTTP 200, otherwise ``None`` (also ``None`` when
        the request itself fails).
    """
    params = {
        'offset': offset,  # was misspelled 'office', so paging never worked
        'format': 'json',
        'keyword': keyword,  # was the literal string 'keyword'
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        response = requests.get(url)
    except RequestException:
        print('请求索引出错')
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page_index(html: Optional[str]) -> Iterator[str]:
    """Yield the article URLs listed in a search-index response.

    Args:
        html: JSON text returned by ``get_page_index``; may be ``None`` or
            empty when that request failed.

    Yields:
        Each non-empty ``article_url`` found in the ``data`` list.
    """
    if not html:
        # json.loads(None) raises TypeError, which the handler below would
        # not catch.
        return
    try:
        payload = json.loads(html)
    except JSONDecodeError:
        return
    if not isinstance(payload, dict):
        return
    # 'data' may be missing or null; both mean "no results".
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url


def get_page_detail(url: str) -> Optional[str]:
    """Fetch an article page.

    Args:
        url: Address of the article page.

    Returns:
        The response body on HTTP 200, otherwise ``None``.
    """
    try:
        response = requests.get(url)
    except RequestException:
        print('请求详情页出错', url)
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page_detail(html: str, url: str) -> Optional[Dict[str, Any]]:
    """Extract the title and gallery images from an article page.

    Every image found is downloaded as a side effect.

    Args:
        html: Page source of the article.
        url: Address the page was fetched from; copied into the result.

    Returns:
        ``{'title': ..., 'url': ..., 'images': [...]}`` when the page carries
        a gallery with ``sub_images``; ``None`` otherwise.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # Pages without a <title> used to raise IndexError here.
    title = title_tags[0].get_text() if title_tags else ''
    print(title)

    match = GALLERY_PATTERN.search(html)
    if not match:
        return None
    try:
        gallery = json.loads(match.group(1))
    except JSONDecodeError:
        # Not valid JSON (or truncated at an embedded ';'): treat as no gallery.
        return None
    if not isinstance(gallery, dict) or 'sub_images' not in gallery:
        return None

    sub_images = gallery.get('sub_images') or []
    images = [item.get('url') for item in sub_images]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }


def save_to_monogo(result: Dict[str, Any]) -> bool:
    """Insert a parsed record into the configured MongoDB collection.

    Args:
        result: Document to store.

    Returns:
        ``True`` when the insert call returned a truthy result, else ``False``.
    """
    # insert_one replaces Collection.insert, which was deprecated in PyMongo 3
    # and removed in PyMongo 4.
    if db[MONGO_TABLE].insert_one(result):
        print('存储到MonogoDB成功', result)
        return True
    return False


def download_image(url: Optional[str]) -> None:
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Address of the image; a falsy value is skipped.
    """
    if not url:
        return None
    print('正在下载', url)
    try:
        response = requests.get(url)
    except RequestException:
        print('请求图片出错出错', url)
        return None
    if response.status_code == 200:
        # .content is the raw bytes; .text would decode the image as text.
        save_image(response.content)
    return None


def save_image(content: bytes) -> None:
    """Write image bytes to ``<cwd>/<md5 of content>.jpg`` unless it exists.

    Naming the file after the content hash means identical images are stored
    only once.

    Args:
        content: Raw image bytes.
    """
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(os.getcwd(), file_name)
    if not os.path.exists(file_path):
        # The with-block closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset: int) -> None:
    """Crawl one index page and store every gallery it links to.

    Args:
        offset: Result offset of the index page to process.
    """
    index_html = get_page_index(offset, KEYWORD)
    for url in parse_page_index(index_html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:
            save_to_monogo(result)
            print(result)


if __name__ == '__main__':
    # One offset per index page, 20 results apart.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # map() blocks until every offset is processed, so leaving the with-block
    # afterwards only releases the worker processes.
    with Pool() as pool:
        pool.map(main, groups)