动态网站爬取实例

  1 import requests
  2 import json
  3 from requests.exceptions import ConnectionError
  4 from json.decoder import JSONDecodeError
  5 from lxml import etree
  6 import re
  7 #from config import *
  8 import pymongo
  9 import os
 10 from hashlib import md5
 11 from multiprocessing import Pool
 12 
 13 client=pymongo.MongoClient('lacalhost')
 14 db=client['toutiao1']
 15 def get_page_parse(offset,keyword):
 16     data={
 17         'offset': offset,
 18         'format': 'json',
 19         'keyword':keyword,
 20         'autoload':'true',
 21         'count': 20,
 22         'cur_tab': 3
 23     }
 24     url="https://www.toutiao.com/search_content/"
 25     try:
 26         response = requests.get(url,params=data)
 27         if response.status_code == 200:
 28             #json_data=response.json()
 29             json_data=response.text
 30             return json_data
 31         return None
 32     except ConnectionError:
 33             print('Error occurred')
 34             return None
 35 
 36 def parse_page_index(text):
 37     try:
 38         data = json.loads(text)
 39         for item in data.get('data'):
 40             #print(item["article_url"])
 41             if "article_url" in item:
 42                 yield item.get("article_url")
 43             #yield item.get('article_url','default_value')
 44     except JSONDecodeError:
 45         pass
 46 #html=get_page_parse(0,'街拍')
 47 #for url in parse_page_index(html):
 48     #if len(url)>9:
 49         #print(url)
 50 def get_page_detail(url):
 51     try:
 52         response = requests.get(url)
 53         if response.status_code == 200:
 54             return response.text
 55         return None
 56     except ConnectionError:
 57         print('Error occurred')
 58         return None
 59 
 60 def parse_page_details(html,url):
 61     html1 = etree.HTML(html)
 62     title = html1.xpath('//head/title/text()')
 63     pattern = re.compile('gallery: JSON.parse\("(.*?)"\),\s+siblingList', re.S)
 64     urls = re.findall(pattern, html)
 65     # print(urls)
 66     #print(title)
 67     d = ",".join(urls)
 68     s = d.replace('\\', "")
 69     j = json.loads(s)
 70     images_urls = [item.get('url') for item in j["sub_images"]]
 71     for images_url in images_urls:download_image(images_url)
 72     return {
 73         'title':title,
 74         'url':url,
 75         'images_urls': images_urls
 76 
 77     }
 78 
 79 def save_to_mongo(resuit):
 80     if db['toutiao1'].insert(resuit):
 81         print("yes")
 82         return True
 83     else:
 84         return False
 85 
 86 def download_image(url):
 87     print('brgain',url)
 88     try:
 89         response = requests.get(url)
 90         if response.status_code == 200:
 91             save_image(response.content)
 92         return None
 93     except ConnectionError:
 94         print('Error occurred')
 95         return None
 96 
 97 def save_image(content):
 98     file_path='{0}/{1}/{2}.{3}'.format(os.getcwd(),'pictyre',md5(content).hexdigest(),'jpg')
 99     if not os.path.exists(file_path):
100         with open(file_path,'wb') as f:
101             f.write(content)
102             f.close()
103 
104 
def main(offset):
    """Crawl one search-results page: list its articles, then fetch and
    parse each one (parse_page_details downloads the images).

    Args:
        offset: pagination offset passed to get_page_parse.
    """
    text = get_page_parse(offset, '街拍')
    if text is None:
        # fetch failed; original passed None into json.loads downstream
        return
    for url in parse_page_index(text):
        html = get_page_detail(url)
        if html is None:
            # skip failed downloads instead of crashing in the parser
            continue
        parse_page_details(html, url)
        # Persisting to Mongo was disabled in the original:
        # save_to_mongo(result)
113 
if __name__ == '__main__':
    # One worker pool for all offsets. The original recreated the pool in a
    # single-iteration loop and never closed it, leaking worker processes;
    # the context manager calls terminate()/join() on exit.
    offsets = [offset * 20 for offset in range(1, 2)]
    with Pool() as pool:
        pool.map(main, offsets)

 

posted @ 2017-12-03 15:52  不可叽叽歪歪  阅读(752)  评论(0编辑  收藏  举报