Crawling 1药网 (111.com.cn) with pyspider (repost)
1. Crawling products

The pyspider handler below starts from the category index, follows every category to its paginated listing pages (on_start → categories_page → cagetory_list_page), then scrapes each product's detail page (detail_page) and upserts one document per product into the drug.goods collection in MongoDB.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-02-02 08:59:40
# Project: oneDrug

from pyspider.libs.base_handler import *
from pymongo import MongoClient
import re


class Handler(BaseHandler):
    crawl_config = {}

    def __init__(self):
        self.client = MongoClient('mongodb://localhost:27017')
        self.drug = self.client.drug

    def insert_goods(self, data):
        # upsert keyed on goods_id (the original used the deprecated collection.update)
        collection = self.drug['goods']
        collection.replace_one({'goods_id': data['goods_id']}, data, upsert=True)

    def insert_comments(self, data):
        collection = self.drug['comments']
        collection.insert_one(data)

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('https://www.111.com.cn/categories/', callback=self.categories_page,
                   validate_cert=False, fetch_type='js')

    @config(age=10 * 24 * 60 * 60)
    def categories_page(self, response):
        # one request per category on the "all categories" page
        for each in response.doc('.allsort em > a').items():
            self.crawl(each.attr.href, callback=self.cagetory_list_page,
                       validate_cert=False, fetch_type='js')

    @config(priority=1)
    def cagetory_list_page(self, response):
        # product links on the current listing page
        for each in response.doc('#itemSearchList a[target="_blank"][class="product_pic pro_img"]').items():
            self.crawl(each.attr.href, callback=self.detail_page,
                       validate_cert=False, fetch_type='js')
        # follow pagination; the original crawled unconditionally and so failed on the last page
        next_page = response.doc('#search_table > div.turnPageBottom > a.page_next').attr.href
        if next_page:
            self.crawl(next_page, callback=self.cagetory_list_page,
                       validate_cert=False, fetch_type='js')

    @config(priority=2)
    def detail_page(self, response):
        goods_id = response.doc('#gallery_view > ul > li.item_number').text()
        cagetory_one = response.doc('body > div.wrap.clearfix > div > span:nth-child(3) > a').text()
        cagetory_two = response.doc('body > div.wrap.clearfix > div > span:nth-child(5) > a').text()
        cagetory_three = response.doc('body > div.wrap.clearfix > div > span:nth-child(7) > a').text()
        merchants = response.doc('div.middle_property > span:nth-child(1)').text()
        goods_name = response.doc('div.middle_property > h1').text()
        goods_desc = response.doc('div.middle_property > span.red.giftRed').text()
        goods_price = response.doc(
            'div.middle_property > div.shangpin_info > dl:nth-child(2) > dd > span.good_price').text()
        brand = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(2) > td:nth-child(2)').text()
        spec = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(2) > td:nth-child(4)').text()
        weight = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(3) > td:nth-child(2)').text()
        manufacturers = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(3) > td:nth-child(4)').text()
        approval_number = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(4) > td:nth-child(2)').text()
        drug_type = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(4) > td:nth-child(4)').text()

        # the instruction sheet is a two-column table: th = field name, td = value
        instructions = {}
        if response.doc('#prodDetailCotentDiv > table > tbody > tr:nth-child(1) > th').text():
            for i in range(3, 22):
                instructions_key = response.doc(
                    '#prodDetailCotentDiv > table > tbody > tr:nth-child({}) > th'.format(i)).text().split(" ")[0]
                instructions_value = response.doc(
                    '#prodDetailCotentDiv > table > tbody > tr:nth-child({}) > td'.format(i)).text()
                instructions[instructions_key] = instructions_value

        total_comments = response.doc('#itemComments > span').text()
        good_comments = response.doc('#productExperience > div > ul > li:nth-child(2) > a > span').text()
        mid_comments = response.doc('#productExperience > div > ul > li:nth-child(3) > a > span').text()
        bad_comments = response.doc('#productExperience > div > ul > li:nth-child(4) > a > span').text()
        # the second run of digits in the product URL is its numeric id
        url_id = re.findall(r'\d+', response.url)[1]

        goods_data = {
            'url_id': url_id,
            'goods_id': goods_id,
            'goods_name': goods_name,
            'goods_desc': goods_desc,
            'goods_price': goods_price,
            'merchants': merchants,
            'cagetory': {
                '1': cagetory_one,
                '2': cagetory_two,
                '3': cagetory_three
            },
            'drug_detail': {
                'brand': brand,
                'spec': spec,
                'weight': weight,
                'manufacturers': manufacturers,
                'approval_number': approval_number,
                'drug_type': drug_type
            },
            'instructions': instructions,
            'comments': {
                'total_comments': total_comments,
                'good_comments': good_comments,
                'mid_comments': mid_comments,
                'bad_comments': bad_comments
            }
        }
        self.insert_goods(goods_data)
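To run the handler, paste it into a new project in pyspider's web UI (pyspider all starts the scheduler, fetcher, and web UI, which listens on http://localhost:5000 by default). Once results start flowing in, a quick way to sanity-check them is to query the drug.goods collection directly. A minimal sketch, assuming a local mongod on the default port:

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
goods = client.drug['goods']

# how many products have been stored so far
print(goods.count_documents({}))

# spot-check a few documents written by insert_goods()
for doc in goods.find({}, {'goods_name': 1, 'goods_price': 1}).limit(5):
    print(doc.get('goods_name'), doc.get('goods_price'))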
2. Crawling comments

The standalone requests/BeautifulSoup script below reads the product ids collected in step 1 back out of MongoDB, probes each product's review endpoint for its total page count, then pages through the reviews and inserts one document per review into drug.comments. dbmodify() is a one-off cleanup that strips the prefix from goods_id and the ¥ sign from goods_price.
from pymongo import MongoClient
import requests
from bs4 import BeautifulSoup
import re


class Drug:
    def __init__(self):
        self.client = MongoClient('mongodb://localhost:27017')
        self.drug = self.client.drug
        self.collection = self.drug['goods']
        self.comm_collection = self.drug['comments']

    def dbmodify(self):
        # normalize goods_id ("xx:123456" -> "123456") and goods_price ("¥9.90" -> "9.90")
        for data in self.collection.find({}, {"goods_id": 1, "goods_price": 1}):
            try:
                _id = data['_id']
                goods_id = data['goods_id'].split(":")[1]
                price = data['goods_price'].split("¥")[1]
                self.collection.update_one({'_id': _id},
                                           {'$set': {'goods_id': goods_id, 'goods_price': price}})
                print(_id, goods_id, price)
            except IndexError:
                pass

    def getBaseArgument(self, goods_id):
        # fetch the first comment page and read the total page count from the pager text
        base_url = 'https://www.111.com.cn/interfaces/review/list/html.action'
        data = {
            'goodsId': goods_id,
            'pageIndex': 1,
            'score': '1&_19020301'
        }
        try:
            self.collection.update_one({'url_id': goods_id}, {'$set': {'commspider': True}})
            requests.packages.urllib3.disable_warnings()
            requests.adapters.DEFAULT_RETRIES = 5
            s = requests.Session()
            s.keep_alive = False  # set the connection keep-alive state to False
            r = s.get(base_url, params=data, timeout=5, verify=False)
            r.close()
            soup = BeautifulSoup(r.text, 'html.parser')
            if soup.find_all("div", class_="view_no_result"):
                return "No Comments!"
            # the pager text reads "共N页" ("N pages in total")
            total_page_text = soup.find_all(text=re.compile(r'共\d+页'))[0]
            total_page = re.findall(r'\d+', total_page_text)
            return total_page[0]
        except requests.exceptions.RequestException as e:
            print(e)

    def getCommlist(self, goods_id, total_page):
        base_url = 'https://www.111.com.cn/interfaces/review/list/html.action'
        try:
            # the original iterated range(1, int(total_page)), which skipped the last page
            for i in range(1, int(total_page) + 1):
                data = {
                    'goodsId': goods_id,
                    'pageIndex': i,
                    'score': '1&_19020301'
                }
                try:
                    requests.packages.urllib3.disable_warnings()
                    requests.adapters.DEFAULT_RETRIES = 15
                    s = requests.Session()
                    s.keep_alive = False  # set the connection keep-alive state to False
                    r = s.get(base_url, params=data, timeout=5, verify=False)
                    r.close()
                    soup = BeautifulSoup(r.text, 'html.parser')
                    # each <tr> holds one review: the first <p> is the text, p.eval_date the date
                    for tr in soup.find_all("tr"):
                        comments = {}
                        try:
                            comments['goodsId'] = goods_id
                            comments['content'] = tr.find('p').text.strip()
                            comments['date'] = tr.find('p', attrs={'class': 'eval_date'}).text.strip()
                            self.comm_collection.insert_one(comments)
                        except AttributeError:
                            print(goods_id + " has a problem!\n")
                            print(comments)
                except requests.exceptions.RequestException as e:
                    print(e)
        except (TypeError, ValueError):
            # total_page was "No Comments!" or None rather than a page count
            return "No Comments! Try next!"

    def getComments(self):
        i = 0
        goods_list = []
        for data in self.collection.find({'commspider': False}, {"url_id"}):
            goods_list.append(data['url_id'])
        length = len(goods_list)
        print("{} products in total".format(length))
        for good in goods_list:
            total_page = self.getBaseArgument(good)
            comments = self.getCommlist(good, total_page)
            i = i + 1
            print("{} products in total\ncurrent: {}\nproduct id: {}\n".format(length, i, good))
            print(comments)


if __name__ == '__main__':
    Drug().getComments()
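With the reviews stored, MongoDB's aggregation pipeline makes simple roll-ups cheap. A minimal sketch, assuming the drug.comments documents carry the goodsId field written by getCommlist above, that counts reviews per product:

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
comments = client.drug['comments']

pipeline = [
    {'$group': {'_id': '$goodsId', 'n_reviews': {'$sum': 1}}},  # one bucket per product
    {'$sort': {'n_reviews': -1}},                               # most-reviewed first
    {'$limit': 10},
]
for row in comments.aggregate(pipeline):
    print(row['_id'], row['n_reviews'])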