配置scrapy-splash+python爬取医院信息(利用了scrapy-splash)
北京艾丽斯妇科医院(http://fuke.fuke120.com/)
首先说一下如何配置 Splash
1.利用pip安装scrapy-splash库
pip install scrapy-splash
2.现在就要用到另一个神器(Docker)
Docker下载地址:https://www.docker.com/community-edition#/windows
3.安装好Docker后启动Docker拉取镜像
docker pull scrapinghub/splash
4.利用Docker运行splash
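docker run -p 8050:8050 scrapinghub/splash(运行之后可以在浏览器中访问 http://192.168.99.100:8050 ,检查 Splash 是否已正常启动。192.168.99.100 是 Docker Toolbox 虚拟机的默认 IP;如果使用的是 Docker Desktop 或 Linux,一般应改为 http://localhost:8050)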
5.settings.py配置
# Scrapy settings required by scrapy-splash.

# Address of the running Splash service.
# IMPORTANT (major pitfall noted by the author): this must be the address at
# which the Splash container is reachable -- here 192.168.99.100 -- and not
# the host machine's own IP; using the host's own IP never worked.
# NOTE(review): 192.168.99.100 is presumably the Docker Toolbox VM address;
# with Docker Desktop or Linux this is likely http://localhost:8050 -- confirm.
SPLASH_URL = 'http://192.168.99.100:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

# Some sites work with True, while others require this to be changed to False.
ROBOTSTXT_OBEY = True
爬虫的py文件1.py
# -*- coding: utf-8 -*-
"""Spider "HealthCare": collect the category links from the fuke120 home page.

Every category found is inserted into the MongoDB collection
``Health.Healthclass`` and an entry ``"<mongo id>,<category url>,<number>"``
is pushed onto the Redis list ``healthclassurl``.
"""
# Imports are kept from the original script; not all of them are used here.
import re
from urllib.request import urlopen
from scrapy.http import Request
from bs4 import BeautifulSoup
from lxml import etree
import pymongo
import scrapy
import redis

client = pymongo.MongoClient(host="127.0.0.1")
db = client.Health
collection = db.Healthclass

r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class healthcareClassSpider(scrapy.Spider):
    """Scrape the category anchors inside ``div#allsort`` of the start page."""

    name = "HealthCare"
    allowed_domains = ["fuke120.com"]  # domains the spider may visit
    start_urls = [
        "http://fuke.fuke120.com/",
    ]

    def parse(self, response):
        """Store every category link found on the downloaded page.

        Anchors under ``div.item`` are numbered 1, 2, 3, ... in page order;
        anchors under ``div.item born`` are all recorded with number 0.
        """
        # response.xpath() replaces HtmlXPathSelector(response).select(),
        # which is no longer available in current Scrapy releases.
        numbered = response.xpath(
            '//div[@id="allsort"]/div[@class="item"]/span/a')
        unnumbered = response.xpath(
            '//div[@id="allsort"]/div[@class="item born"]/span/a')

        for number, anchor in enumerate(numbered, start=1):
            self._store_category(response, anchor, number)
        for anchor in unnumbered:
            self._store_category(response, anchor, 0)

    def _store_category(self, response, anchor, number):
        """Insert one category into MongoDB and queue its URL in Redis."""
        href = anchor.xpath("@href").get()
        if not href:
            # An anchor without href used to raise IndexError on url[0].
            return
        # urljoin handles relative and absolute hrefs alike, unlike plain
        # string concatenation with the site root.
        url = response.urljoin(href)
        # Kept as the list of text nodes, which is what was stored before.
        name = anchor.xpath("text()").getall()
        self.logger.info("category %s -> %s", name, url)

        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported call and exposes the new document id as inserted_id.
        classid = collection.insert_one(
            {'healthclass': name, 'pid': None}).inserted_id
        healthurl = '%s,%s,%s' % (classid, url, number)
        r.lpush('healthclassurl', healthurl)
2.py
# -*- coding: utf-8 -*-
"""Spider "HealthCare1": collect the disease links of every queued category.

The pages to crawl are read from the Redis list ``healthclassurl`` (entries of
the form ``"<parent mongo id>,<url>,<number>"``). Every disease link found is
inserted into the MongoDB collection ``Health.Diseaseclass`` and an entry
``"<mongo id>,<disease url>,<parent id>"`` is pushed onto the Redis list
``diseaseclassurl``.
"""
# Imports are kept from the original script, in their original order; not all
# of them are used here. NOTE: ``Request`` is imported from both urllib and
# scrapy.http -- the last import (urllib's) is the one that stays bound.
import re
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
from lxml import etree
import pymongo
import scrapy
from bson.objectid import ObjectId
from scrapy.http import Request
from urllib.request import Request, ProxyHandler
from urllib.request import build_opener
import redis

client = pymongo.MongoClient(host="127.0.0.1")
db = client.Health
collection = db.Diseaseclass

# ``encoding`` replaces the deprecated ``charset`` keyword of redis.Redis.
# NOTE(review): the "HealthCare" spider pushes ``healthclassurl`` to the Redis
# at 127.0.0.1 -- presumably the same server as this host; confirm.
r = redis.Redis(host='192.168.60.112', port=6379, db=0, encoding='utf-8')


def _parse_queue_entry(raw):
    """Split a raw ``b"<id>,<url>,<extra>"`` Redis entry into its fields.

    The id is taken before the first comma and the extra field after the last
    one, so a URL that itself contains commas is kept intact.

    Raises:
        ValueError: if the entry has fewer than three comma-separated fields.
    """
    parent_id, rest = raw.decode('utf-8').split(',', 1)
    url, extra = rest.rsplit(',', 1)
    return parent_id, url, extra


class healthcareClassSpider(scrapy.Spider):
    """Scrape the disease anchors of each category page queued in Redis."""

    name = "HealthCare1"
    allowed_domains = ["fuke120.com"]  # domains the spider may visit

    def __init__(self, *args, **kwargs):
        """Load the start URLs and their parent ids from Redis.

        Arguments are forwarded to ``scrapy.Spider`` so the spider can still
        be created with spider arguments.
        """
        super().__init__(*args, **kwargs)
        # Per-instance state: the class-level list/dict used previously was
        # shared by, and grew with, every instance of the spider.
        self.start_urls = []
        self.url_info = {}
        for raw in r.lrange('healthclassurl', 0, -1):
            pid, url, num = _parse_queue_entry(raw)
            self.start_urls.append(url)
            self.url_info[url] = {"pid": pid, "num": num}

    def parse(self, response):
        """Store every disease link found on a category page."""
        info = self.url_info.get(response.url)
        if info is None:
            # The response URL can differ from the queued one (e.g. after a
            # redirect); without a parent id nothing can be stored.
            self.logger.warning(
                "no parent category recorded for %s", response.url)
            return
        pid = ObjectId(info['pid'])

        # response.xpath() replaces HtmlXPathSelector(response).select(),
        # which is no longer available in current Scrapy releases.
        anchors = response.xpath(
            '//div[@class="x_con02_2"]/div[@class="x_con02_3"]/ul/li/p/a')
        for anchor in anchors:
            href = anchor.xpath("@href").get()
            if not href:
                # An anchor without href used to raise IndexError on url[0].
                continue
            url = response.urljoin(href)
            # Kept as the list of text nodes, which is what was stored before.
            name = anchor.xpath("text()").getall()
            self.logger.info("disease %s -> %s", name, url)

            # Collection.insert() was removed in PyMongo 4.
            classid = collection.insert_one(
                {'Diseaseclass': name, 'pid': pid}).inserted_id
            diseaseclassurl = '%s,%s,%s' % (classid, url, pid)
            r.lpush('diseaseclassurl', diseaseclassurl)
3.py
# -*- coding: utf-8 -*-
"""Spider "HealthCare2": collect treatment links, rendering pages via Splash.

The pages to crawl are read from the Redis list ``diseaseclassurl`` (entries
of the form ``"<parent mongo id>,<url>,<extra>"``). Every link found is
inserted into the MongoDB collection ``Health.Treatclass`` and an entry
``"<mongo id>,<link url>,<parent id>"`` is pushed onto the Redis list
``treatclassurl``.
"""
# Imports are kept from the original script, in their original order; not all
# of them are used here.
import re
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
from lxml import etree
import pymongo
import scrapy
from scrapy_splash import SplashMiddleware
from scrapy.http import Request, HtmlResponse
from scrapy_splash import SplashRequest
from scrapy.selector import Selector
from bson.objectid import ObjectId
import redis

client = pymongo.MongoClient(host="127.0.0.1")
db = client.Health
collection = db.Treatclass

# The Redis connection was commented out although ``r`` is used below, which
# made the spider fail with NameError. It is restored with the host from the
# commented-out line; ``encoding`` replaces the deprecated ``charset`` keyword.
r = redis.Redis(host='192.168.60.112', port=6379, db=0, encoding='utf-8')


def _parse_queue_entry(raw):
    """Split a raw ``b"<id>,<url>,<extra>"`` Redis entry into its fields.

    The id is taken before the first comma and the extra field after the last
    one, so a URL that itself contains commas is kept intact.

    Raises:
        ValueError: if the entry has fewer than three comma-separated fields.
    """
    parent_id, rest = raw.decode('utf-8').split(',', 1)
    url, extra = rest.rsplit(',', 1)
    return parent_id, url, extra


class healthcareClassSpider(scrapy.Spider):
    """Scrape the ``div.dh01`` anchors of each page queued in Redis."""

    name = "HealthCare2"
    allowed_domains = ["fuke120.com"]  # domains the spider may visit

    def __init__(self, *args, **kwargs):
        """Load the start URLs and their parent ids from Redis.

        Arguments are forwarded to ``scrapy.Spider`` so the spider can still
        be created with spider arguments.
        """
        super().__init__(*args, **kwargs)
        # Per-instance state: the class-level list/dict used previously was
        # shared by, and grew with, every instance of the spider.
        self.start_urls = []
        self.url_info = {}
        for raw in r.lrange('diseaseclassurl', 0, -1):
            pid, url, num = _parse_queue_entry(raw)
            self.start_urls.append(url)
            self.url_info[url] = {"pid": pid, "num": num}

    def start_requests(self):
        """Request every start URL through Splash, waiting 0.5 s per page."""
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, args={'wait': 0.5})

    def parse(self, response):
        """Store every link found in the rendered page."""
        info = self.url_info.get(response.url)
        if info is None:
            # Without a recorded parent id nothing can be stored.
            self.logger.warning(
                "no parent entry recorded for %s", response.url)
            return
        pid = ObjectId(info['pid'])
        self.logger.info("num=%s pid=%s", info['num'], pid)

        # response.xpath() replaces HtmlXPathSelector(response).select(),
        # which is no longer available in current Scrapy releases.
        anchors = response.xpath(
            '//div[@class="dh01"]/ul[@class="ul_bg01"]/li/a')
        for anchor in anchors:
            href = anchor.xpath("@href").get()
            if not href:
                # An anchor without href used to raise IndexError on url[0].
                continue
            url = response.urljoin(href)
            # Kept as the list of text nodes, which is what was stored before.
            name = anchor.xpath("text()").getall()
            self.logger.info("treatment %s -> %s", name, url)

            # Collection.insert() was removed in PyMongo 4.
            classid = collection.insert_one(
                {'Treatclass': name, 'pid': pid}).inserted_id
            treatclassurl = '%s,%s,%s' % (classid, url, pid)
            r.lpush('treatclassurl', treatclassurl)
大功告成,主要还是为了使用scrapy-splash。