Crawling Douban Movies, Storing Them in MongoDB, and Evading Anti-Crawler Measures
1. The code is as follows:
doubanmovie.py
# -*- coding: utf-8 -*-
import scrapy

from douban.items import DoubanItem


class DoubamovieSpider(scrapy.Spider):
    name = "doubanmovie"
    allowed_domains = ["movie.douban.com"]
    offset = 0
    url = "https://movie.douban.com/top250?start="
    start_urls = (
        url + str(offset),
    )

    def parse(self, response):
        movies = response.xpath("//div[@class='info']")
        for each in movies:
            # Create a fresh item per movie so that fields from the previous
            # iteration (e.g. a missing quote) do not leak into the next one
            item = DoubanItem()
            # Title
            item['title'] = each.xpath(".//span[@class='title'][1]/text()").extract()[0]
            # Details (director, cast, year, genre)
            item['bd'] = each.xpath(".//div[@class='bd']/p/text()").extract()[0]
            # Rating
            item['star'] = each.xpath(".//div[@class='star']/span[@class='rating_num']/text()").extract()[0]
            # One-line quote (not every movie has one)
            quote = each.xpath(".//p[@class='quote']/span/text()").extract()
            if len(quote) != 0:
                item['quote'] = quote[0]
            yield item

        # Page through the rest of the Top 250, 25 movies at a time
        if self.offset < 225:
            self.offset += 25
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
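Before running the full crawl, you can sanity-check the XPath expressions by feeding a saved copy of the list page to a scrapy Selector. A minimal sketch, assuming the first page of https://movie.douban.com/top250 has been saved locally as top250.html (a hypothetical file name):

from scrapy import Selector

# Assumes the first Top 250 page was saved locally as top250.html beforehand
with open("top250.html", encoding="utf-8") as f:
    sel = Selector(text=f.read())

for info in sel.xpath("//div[@class='info']")[:3]:
    # Same expressions the spider uses, scoped to one movie block
    print(info.xpath(".//span[@class='title'][1]/text()").get(),
          info.xpath(".//div[@class='star']/span[@class='rating_num']/text()").get())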
items.py
import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # Title
    title = scrapy.Field()
    # Details
    bd = scrapy.Field()
    # Rating
    star = scrapy.Field()
    # Quote
    quote = scrapy.Field()
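A DoubanItem behaves like a dict, which is exactly what the pipeline in the next step relies on when it calls dict(item). A quick sketch (the field values here are made-up samples):

from douban.items import DoubanItem

item = DoubanItem()
item['title'] = u'肖申克的救赎'  # hypothetical sample value
item['star'] = '9.7'             # hypothetical sample value
print(dict(item))                # unset fields such as quote are simply absent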
2. Change the storage target in the pipeline file (pipelines.py)
import pymongo
from scrapy.utils.project import get_project_settings

settings = get_project_settings()


class DoubanPipeline(object):
    def __init__(self):
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        sheetname = settings["MONGODB_SHEETNAME"]
        # Create the MongoDB client connection
        client = pymongo.MongoClient(host=host, port=port)
        # Select the database
        mydb = client[dbname]
        # Collection that the scraped data will be written to
        self.sheet = mydb[sheetname]

    def process_item(self, item, spider):
        data = dict(item)
        # Write one document per item (insert_one supersedes pymongo's old insert)
        self.sheet.insert_one(data)
        return item
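Once the crawl has run, you can verify that the documents landed in MongoDB with a few lines of pymongo, using the same host, port, database, and collection names configured in settings.py in step 4:

import pymongo

client = pymongo.MongoClient("127.0.0.1", 27017)
collection = client["Douban"]["doubanmovies"]
print(collection.count_documents({}))  # should approach 250 after a full crawl
for doc in collection.find().limit(3):
    print(doc["title"], doc["star"])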
3. Create a new middleware file, middlewares.py, to evade anti-crawler measures
# -*- coding:utf-8 -*-
import random
import base64

from douban.settings import USER_AGENTS
from douban.settings import PROXIES


# Attach a random User-Agent to every request
class RandomUserAgent(object):
    def process_request(self, request, spider):
        useragent = random.choice(USER_AGENTS)
        # print(useragent)
        request.headers.setdefault("User-Agent", useragent)


# Route every request through a randomly chosen proxy
class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)

        if not proxy['user_passwd']:
            # Proxy without account authentication
            request.meta['proxy'] = "http://" + proxy['ip_port']
        else:
            # Base64-encode the credentials (b64encode takes bytes, so
            # encode the string first and decode the result back to str)
            base64_userpasswd = base64.b64encode(proxy['user_passwd'].encode()).decode()
            # Attach them in the format the proxy server expects
            request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
            request.meta['proxy'] = "http://" + proxy['ip_port']
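To see exactly what RandomProxy puts into the Proxy-Authorization header, you can reproduce the encoding step on its own, using the credentials from the PROXIES setting below:

import base64

creds = "mr_mao_hacker:sffqry9r"
header = "Basic " + base64.b64encode(creds.encode()).decode()
print(header)  # Basic bXJfbWFvX2hhY2tlcjpzZmZxcnk5cg==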
4. Configure settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"

# Obey robots.txt rules
#ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douban.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.RandomUserAgent': 100,
    'douban.middlewares.RandomProxy': 200,
}

USER_AGENTS = [
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13'
]

PROXIES = [
    {"ip_port": "121.42.140.113:16816", "user_passwd": "mr_mao_hacker:sffqry9r"},
    #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
    #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
    #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
]

#LOG_FILE = "douban.log"
#LOG_LEVEL = "DEBUG"

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}

# MongoDB host
MONGODB_HOST = "127.0.0.1"

# MongoDB port
MONGODB_PORT = 27017

# Database name
MONGODB_DBNAME = "Douban"

# Collection that stores the scraped data
MONGODB_SHEETNAME = "doubanmovies"

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
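With everything wired up, the crawl is started with scrapy crawl doubanmovie from the project root. A rough equivalent as a standalone script might look like the sketch below; the module path douban.spiders.doubanmovie assumes the spider file name used in step 1:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from douban.spiders.doubanmovie import DoubamovieSpider

# Load settings.py (middlewares, pipeline, MongoDB config) and run the spider
process = CrawlerProcess(get_project_settings())
process.crawl(DoubamovieSpider)
process.start()  # blocks until the crawl finishes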