Python Scrapy (9): Douban Top 250

A Scrapy project that crawls the Douban Top 250 movie list and stores the results in MongoDB, using downloader middlewares for random User-Agents and proxies. The project files are listed below.
items.py
# -*- coding: utf-8 -*-
import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # movie title
    title = scrapy.Field()
    # info (the text of the bd block: director, year, ...)
    bd = scrapy.Field()
    # rating
    star = scrapy.Field()
    # one-line quote
    quote = scrapy.Field()
doubanmovie.py (in douban/spiders/)

# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem


class DoubamovieSpider(scrapy.Spider):
    name = "doubanmovie"
    allowed_domains = ["movie.douban.com"]
    offset = 0
    url = "https://movie.douban.com/top250?start="
    start_urls = (
        url + str(offset),
    )

    def parse(self, response):
        item = DoubanItem()
        movies = response.xpath("//div[@class='info']")

        for each in movies:
            # title
            item['title'] = each.xpath(".//span[@class='title'][1]/text()").extract()[0]
            # info
            item['bd'] = each.xpath(".//div[@class='bd']/p/text()").extract()[0]
            # rating
            item['star'] = each.xpath(".//div[@class='star']/span[@class='rating_num']/text()").extract()[0]
            # quote (some entries do not have one)
            quote = each.xpath(".//p[@class='quote']/span/text()").extract()
            if len(quote) != 0:
                item['quote'] = quote[0]
            yield item

        # 250 entries, 25 per page: keep following the next offset until 225
        if self.offset < 225:
            self.offset += 25
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
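The XPath expressions can be sanity-checked without hitting the site by running them against a hand-written HTML fragment with scrapy.Selector. The fragment below only mimics the class names the spider selects on; it is not Douban's real markup:

from scrapy import Selector

# minimal fake markup that reuses the class names from the XPaths above
html = """
<div class="info">
  <span class="title">肖申克的救赎</span>
  <div class="bd"><p>导演: Frank Darabont</p></div>
  <div class="star"><span class="rating_num">9.7</span></div>
  <p class="quote"><span>希望让人自由。</span></p>
</div>
"""

sel = Selector(text=html)
for each in sel.xpath("//div[@class='info']"):
    print(each.xpath(".//span[@class='title'][1]/text()").extract()[0])                        # 肖申克的救赎
    print(each.xpath(".//div[@class='star']/span[@class='rating_num']/text()").extract()[0])   # 9.7
    print(each.xpath(".//p[@class='quote']/span/text()").extract()[0])                         # 希望让人自由。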
pipelines.py

# -*- coding: utf-8 -*-
import pymongo
from scrapy.conf import settings


class DoubanPipeline(object):
    def __init__(self):
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        sheetname = settings["MONGODB_SHEETNAME"]
        # create the MongoDB client
        client = pymongo.MongoClient(host=host, port=port)
        # select the database
        mydb = client[dbname]
        # collection that stores the scraped data
        self.sheet = mydb[sheetname]

    def process_item(self, item, spider):
        data = dict(item)
        self.sheet.insert(data)
        return item
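Note that scrapy.conf was removed in later Scrapy releases and pymongo's Collection.insert() is deprecated. A minimal sketch of the same pipeline against current APIs, assuming the same MONGODB_* setting names defined in settings.py below, could look like this:

import pymongo


class DoubanPipeline(object):
    def __init__(self, host, port, dbname, sheetname):
        self.client = pymongo.MongoClient(host=host, port=port)
        self.sheet = self.client[dbname][sheetname]

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection parameters from the project settings
        return cls(
            host=crawler.settings.get("MONGODB_HOST"),
            port=crawler.settings.get("MONGODB_PORT"),
            dbname=crawler.settings.get("MONGODB_DBNAME"),
            sheetname=crawler.settings.get("MONGODB_SHEETNAME"),
        )

    def process_item(self, item, spider):
        # insert_one() replaces the deprecated insert()
        self.sheet.insert_one(dict(item))
        return item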
settings.py

# -*- coding: utf-8 -*-

BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"

# Obey robots.txt rules
#ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douban.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.RandomUserAgent': 100,
    'douban.middlewares.RandomProxy': 200,
}

USER_AGENTS = [
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13'
]

PROXIES = [
    {"ip_port": "121.42.140.113:16816", "user_passwd": "mr_mao_hacker:sffqry9r"},
    #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
    #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
    #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
]

ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}

# MongoDB host
MONGODB_HOST = "127.0.0.1"
# MongoDB port
MONGODB_PORT = 27017
# database name
MONGODB_DBNAME = "Douban"
# collection that stores the data
MONGODB_SHEETNAME = "doubanmovies"
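Before a long crawl it is worth confirming that the MongoDB instance configured above is actually reachable. A quick optional check (not part of the project, assumes pymongo 3.7+) could be:

import pymongo

# short timeout so an unreachable server fails fast
client = pymongo.MongoClient(host="127.0.0.1", port=27017, serverSelectionTimeoutMS=2000)
client.admin.command("ping")                      # raises if the server cannot be reached
print(client["Douban"].list_collection_names())   # "doubanmovies" appears after the first run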
Proxies for the PROXIES list can be purchased from Kuaidaili (快代理).
middlewares.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import random
import base64

from settings import USER_AGENTS
from settings import PROXIES


# pick a random User-Agent for every request
class RandomUserAgent(object):
    def process_request(self, request, spider):
        useragent = random.choice(USER_AGENTS)
        # print useragent
        request.headers.setdefault("User-Agent", useragent)


# pick a random proxy for every request
class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)

        if proxy['user_passwd'] is None:
            # proxy without authentication
            request.meta['proxy'] = "http://" + proxy['ip_port']
        else:
            # base64-encode the "user:password" credentials
            base64_userpasswd = base64.b64encode(proxy['user_passwd'])
            # attach them in the format the proxy server expects
            request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
            request.meta['proxy'] = "http://" + proxy['ip_port']
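The middleware above is written for Python 2. Under Python 3, base64.b64encode() takes and returns bytes, so the credentials need an explicit encode/decode; a minimal Python 3 variant of RandomProxy (same PROXIES structure, absolute import of the project settings) might look like:

import base64
import random

from douban.settings import PROXIES


class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        request.meta['proxy'] = "http://" + proxy['ip_port']
        if proxy.get('user_passwd'):
            # b64encode() needs bytes; decode back to str for the header value
            creds = base64.b64encode(proxy['user_passwd'].encode()).decode()
            request.headers['Proxy-Authorization'] = 'Basic ' + creds

With all the files in place, the crawl is started from the project root with scrapy crawl doubanmovie, and the scraped items end up in the doubanmovies collection of the Douban database.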