Crawler projects

Xinpianchang (新片场)

BOT_NAME = 'xpc'

SPIDER_MODULES = ['xpc.spiders']
NEWSPIDER_MODULE = 'xpc.spiders'

SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}
REDIS_URL = 'redis://127.0.0.1:6379'
SCHEDULER_PERSIST = True

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
DOWNLOAD_TIMEOUT = 10

MYSQL_CONFIG = dict(
    host='127.0.0.1',
    port=3306,
    user='root',
    password='',
    db='xpc_hz1804',
    charset='utf8mb4',
)

PROXIES = [
       'http://47.95.193.101:1801',
       'http://47.96.123.201:1801',
       'http://47.96.64.101:1804',
       'http://39.106.27.237:1801',
       'http://118.24.49.46:1801',
       'http://47.100.58.124:1801',
       'http://101.132.185.43:1801',
       'http://39.105.49.81:1801',
       'http://47.99.115.152:1801',
       'http://47.99.131.3:1801',
       'http://39.105.22.186:1801',
       'http://39.106.42.87:1801',
       'http://116.62.143.107:1801',
       'http://120.79.221.245:1804',
       'http://47.106.188.175:1804',
       'http://47.99.54.144:1801',
       'http://101.200.55.163:1801',
       'http://119.27.183.40:1801',
       'http://118.24.64.185:1801',
       'http://106.14.214.226:1801',
       'http://203.195.164.241:1801',
       'http://47.95.235.90:1801',
       'http://47.94.162.16:1801',
       'http://118.25.226.251:1801',
       'http://132.232.187.251:1801',
       'http://47.94.224.67:1801',
       'http://118.24.145.88:1801',
       'http://122.114.180.120:1801',
       'http://39.106.109.148:1801',
       'http://118.25.227.120:1801',
       'http://60.205.179.182:1804',
       'http://39.104.230.114:1806',

        # failed proxy
    'http://119.27.187.59:1804',
     'http://39.106.221.204:1801',
     'http://111.231.73.145:1801',
     'http://111.231.117.197:1801',
     'http://60.205.176.40:1888',
     'http://193.112.68.34:1801',
     'http://39.106.220.99:1801'
]

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 8
COOKIES_DEBUG = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    'Accept-Encoding': "gzip, deflate",
    'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8",
    'Cache-Control': "no-cache",
    'Connection': "keep-alive",
    'DNT': "1",
    'Host': "www.xinpianchang.com",
    'Upgrade-Insecure-Requests': "1",
    'Postman-Token': "64ae163e-794d-e3bc-84dc-77d6f5d6cd50"
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'xpc.middlewares.XpcSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'xpc.middlewares.RandomProxyMiddleware': 749,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Note: this second assignment overrides the scrapy_redis RedisPipeline dict
# defined near the top of this file; merge the two entries (with distinct
# priorities) if both pipelines should run.
ITEM_PIPELINES = {
    'xpc.pipelines.MysqlPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


import os
import sys
if os.path.exists('/data/settings_prd.py'):
    sys.path.append('/data')
    from settings_prd import *

settings.py
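
Before moving on to the pipeline code, a note on usage: with the scrapy_redis scheduler configured above, the discovery spider defined later in this post sits idle until a start URL is pushed into Redis. A minimal sketch (not part of the original project) of seeding that queue with redis-py, assuming scrapy_redis's default key pattern of '<spider name>:start_urls':

import redis

# Connection string matches REDIS_URL in the settings above
r = redis.StrictRedis.from_url('redis://127.0.0.1:6379')
r.lpush(
    'discovery:start_urls',
    'http://www.xinpianchang.com/channel/index/'
    'type-0/sort-like/duration_type-0/resolution_type-/page-21',
)
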
import pymysql
from xpc import settings


class MysqlPipeline(object):

    def open_spider(self, spider):
        print('spider %s opened' % spider.name)
        self.conn = pymysql.connect(**settings.MYSQL_CONFIG)
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        cols, values = zip(*item.items())
        sql = "INSERT INTO `{}` ({}) VALUES ({}) " \
              "ON DUPLICATE KEY UPDATE {}".format(
            item.table,
            ','.join(['`%s`' % k for k in cols]),
            ','.join(['%s'] * len(values)),
            ','.join(['`{}`=%s'.format(k) for k in cols])
            )
        self.cur.execute(sql, values * 2)
        # print(self.cur._last_executed)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()
        print('spider %s closed' % spider.name)

pipelines.py
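
As an aside on the pipeline above: for an item with table='posts' that carries only pid and title, process_item() builds roughly the statement below and passes the value tuple twice to execute(), once for the INSERT placeholders and once for the UPDATE clause. The upsert only behaves as intended if the target table defines a PRIMARY or UNIQUE key (e.g. on pid), which is an assumption about the schema rather than something shown in this post.

# Illustration of the generated SQL (not extra code to run):
#
#   INSERT INTO `posts` (`pid`,`title`) VALUES (%s,%s)
#   ON DUPLICATE KEY UPDATE `pid`=%s,`title`=%s
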
import random
from scrapy.exceptions import NotConfigured


class RandomProxyMiddleware(object):

    def __init__(self, settings):
        # 2. Initialize the middleware
        if not settings.getlist('PROXIES'):
            raise NotConfigured
        self.proxies = settings.getlist('PROXIES')
        # Initialize failure counters: every proxy starts at 0 failures
        self.stats = {}.fromkeys(self.proxies, 0)
        self.max_failed = 3

    @classmethod
    def from_crawler(cls, crawler):
        # 1. Create the middleware instance
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings)

    def process_request(self, request, spider):
        # 3. Attach a random proxy to the request
        if 'proxy' not in request.meta:
            request.meta['proxy'] = random.choice(self.proxies)
            print('use proxy: %s ' % request.meta['proxy'])

    def process_response(self, request, response, spider):
        # 4. Called for every response that comes back
        cur_proxy = request.meta['proxy']
        if response.status != 200:
            print('none 200 status code: %s when use %s' %
                  (response.status, cur_proxy))
            # On an error status code, bump the current proxy's failure count
            if response.status >= 400:
                self.stats[cur_proxy] += 1

            # Once a proxy has accumulated too many failures, drop it from the pool
            if self.stats[cur_proxy] > self.max_failed:
                self.remove_proxy(cur_proxy)
                del request.meta['proxy']
                return request
        return response

    def process_exception(self, request, exception, spider):
        # 4. Called when the download through the proxy raises an exception
        cur_proxy = request.meta.get('proxy')
        print('raise exception: %s when use %s' % (exception, cur_proxy))
        if cur_proxy:
            # Drop the failing proxy from the pool and retry the request
            self.remove_proxy(cur_proxy)
            del request.meta['proxy']
        request.dont_filter = True
        return request

    def remove_proxy(self, proxy):
        if proxy in self.proxies:
            self.proxies.remove(proxy)
            print('proxy %s removed from proxies list' % proxy)

middlewares.py
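
To actually activate RandomProxyMiddleware, the downloader-middleware entry that is commented out in settings.py needs to be enabled. A minimal sketch of the two settings involved: the priority 749 mirrors the commented-out block above, and HTTPPROXY_ENABLED (true by default in Scrapy) is the flag checked in from_crawler():

DOWNLOADER_MIDDLEWARES = {
    'xpc.middlewares.RandomProxyMiddleware': 749,
}
HTTPPROXY_ENABLED = True
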
import scrapy
from scrapy import Field


class PostItem(scrapy.Item):
    table = 'posts'
    pid = Field()
    thumbnail = Field()
    title = Field()
    category = Field()
    created_at = Field()
    like_counts = Field()
    play_counts = Field()
    description = Field()
    video = Field()
    preview = Field()
    duration = Field()


class CommentItem(scrapy.Item):
    table = 'comments'
    commentid = Field()
    pid = Field()
    content = Field()
    created_at = Field()
    like_counts = Field()
    cid = Field()
    avatar = Field()
    uname = Field()
    reply = Field()


class ComposerItem(scrapy.Item):
    table = 'composers'
    cid = Field()
    banner = Field()
    avatar = Field()
    name = Field()
    intro = Field()
    like_counts = Field()
    fans_counts = Field()
    follow_counts = Field()
    location = Field()
    career = Field()


class CopyrightItem(scrapy.Item):
    table = 'copyrights'
    pcid = Field()
    cid = Field()
    pid = Field()
    roles = Field()

items.py
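
A quick illustrative example (values are made up) of how these items tie into MysqlPipeline: the class-level table attribute selects the target table, and the declared Fields become the column names of the generated upsert.

post = PostItem(pid='123456', thumbnail='http://example.com/cover.jpg')  # hypothetical values
post['title'] = 'demo title'
# MysqlPipeline.process_item() would upsert this into the `posts` table,
# assuming the MySQL schema has matching columns and a unique key on pid.
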
import re
import json
import scrapy
import scrapy_redis
from scrapy_redis.spiders import RedisSpider
from scrapy import Request
from scrapy.shell import inspect_response
from xpc.items import PostItem, CommentItem, ComposerItem, CopyrightItem


def strip(s):
    if s:
        return s.strip().replace(' ', '')
    return ''


def convert_int(s):
    if not s:
        return 0
    return int(s.replace(',', ''))
ci = convert_int


class DiscoverySpider(RedisSpider):
    name = 'discovery'
    allowed_domains = ['xinpianchang.com', 'vmovier.com']
    start_urls = [
        # 'http://www.xinpianchang.com/channel/index/type-0/sort-like/duration_type-0/resolution_type-/page-21'
    ]
    cookies = {
        "Authorization": "D1FF2FD49B5914AF89B59148A39B591819E9B591B1EB12155BAA",
        # "channel_page": "apY%3D",
    }

    def start_requests(self):
        first_cookies = dict(self.cookies)
        first_cookies['channel_page'] = "b5Y%3D"
        for url in self.start_urls:
            request = Request(url, dont_filter=True, callback=self.parse, cookies=first_cookies)
            request.meta['first_request'] = 1
            yield request

    def make_requests_from_url(self, url):
        first_cookies = dict(self.cookies)
        first_cookies['channel_page'] = "b5Y%3D"
        request = Request(url, dont_filter=True, callback=self.parse, cookies=first_cookies)
        return request
        #channel_page=aZec #21
        # b5Y%3D # 72
        #13994324984
        #18041804
        # Authorization = D1FF2FD49B5914AF89B59148A39B591819E9B591B1EB12155BAA;
    def parse(self, response):

        url = 'http://www.xinpianchang.com/a%s?from=ArticleList'
        # First grab the li nodes
        post_list = response.xpath('//ul[@class="video-list"]/li')
        # Iterate over the li nodes, pulling pid and thumbnail in the same pass
        # so the two values stay paired and the code stays short
        for post in post_list:
            pid = post.xpath('./@data-articleid').extract_first()
            # The img src attribute is lazy-loaded, so read the _src attribute instead
            thumbnail = post.xpath('./a/img/@_src').get()
            request = Request(url % pid, callback=self.parse_post)
            request.meta['pid'] = pid
            request.meta['thumbnail'] = thumbnail
            yield request
        # other_pages = response.xpath('//div[@class="page"]/a/@href').extract()
        other_pages = response.xpath('//a[@title="下一页"]/@href').extract()
        if not other_pages:
            # inspect_response(response, self)
            tip = response.xpath('//div[@class="notice-tip"]/p/text()').get()
            if '系统繁忙,请稍候再试' in tip:
                print('---'*50, '遇到系统繁忙了')
                response.request.dont_filter = True
                response.request.cookies['PHPSESSID'] = '123'
                # request = Request(
                #     response.url,
                #     dont_filter=True,
                #     cookies=self.cookies)
                yield response.request
        for page in other_pages:
            request = Request(page, callback=self.parse, cookies=self.cookies)
            yield request

    def parse_post(self, response):
        # Strip all \t characters from the page body in one pass
        # (response.replace() returns a new Response, so re-bind the name)
        response = response.replace(body=response.text.replace('\t', '').encode('utf-8'))
        pid = response.meta['pid']
        post = {
            'pid': pid,
            'thumbnail': response.meta['thumbnail'],
        }
        post = PostItem(post)
        post['title'] = response.xpath(
            '//div[@class="title-wrap"]/h3/text()').extract_first()
        cates = response.xpath(
            '//span[contains(@class, "cate")]/a/text()').extract()
        post['category'] = '-'.join([strip(cate) for cate in cates])
        post['created_at'] = response.xpath(
            '//span[contains(@class, "update-time")]/i/text()').get()
        post['play_counts'] = response.xpath(
            '//i[contains(@class, "play-counts")]/@data-curplaycounts').get()
        post['like_counts'] = response.xpath(
            '//span[contains(@class, "like-counts")]/@data-counts').get()
        post['description'] = response.xpath('//p[contains(@class, "desc")]/text()').get()
        url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource,resource_origin?'
        # Use a regex to pull a key variable out of the page's inline JS.
        # It matches strings of the form:
        # vid: "5BA5015D7FB4E",
        vid = re.findall(r'vid:\s"(\w+)",', response.text)
        if vid:
            # Fetch the video details from the API
            url = url % vid[0]
            request = Request(url, callback=self.parse_video)
            request.meta['post'] = post
            yield request

        # Request the comment API; note the ajax parameter
        comment_api = 'http://www.xinpianchang.com/article/filmplay/ts-getCommentApi?id=%s&ajax=0&page=1'
        request = Request(comment_api % pid, callback=self.parse_comment)
        yield request

        # Request the creator (user) pages
        user_url = 'http://www.xinpianchang.com/u%s?from=articleList'
        composer_list = response.xpath('//div[@class="user-team"]//ul[@class="creator-list"]/li')
        for composer in composer_list:
            cid = composer.xpath('./a/@data-userid').get()
            request = Request(user_url % cid, callback=self.parse_composer)
            request.meta['cid'] = cid
            yield request
            # Mapping between creator and post
            cr = CopyrightItem()
            cr['pcid'] = '%s_%s' % (cid, pid)
            cr['cid'] = cid
            cr['pid'] = pid
            cr['roles'] = composer.xpath('.//span[contains(@class, "roles")]/text()').get()
            yield cr

    def parse_video(self, response):
        # The API returns JSON, so no HTML parsing is needed; just json.loads it
        post = response.meta['post']
        result = json.loads(response.text)
        post['video'] = result['data']['resource']['default']['url']
        post['preview'] = result['data']['video']['cover']
        post['duration'] = result['data']['video']['duration']
        yield post

    def parse_comment(self, response):
        # The API returns JSON, so no HTML parsing is needed; just json.loads it
        result = json.loads(response.text)
        comments = result['data']['list']
        for comment in comments:
            c = CommentItem()
            c['commentid'] = comment['commentid']
            c['pid'] = comment['articleid']
            c['content'] = comment['content']
            c['created_at'] = comment['addtime_int']
            c['like_counts'] = comment['count_approve']
            c['cid'] = comment['userInfo']['userid']
            c['avatar'] = comment['userInfo']['face']
            c['uname'] = comment['userInfo']['username']
            # reply holds the comment this one replies to; it is empty when it is not a reply
            if comment['reply']:
                # Only the commentid needs to be stored
                c['reply'] = comment['reply']['commentid']
            yield c
        next_page = result['data']['next_page_url']
        if next_page:
            yield Request(next_page, callback=self.parse_comment)

    def parse_composer(self, response):
        composer = ComposerItem()
        composer['cid'] = response.meta['cid']
        banner = response.xpath('//div[@class="banner-wrap"]/@style').get()
        # Slice out the value inside background-image:url(...)
        composer['banner'] = banner[21:-1]
        composer['avatar'] = response.xpath(
            '//span[@class="avator-wrap-s"]/img/@src').get()
        composer['name'] = response.xpath(
            '//p[contains(@class, "creator-name")]/text()').get()
        composer['intro'] = response.xpath(
            '//p[contains(@class, "creator-desc")]/text()').get()
        composer['like_counts'] = ci(response.xpath(
            '//span[contains(@class, "like-counts")]/text()').get())
        composer['fans_counts'] = ci(response.xpath(
            '//span[contains(@class, "fans-counts")]/@data-counts').get())
        composer['follow_counts'] = ci(response.xpath(
            '//span[@class="follow-wrap"]/span[2]/text()').get())
        # Take the text of the span right after the icon-location span
        composer['location'] = strip(response.xpath(
            '//span[contains(@class, "icon-location")]/'
            'following-sibling::span[1]/text()').get())
        composer['career'] = response.xpath(
            '//span[contains(@class, "icon-career")]/'
            'following-sibling::span[1]/text()').get()
        yield composer

spider (distributed version, scrapy_redis)
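
A quick standalone check of the vid regex used in parse_post(); the sample string here is made up, but shaped like the inline JS fragment the comment above describes:

import re

sample = 'var player = { vid: "5BA5015D7FB4E", filmId: 0 };'  # fabricated sample
print(re.findall(r'vid:\s"(\w+)",', sample))  # ['5BA5015D7FB4E']
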
# -*- coding: utf-8 -*-
import re
import json
import scrapy
from scrapy import Request
from scrapy.shell import inspect_response
from xpc.items import PostItem, CommentItem, ComposerItem, CopyrightItem


def strip(s):
    if s:
        return s.strip().replace(' ', '')
    return ''


def convert_int(s):
    if not s:
        return 0
    return int(s.replace(',', ''))
ci = convert_int


class DiscoverySpider(scrapy.Spider):
    name = 'discovery'
    allowed_domains = ['xinpianchang.com', 'vmovier.com']
    start_urls = [
        'http://www.xinpianchang.com/channel/index/'
        'type-0/sort-like/duration_type-0/resolution_type-/page-21']
    cookies = {
        "Authorization": "D1FF2FD49B5914AF89B59148A39B591819E9B591B1EB12155BAA",
        # "channel_page": "apY%3D",
    }

    def start_requests(self):
        first_cookies = dict(self.cookies)
        first_cookies['channel_page'] = "b5Y%3D"
        for url in self.start_urls:
            request = Request(url, dont_filter=True, callback=self.parse, cookies=first_cookies)
            request.meta['first_request'] = 1
            yield request

#channel_page=aZec #21
            # b5Y%3D # 72
#13994324984
#18041804
# Authorization = D1FF2FD49B5914AF89B59148A39B591819E9B591B1EB12155BAA;
    def parse(self, response):

        url = 'http://www.xinpianchang.com/a%s?from=ArticleList'
        # First grab the li nodes
        post_list = response.xpath('//ul[@class="video-list"]/li')
        # Iterate over the li nodes, pulling pid and thumbnail in the same pass
        # so the two values stay paired and the code stays short
        for post in post_list:
            pid = post.xpath('./@data-articleid').extract_first()
            # The img src attribute is lazy-loaded, so read the _src attribute instead
            thumbnail = post.xpath('./a/img/@_src').get()
            request = Request(url % pid, callback=self.parse_post)
            request.meta['pid'] = pid
            request.meta['thumbnail'] = thumbnail
            yield request
        # other_pages = response.xpath('//div[@class="page"]/a/@href').extract()
        other_pages = response.xpath('//a[@title="下一页"]/@href').extract()
        if not other_pages:
            # inspect_response(response, self)
            tip = response.xpath('//div[@class="notice-tip"]/p/text()').get()
            print('==='*50, tip)
            if '系统繁忙,请稍候再试' in tip:
                print('---'*50, '遇到系统繁忙了')
                response.request.dont_filter = True
                response.request.cookies['PHPSESSID'] = '123'
                # request = Request(
                #     response.url,
                #     dont_filter=True,
                #     cookies=self.cookies)
                yield response.request
        for page in other_pages:
            request = Request(page, callback=self.parse, cookies=self.cookies)
            yield request

    def parse_post(self, response):
        # Strip all \t characters from the page body in one pass
        # (response.replace() returns a new Response, so re-bind the name)
        response = response.replace(body=response.text.replace('\t', '').encode('utf-8'))
        pid = response.meta['pid']
        post = {
            'pid': pid,
            'thumbnail': response.meta['thumbnail'],
        }
        post = PostItem(post)
        post['title'] = response.xpath(
            '//div[@class="title-wrap"]/h3/text()').extract_first()
        cates = response.xpath(
            '//span[contains(@class, "cate")]/a/text()').extract()
        post['category'] = '-'.join([strip(cate) for cate in cates])
        post['created_at'] = response.xpath(
            '//span[contains(@class, "update-time")]/i/text()').get()
        post['play_counts'] = response.xpath(
            '//i[contains(@class, "play-counts")]/@data-curplaycounts').get()
        post['like_counts'] = response.xpath(
            '//span[contains(@class, "like-counts")]/@data-counts').get()
        post['description'] = response.xpath('//p[contains(@class, "desc")]/text()').get()
        url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource,resource_origin?'
        # Use a regex to pull a key variable out of the page's inline JS.
        # It matches strings of the form:
        # vid: "5BA5015D7FB4E",
        vid = re.findall(r'vid:\s"(\w+)",', response.text)
        if vid:
            # Fetch the video details from the API
            url = url % vid[0]
            request = Request(url, callback=self.parse_video)
            request.meta['post'] = post
            yield request

        # Request the comment API; note the ajax parameter
        comment_api = 'http://www.xinpianchang.com/article/filmplay/ts-getCommentApi?id=%s&ajax=0&page=1'
        request = Request(comment_api % pid, callback=self.parse_comment)
        yield request

        # Request the creator (user) pages
        user_url = 'http://www.xinpianchang.com/u%s?from=articleList'
        composer_list = response.xpath('//div[@class="user-team"]//ul[@class="creator-list"]/li')
        for composer in composer_list:
            cid = composer.xpath('./a/@data-userid').get()
            request = Request(user_url % cid, callback=self.parse_composer)
            request.meta['cid'] = cid
            yield request
            # Mapping between creator and post
            cr = CopyrightItem()
            cr['pcid'] = '%s_%s' % (cid, pid)
            cr['cid'] = cid
            cr['pid'] = pid
            cr['roles'] = composer.xpath('.//span[contains(@class, "roles")]/text()').get()
            yield cr

    def parse_video(self, response):
        # The API returns JSON, so no HTML parsing is needed; just json.loads it
        post = response.meta['post']
        result = json.loads(response.text)
        post['video'] = result['data']['resource']['default']['url']
        post['preview'] = result['data']['video']['cover']
        post['duration'] = result['data']['video']['duration']
        yield post

    def parse_comment(self, response):
        # The API returns JSON, so no HTML parsing is needed; just json.loads it
        result = json.loads(response.text)
        comments = result['data']['list']
        for comment in comments:
            c = CommentItem()
            c['commentid'] = comment['commentid']
            c['pid'] = comment['articleid']
            c['content'] = comment['content']
            c['created_at'] = comment['addtime_int']
            c['like_counts'] = comment['count_approve']
            c['cid'] = comment['userInfo']['userid']
            c['avatar'] = comment['userInfo']['face']
            c['uname'] = comment['userInfo']['username']
            # reply holds the comment this one replies to; it is empty when it is not a reply
            if comment['reply']:
                # Only the commentid needs to be stored
                c['reply'] = comment['reply']['commentid']
            yield c
        next_page = result['data']['next_page_url']
        if next_page:
            yield Request(next_page, callback=self.parse_comment)

    def parse_composer(self, response):
        composer = ComposerItem()
        composer['cid'] = response.meta['cid']
        banner = response.xpath('//div[@class="banner-wrap"]/@style').get()
        # Slice out the value inside background-image:url(...)
        composer['banner'] = banner[21:-1]
        composer['avatar'] = response.xpath(
            '//span[@class="avator-wrap-s"]/img/@src').get()
        composer['name'] = response.xpath(
            '//p[contains(@class, "creator-name")]/text()').get()
        composer['intro'] = response.xpath(
            '//p[contains(@class, "creator-desc")]/text()').get()
        composer['like_counts'] = ci(response.xpath(
            '//span[contains(@class, "like-counts")]/text()').get())
        composer['fans_counts'] = ci(response.xpath(
            '//span[contains(@class, "fans-counts")]/@data-counts').get())
        composer['follow_counts'] = ci(response.xpath(
            '//span[@class="follow-wrap"]/span[2]/text()').get())
        # Take the text of the span right after the icon-location span
        composer['location'] = strip(response.xpath(
            '//span[contains(@class, "icon-location")]/'
            'following-sibling::span[1]/text()').get())
        composer['career'] = response.xpath(
            '//span[contains(@class, "icon-career")]/'
            'following-sibling::span[1]/text()').get()
        yield composer

spider (regular, single-machine version)
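
Both versions are launched like any other Scrapy spider. A minimal sketch of starting the regular one from Python (the usual "scrapy crawl discovery" command works as well); it assumes the script runs from the project root so that get_project_settings() can locate the xpc settings module:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('discovery')
process.start()  # blocks until the crawl finishes
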

 Taobao slider captcha (淘宝滑块)

https://blog.csdn.net/u012067766/article/details/79793264

https://www.cnblogs.com/garvicker/p/8947121.html

 

Taobao product-list crawler with gevent

# coding: utf-8
from gevent import monkey
monkey.patch_all()
from gevent.pool import Pool
import gevent
from queue import Queue

import random
import re
import sys
import time
import datetime
import json
import copy
import hashlib
import requests
from urllib.parse import quote

numbers=0
is_in = []
goods_all_q = Queue()
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN',
    'referer': 'https://s.m.taobao.com/'
}

# Pick a random User-Agent
def set_ua():
    # ua=['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    #     'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    #     'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16']
    ua = ['Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0',
          'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
          'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0',
          'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
          'Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36',
          'Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0',
          'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4',
          'Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0',
          'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1',
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0',
          'Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01',
          'Opera/9.80 (Windows NT 6.1; U; cs) Presto/2.7.62 Version/11.01',
          'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0',
          'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36',
          'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
          'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101  Firefox/28.0',
          'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0)  Gecko/20100101 Firefox/18.0',
          'Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36',
          'Opera/9.80 (X11; Linux x86_64; U; bg) Presto/2.8.131 Version/11.10',
          'Opera/9.80 (Windows NT 6.1 x64; U; en) Presto/2.7.62 Version/11.00',
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36',
          'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
          'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
          'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36',
          'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10',
          'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6',
          'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
          'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36',
          'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/4.0; GTB7.4; InfoPath.3; SV1; .NET CLR 3.1.76908; WOW64; en-US)',
          'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36',
          'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
          'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4',
          'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0',
          'Mozilla/5.0 (Windows NT 6.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01',
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0',
          'Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0',
          'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)',
          'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-TW) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5',
          'Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))',
          'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
          'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36',
          'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17',
          'Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0',
          'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)',
          'Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0',
          'Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5',
          'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0',
          'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36',
          'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0',
          'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36',
          'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36',
          'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0',
          'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)',
          'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts)',
          'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0',
          'Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.8.131 Version/11.11',
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
          'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
          'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
          'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; Media Center PC 4.0; SLCC1; .NET CLR 3.0.04320)',
          'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
          'Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02',
          'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
          'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0',
          'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36',
          'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0',
          'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
          'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
          'Mozilla/5.0 (Windows; U; Windows NT 6.0; tr-TR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5',
          'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36',
          'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.8.36217; WOW64; en-US)',
          'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
          'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; es-es) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
          'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
          'Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4',
          'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205',
          'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101213 Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01',
          'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36',
          'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
          'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; zh-cn) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5',
          'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0',
          'Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0',
          'Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00',
          'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14',
          'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
          'Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0',
          'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.0; Trident/4.0; FBSMTWB; .NET CLR 2.0.34861; .NET CLR 3.0.3746.3218; .NET CLR 3.5.33652; msn OptimizedIE8;ENUS)',
          'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4',
          'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
          'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
          'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
          'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
          'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)',
          'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0',
          'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0',
          'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F',
          'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.1; SV1; .NET CLR 2.8.52393; WOW64; en-US)',
          'Opera/9.80 (Windows NT 6.1; U; en-US) Presto/2.7.62 Version/11.01',
          'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36',
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
          'Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00',
          'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36',
          'Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
          'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15',
          'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
          'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; da-dk) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1',
          'Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-HK) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5',
          'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
          'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; .NET CLR 2.7.58687; SLCC2; Media Center PC 5.0; Zune 3.4; Tablet PC 3.6; InfoPath.3)',
          'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; SLCC1; .NET CLR 1.1.4322)',
          'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0',
          'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36',
          'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0',
          'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36',
          'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
          'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0',
          'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 3.0.04506.30)',
          'Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3',
          'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
          'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0',
          'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-fr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
          'Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01',
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36',
          'Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14',
          'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0',
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17',
          'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36',
          'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
          'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36',
          'Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0',
          'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0',
          'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ko-kr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
          'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
          'Opera/9.80 (Windows NT 6.1; U; ko) Presto/2.7.62 Version/11.00',
          'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; en-us) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4',
          'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
          'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0',
          'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727)',
          'Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4',
          'Mozilla/5.0 (Android 2.2; Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4',
          'Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.7.62 Version/11.00',
          'Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5',
          'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0',
          'Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.7.62 Version/11.01',
          'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0',
          'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
          'Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36',
          'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0',
          'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
          'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)']
    return random.choice(ua)


# Fetch cookies (the _m_h5_tk token in particular) from the mtop home API
def get_cookies():
    url = 'https://h5api.m.taobao.com/h5/mtop.taobao.wireless.home.load/1.0/?jsv=2.5.0&appKey=12574478&t=1545354577214&sign=2e7e0d72845d57e773df67565fb5e949&api=mtop.taobao.wireless.home.load&v=1.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22containerId%22%3A%22main%22%2C%22ext%22%3A%22%7B%5C%22h5_platform%5C%22%3A%5C%22h5%5C%22%2C%5C%22h5_ttid%5C%22%3A%5C%2260000%40taobao_h5_1.0.0%5C%22%7D%22%7D'
    s = requests.Session()
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'referer': 'https://h5.m.taobao.com/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    s.get(
        url=url,
        headers=headers
    )
    cookies = s.cookies.get_dict()
    if cookies:
        return cookies
    else:
        print('未获取到cookie')
        raise NameError



# Get the maximum page count for each city
def get_page(cookies, params, city, headers, retry=0, keywords='', ):
    t1 = time.time()
    headers['User-Agent'] = set_ua()
    if retry >= 4:
        # print('获取最大城市页码重试3次后仍错误,任务失败')
        raise NameError
    stop = 1
    try:
        t2 = time.time()
        # #print('设置请求头时长',t2-t1)
        response = requests.get(
            url='https://acs.m.taobao.com/h5/mtop.taobao.wsearch.h5search/1.0/',
            headers=headers,
            cookies=cookies,
            params=params,
            timeout=5
        )
        # t3=time.time()
        # print('*' * 100)
        # print(response.text)
        # print('*' * 100)
        if str(response.status_code).startswith('2'):
            if 'callback' in response.text and response.text:
                res_text = re.findall('callback\((.*)\)', response.text)[0]
                res_json = json.loads(res_text)
                pages_num = int(res_json.get('data', {}).get('totalPage', 0))
                if pages_num:
                    print({city: pages_num})
                    return {city: pages_num}
            else:
                # print('认证错误,重新请求',response.text)
                retry += 2
                res = get_page(cookies, params, city, headers, retry)
                return res
        else:
            # print('请求失败,状态码', response.status_code)
            retry += 1
            # #print('城市页码的页面失败,重新请求', city, retry)
            res = get_page(cookies, params, city, headers, retry)
            return res

    except requests.exceptions.ConnectTimeout:
        retry += 1
        # print('获取城市最大页码超时,重新请求', retry)
        res = get_page(cookies, params, city, headers, retry)
        return res
    except requests.exceptions.ConnectionError:
        # print('请求城市最大页码网络异常,3秒后重新请求...',  '?' * 20)
        # print('第{}次重试'.format(retry))
        retry += 1
        time.sleep(3)
        res = get_page(cookies, params, city, headers, retry)
        return res
    except Exception as e:
        print('未知异常,跳过当前页面', e)
        if stop == 0:
            raise ValueError

# Parse the product URLs for each city
def parse_citys(cookies, params, city, headers, retry=1):
    if retry == 4:
        # print('解析列表重试多次仍错误,任务失败')
        raise NameError
    try:
        response = requests.get(
            url='https://acs.m.taobao.com/h5/mtop.taobao.wsearch.h5search/1.0/',
            headers=headers,
            cookies=cookies,
            params=params,
            timeout=5
        )
        if str(response.status_code).startswith('2'):
            if 'ERROR' not in response.text and response.text:
                # html_encode = response.apparent_encoding
                # #print('当前页面编码方式为:', html_encode)
                # response.encoding = response.apparent_encoding
                res_text = re.findall('callback\((.*)\)', response.text)[0]
                # #print(res_text)
                res_json = json.loads(res_text)
                goods_list = res_json.get('data', '没有数据').get('listItem', None)
                # t = time.time()
                # pool=Pool(3)
                # print('resjson', res_json)
                for goods in goods_list:
                    # t=time.time()
                    # print('循环列表中...')
                    tb = {}
                    goods_id = goods.get('item_id', '')
                    act = goods.get('act') if goods.get('act', '') else 0
                    area = goods.get('area', '')
                    price = goods.get('price', '')
                    user_type = goods.get('userType', '')
                    tb['productId'] = goods_id
                    tb['productListPrice'] = price
                    tb['shipAddress'] = area
                    # userType '0' means Taobao, '1' means Tmall
                    if user_type == '0':
                        tb['productUrl'] = 'https://item.taobao.com/item.htm?id=' + goods_id
                        if tb:
                            # print(tb)
                            goods_all_q.put({json.dumps(tb): int(act)})
                    if user_type == '1':
                        tb['productUrl'] = 'https://detail.tmall.com/item.htm?id=' + goods_id
                        if tb:
                            # print(tb)
                            goods_all_q.put({json.dumps(tb): int(act)})
                            # #print('单个城市的数据',city_all)
                            # return city_all
            else:
                # print('请求成功,但商品列表数据为空,重新请求中...')
                retry += 1
                parse_citys(cookies, params, city, headers, retry)
        else:
            # print('商品列表请求不成功,状态码:',response.status_code)
            retry += 1
            parse_citys(cookies, params, city, headers, retry)


    except requests.exceptions.ConnectTimeout:
        retry += 1
        # print('商品列表页请求超时,重新请求中...')
        parse_citys(cookies, params, city, headers, retry)

    except requests.exceptions.ConnectionError:  # requests.exceptions.ConnectionError
        # print('请求商品列表页网络异常,3秒后重新请求', '?' * 20)
        # print('第{}次重试'.format(retry))
        retry += 1
        time.sleep(3)
        parse_citys(cookies, params, city, headers, retry)

    except Exception as e:
        print('请求商品列表数据异常,跳过当前商品,错误:', e)

# Aggregate and sort the collected data
def g_sort(goods_list, taskId, webcrawlerTaskId, webcrawlerSubTaskId, taskList, taskChannel, keywords):
    try:
        goods_sort = []
        # print(taskList)
        for city in goods_list:
            # print(city, type(city))
            index = goods_list.index(city)
            for i, k in enumerate(city):
                v = city.get(k)
                dic_k = json.loads(k)
                new_city = copy.deepcopy(dic_k)
                new_city['taskId'] = taskId
                new_city['webcrawlerTaskId'] = webcrawlerTaskId
                new_city['webcrawlerSubTaskId'] = webcrawlerSubTaskId
                new_city['spiderRunTaskId'] = taskList[index // 44].split(':')[1]
                new_city['taskChannel'] = taskChannel
                new_city['keywords'] = keywords
                new_city['pageIndex'] = (index + 1) % 44 + 1
                new_city['pageNo'] = (index // 44) + 1
                goods_sort.append(new_city)
        # print('&' * 20, '数据爬完了,共计:{},最终数据开始发送给服务器...', len(goods_sort))
        return goods_sort
    except Exception as e:
        print('商品切片后的排序异常', e, '?' * 20)
        raise NameError

# Crawl succeeded: post the final result to the server
def req(goods_sort, localhost, webcrawlerSubTaskId):
    print('最后一次的请求了...')
    data = {
        "pyStatus": 0,
        "proUrlList": '',
        "webcrawlerSubTaskId": webcrawlerSubTaskId,
    }
    try:
        if goods_sort:
            data['proUrlList'] = json.dumps(goods_sort)
        res = requests.post(
            url=localhost,
            data=data,
            timeout=5,
        )
        print(data, '以上为发送的数据,接口已经调用,response为:', res.text)
    except Exception as e:
        print('我方服务器接口错误,错误{},接口为{}'.format(e, localhost))

# Crawl failed: post an error status to the server
def error_req(goods_sort, localhost, webcrawlerSubTaskId):
    print('本次爬虫任务失败,最后一次的请求了...')
    data = {
        "pyStatus": 1,
        "proUrlList": '',
        "webcrawlerSubTaskId": webcrawlerSubTaskId,
    }
    try:
        res = requests.post(
            url=localhost,
            data=data,
            timeout=5,
        )
        print(goods_sort[-100:], '以上为发送的数据,接口已经调用,response为:', res.text)
    except Exception as e:
        print('我方服务器接口错误,错误:', e)

# Main entry point
def main():
    search_citys = ['上海', '北京', '天津', '重庆市', '河北', '山西', '辽宁', '吉林', '黑龙江', '江苏', '浙江省', '安徽省', '福建', '江西', '山东',
                    '河南',
                    '湖北', '湖南', '广东', '海南', '四川', '贵州', '云南', '陕西', '甘肃', '青海', '台湾', '内蒙古', '广西', '西藏', '宁夏', '新疆',
                    '香港', '澳门', '海外']
    try:
        search_data_list = sys.argv[1:-1]
        search_data = ' '.join(search_data_list)
        # print('接收的参数为:', search_data)
        localhost = sys.argv[-1] + 'taoBaoListPythonToDB.action'  # endpoint to post the aggregated results to once the crawl finishes
        # print('接收的请求连接为:', localhost)
        search_data = search_data.replace('\'', '\"')
        # print('修改后的参数为:', search_data, type(search_data))
        search_data = json.loads(search_data)
        # print('转换json后的为', search_data, type(search_data))
        keywords = search_data.get('keywords', None)  # search keyword
        taskList = search_data.get('taskList', None)  # level-4 task ids
        # print('四级任务列表:', taskList)
        startPage = int(taskList[0].split(':')[0])  # first page of the requested range
        spider_startPage = ''
        endPage = int(taskList[-1].split(':')[0])  # last page of the requested range
        spider_endPage = ''
        startPrice = search_data.get('startPrice', 0)  # minimum price filter
        endPrice = search_data.get('endPrice', 0)  # maximum price filter
        spider_endPrice = '' if endPrice == 0 else int(endPrice)
        sortType = search_data.get('sortType', None)  # sort order
        urlPathId = search_data.get('urlPathId', None)
        platFormId = search_data.get('platFormId', None)  # platform type
        spider_platFormId = 'tab_mall' if platFormId == 3 else ''
        webcrawlerSubTaskId = search_data.get('webcrawlerSubTaskId', None)
        taskId = search_data.get('taskId', None)
        webcrawlerTaskId = search_data.get('webcrawlerTaskId', None)
        taskChannel = search_data.get('taskChannel', None)
        try:
            cookies = get_cookies()
            # get_page(search_citys,keywords,cookie_list,headers,spider_platFormId=spider_platFormId)
            # cookies=random.choice(cookie_list)
            token = cookies.get('_m_h5_tk').split('_')[0]
            appKey = 12574478
            t = int(time.time() * 1000)
            city_andpage_list = []
            for city in search_citys:
                data = {"q": keywords, "search": "提交", "tab": "all", "sst": "1", "n": 20, "buying": "buyitnow",
                        "m": "api4h5",
                        "token4h5": "", "style": "list", "closeModues": "nav,selecthot,onesearch", "sort": "_sale",
                        "loc": city,}
                if spider_platFormId:
                    data['filter'] = 'tab_mall'
                data = '{}'.format(data)
                m2 = hashlib.md5()
                m2.update('{}&{}&{}&{}'.format(token, t, appKey, data).encode('utf-8'))
                sign = m2.hexdigest()
                params = {
                    'jsv': '2.3.16',
                    'v': '1.0',
                    'H5Request': 'true',
                    'ecode': '1',

                    'appKey': appKey,
                    't': t,
                    'sign': sign,
                    'api': 'mtop.taobao.wsearch.h5search',
                    'type': 'jsonp',
                    'dataType': 'jsonp',
                    'data': data
                }
                # print(city)
                res = get_page(cookies, params, city, headers, retry=0, keywords=keywords)
                # print('res',res)
                if res:
                    city_andpage_list.append(res)
            print('*' * 10, '城市个数共计', len(city_andpage_list), '获取了城市页面,如下:', city_andpage_list)
            pool = Pool(5)
            # goods_all_q = Queue()
            for city_andpage in city_andpage_list:
                for city, pages in city_andpage.items():
                    # print(city, pages)
                    if int(pages) >= endPage * 2:
                        pages = endPage * 2
                    # print('{}需要爬取{}页'.format(city,pages))
                    print('pages',pages)
                    for page in range(1, pages + 1):
                        data = {"q": keywords, "sst": "1", "n": 20, "buying": "buyitnow", "m": "api4h5", "token4h5": "",
                                "sort": "_sale", "loc": city, "page": page}
                        # data = {"q": keywords, "sst": "1", "n": 20, "buying": "buyitnow", "m": "api4h5", "token4h5": "",
                        #         "style": "list", "closeModues": "nav,selecthot,onesearch", "loc": city,"page": page}
                        data = {"q": keywords, "search": "提交", "tab": "all", "sst": "1", "n": 20, "buying": "buyitnow",
                                "m": "api4h5", "token4h5": "", "style": "list",
                                "closeModues": "nav,selecthot,onesearch",
                                "sort": "_sale", "loc": city, "page": page}
                        # data = {"event_submit_do_new_search_auction": "1", "_input_charset": "utf-8", "topSearch": "1",
                        #         "atype": "b", "searchfrom": "1", "action": "home:redirect_app_action", "from": "1",
                        #         "q": keywords, "sst": "1", "n": 20, "buying": "buyitnow", "m": "api4h5", "token4h5": "",
                        #         "style": "list", "closeModues": "nav,selecthot,onesearch",
                        #         "sort": "_sale", "loc": city, "page": page}
                        if '全国' == city:
                            del data['loc']
                        if spider_platFormId:
                            data['filter'] = 'tab_mall'
                        data = '{}'.format(data)
                        m2 = hashlib.md5()
                        m2.update('{}&{}&{}&{}'.format(token, t, appKey, data).encode('utf-8'))
                        sign = m2.hexdigest()
                        params = {
                            # 'jsv': '2.3.16',
                            # 'v': '1.0',
                            # 'H5Request': 'true',
                            # 'ecode': '1',

                            'appKey': appKey,
                            't': t,
                            'sign': sign,
                            'api': 'mtop.taobao.wsearch.h5search',
                            'type': 'jsonp',
                            'dataType': 'jsonp',
                            'data': data
                        }
                        pool.add(gevent.spawn(parse_citys, cookies, params, city, headers))
            goods_all = []

            # Wait for all spawned greenlets to finish instead of sleeping a fixed 5 seconds
            pool.join()
            if goods_all_q.empty():
                # print('goods_all为空')
                raise NameError
            while not goods_all_q.empty():
                goods_data = goods_all_q.get()
                goods_all.append(goods_data)
            print('总共获取数据的有{}个'.format(len(goods_all)))
            if not goods_all:
                req('', localhost, webcrawlerSubTaskId)
            sort_a = sorted({j: i[j] for i in goods_all for j in i.keys()}.items(), key=lambda a: a[1], reverse=True)
            sort_b = [{i[0]: i[1]} for i in sort_a]
            goods_list = sort_b[(startPage - 1) * 44:endPage * 44]
            print('发送的数据共计', len(goods_list))
            goods_sort = g_sort(goods_list, taskId, webcrawlerTaskId, webcrawlerSubTaskId, taskList, taskChannel,
                                keywords)
            req(goods_sort, localhost, webcrawlerSubTaskId)
        except ValueError:
            print('采集的关键字淘宝数据为空,停止爬虫')
            req('', localhost, webcrawlerSubTaskId)
        except NameError:
            error_req('', localhost, webcrawlerSubTaskId)
        except Exception as e:
            print('爬虫出错,有可能传参有问题', e, search_data)
            response = error_req('', localhost, webcrawlerSubTaskId)
            # print('响应的结果为', response)
            # print('爬虫执行完毕,最终的发送地址为', search_data, localhost)
    except Exception as e:
        print('参数异常,无法发送最终的请求,错误:', e)


if __name__ == '__main__':
    start_t = time.time()
    print('开始启动爬虫任务,当前时间为:',datetime.datetime.now())
    main()
    print('爬虫结束,当前时间为:',datetime.datetime.now())
    print('本次爬虫exe程序执行时长为:', time.time() - start_t)
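
As a side note on the Taobao script above: the sign parameter it assembles is just an MD5 digest over the token, timestamp, appKey and request data joined with '&'. A standalone sketch of the same formula with placeholder values (the real token comes from the _m_h5_tk cookie fetched by get_cookies()):

import hashlib
import time

def mtop_sign(token, t, app_key, data):
    # md5("<token>&<t>&<appKey>&<data>"), the same formula used in main()
    raw = '{}&{}&{}&{}'.format(token, t, app_key, data)
    return hashlib.md5(raw.encode('utf-8')).hexdigest()

print(mtop_sign('exampletoken0123', int(time.time() * 1000), 12574478, '{"q": "手机"}'))
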

 
