Crawler Projects
Xinpianchang (xinpianchang.com)
settings.py — project settings (scrapy_redis scheduler, MySQL connection, proxy pool):

BOT_NAME = 'xpc'

SPIDER_MODULES = ['xpc.spiders']
NEWSPIDER_MODULE = 'xpc.spiders'

# scrapy_redis: distributed scheduler, duplicate filter and Redis item pipeline
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300
}
REDIS_URL = 'redis://127.0.0.1:6379'
SCHEDULER_PERSIST = True

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
DOWNLOAD_TIMEOUT = 10

MYSQL_CONFIG = dict(host='127.0.0.1', port=3306, user='root', password='',
                    db='xpc_hz1804', charset='utf8mb4')

PROXIES = [
    'http://47.95.193.101:1801', 'http://47.96.123.201:1801', 'http://47.96.64.101:1804',
    'http://39.106.27.237:1801', 'http://118.24.49.46:1801', 'http://47.100.58.124:1801',
    'http://101.132.185.43:1801', 'http://39.105.49.81:1801', 'http://47.99.115.152:1801',
    'http://47.99.131.3:1801', 'http://39.105.22.186:1801', 'http://39.106.42.87:1801',
    'http://116.62.143.107:1801', 'http://120.79.221.245:1804', 'http://47.106.188.175:1804',
    'http://47.99.54.144:1801', 'http://101.200.55.163:1801', 'http://119.27.183.40:1801',
    'http://118.24.64.185:1801', 'http://106.14.214.226:1801', 'http://203.195.164.241:1801',
    'http://47.95.235.90:1801', 'http://47.94.162.16:1801', 'http://118.25.226.251:1801',
    'http://132.232.187.251:1801', 'http://47.94.224.67:1801', 'http://118.24.145.88:1801',
    'http://122.114.180.120:1801', 'http://39.106.109.148:1801', 'http://118.25.227.120:1801',
    'http://60.205.179.182:1804',
    'http://39.104.230.114:1806',  # failed proxy
    'http://119.27.187.59:1804', 'http://39.106.221.204:1801', 'http://111.231.73.145:1801',
    'http://111.231.117.197:1801', 'http://60.205.176.40:1888', 'http://193.112.68.34:1801',
    'http://39.106.220.99:1801'
]

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 8
COOKIES_DEBUG = True

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    'Accept-Encoding': "gzip, deflate",
    'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8",
    'Cache-Control': "no-cache",
    'Connection': "keep-alive",
    'DNT': "1",
    'Host': "www.xinpianchang.com",
    'Upgrade-Insecure-Requests': "1",
    'Postman-Token': "64ae163e-794d-e3bc-84dc-77d6f5d6cd50"
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'xpc.middlewares.XpcSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'xpc.middlewares.RandomProxyMiddleware': 749,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# NOTE: this second assignment overrides the RedisPipeline entry defined above;
# merge the two dicts into one if both pipelines should run.
ITEM_PIPELINES = {
    'xpc.pipelines.MysqlPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Allow a production settings file to override everything above
import os
import sys

if os.path.exists('/data/settings_prd.py'):
    sys.path.append('/data')
    from settings_prd import *
pipelines.py — writes any item to MySQL with an INSERT ... ON DUPLICATE KEY UPDATE built from the item's fields:

import pymysql

from xpc import settings


class MysqlPipeline(object):

    def open_spider(self, spider):
        print('spider %s opened' % spider.name)
        self.conn = pymysql.connect(**settings.MYSQL_CONFIG)
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        # build the upsert statement from the item's own fields
        cols, values = zip(*item.items())
        sql = "INSERT INTO `{}` ({}) VALUES ({}) " \
              "ON DUPLICATE KEY UPDATE {}".format(
                  item.table,
                  ','.join(['`%s`' % k for k in cols]),
                  ','.join(['%s'] * len(values)),
                  ','.join(['`{}`=%s'.format(k) for k in cols])
              )
        # the values are bound twice: once for INSERT, once for UPDATE
        self.cur.execute(sql, values * 2)
        # print(self.cur._last_executed)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()
        print('spider %s closed' % spider.name)
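To make the generated statement concrete, here is a minimal standalone sketch of the upsert string that process_item builds; the table name and the two field values are made up for the example:

# Illustration only: same string construction as process_item above,
# with a made-up two-field item.
cols, values = zip(*{'pid': '10001', 'title': 'demo'}.items())
sql = "INSERT INTO `{}` ({}) VALUES ({}) ON DUPLICATE KEY UPDATE {}".format(
    'posts',
    ','.join(['`%s`' % k for k in cols]),
    ','.join(['%s'] * len(values)),
    ','.join(['`{}`=%s'.format(k) for k in cols])
)
print(sql)
# INSERT INTO `posts` (`pid`,`title`) VALUES (%s,%s) ON DUPLICATE KEY UPDATE `pid`=%s,`title`=%s
# cursor.execute(sql, values * 2) then binds the values once for the INSERT
# part and once more for the UPDATE part.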
middlewares.py — random proxy middleware with per-proxy failure counting:

import random

from scrapy.exceptions import NotConfigured


class RandomProxyMiddleware(object):

    def __init__(self, settings):
        # 2. initialize the middleware
        if not settings.getlist('PROXIES'):
            raise NotConfigured
        self.proxies = settings.getlist('PROXIES')
        # failure stats: every proxy starts with 0 failures
        self.stats = {}.fromkeys(self.proxies, 0)
        self.max_failed = 3

    @classmethod
    def from_crawler(cls, crawler):
        # 1. create the middleware instance
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings)

    def process_request(self, request, spider):
        # 3. attach a random proxy to every outgoing request
        if 'proxy' not in request.meta:
            request.meta['proxy'] = random.choice(self.proxies)
            print('use proxy: %s ' % request.meta['proxy'])

    def process_response(self, request, response, spider):
        # 4. called for every response that comes back
        cur_proxy = request.meta['proxy']
        if response.status != 200:
            print('none 200 status code: %s when use %s' % (response.status, cur_proxy))
        # on an abnormal HTTP status code, add one failure to the current proxy
        if response.status >= 400:
            self.stats[cur_proxy] += 1
        # once a proxy has accumulated too many failures, drop it from the pool
        # and reschedule the request
        if self.stats[cur_proxy] > self.max_failed:
            self.remove_proxy(cur_proxy)
            del request.meta['proxy']
            return request
        return response

    def process_exception(self, request, exception, spider):
        # 5. called when the proxy connection itself raises an exception
        cur_proxy = request.meta['proxy']
        print('raise exception: %s when use %s' % (exception, cur_proxy))
        # drop the failing proxy from the pool and reschedule the request
        self.remove_proxy(cur_proxy)
        del request.meta['proxy']
        request.dont_filter = True
        return request

    def remove_proxy(self, proxy):
        if proxy in self.proxies:
            self.proxies.remove(proxy)
            print('proxy %s removed from proxies list' % proxy)
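RandomProxyMiddleware only runs if it is registered in DOWNLOADER_MIDDLEWARES (the entry is commented out in the settings above) and if HTTPPROXY_ENABLED is true, because from_crawler raises NotConfigured otherwise. A minimal settings sketch to turn it on:

# settings.py — minimal sketch for enabling the proxy middleware
# (priority 749 mirrors the commented-out entry in the settings above;
#  HTTPPROXY_ENABLED is the flag checked in RandomProxyMiddleware.from_crawler)
HTTPPROXY_ENABLED = True
DOWNLOADER_MIDDLEWARES = {
    'xpc.middlewares.RandomProxyMiddleware': 749,
}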
items.py — one item class per target table:

import scrapy
from scrapy import Field


class PostItem(scrapy.Item):
    table = 'posts'
    pid = Field()
    thumbnail = Field()
    title = Field()
    category = Field()
    created_at = Field()
    like_counts = Field()
    play_counts = Field()
    description = Field()
    video = Field()
    preview = Field()
    duration = Field()


class CommentItem(scrapy.Item):
    table = 'comments'
    commentid = Field()
    pid = Field()
    content = Field()
    created_at = Field()
    like_counts = Field()
    cid = Field()
    avatar = Field()
    uname = Field()
    reply = Field()


class ComposerItem(scrapy.Item):
    table = 'composers'
    cid = Field()
    banner = Field()
    avatar = Field()
    name = Field()
    intro = Field()
    like_counts = Field()
    fans_counts = Field()
    follow_counts = Field()
    location = Field()
    career = Field()


class CopyrightItem(scrapy.Item):
    table = 'copyrights'
    pcid = Field()
    cid = Field()
    pid = Field()
    roles = Field()
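Because MysqlPipeline upserts with ON DUPLICATE KEY UPDATE, each table needs a primary or unique key on its id column (pid, commentid, cid, pcid). The notes do not include the DDL, so the sketch below is only a guess for the posts table; the column types are assumptions, and only the key on pid is essential:

# Guessed DDL for the `posts` table used by PostItem / MysqlPipeline.
# All column types are assumptions; the PRIMARY KEY on `pid` is what makes
# the ON DUPLICATE KEY UPDATE upsert behave as an update on re-crawls.
import pymysql

from xpc import settings

DDL = """
CREATE TABLE IF NOT EXISTS `posts` (
    `pid`         VARCHAR(32) NOT NULL,
    `thumbnail`   VARCHAR(512),
    `title`       VARCHAR(512),
    `category`    VARCHAR(255),
    `created_at`  VARCHAR(64),
    `like_counts` INT,
    `play_counts` INT,
    `description` TEXT,
    `video`       VARCHAR(1024),
    `preview`     VARCHAR(1024),
    `duration`    INT,
    PRIMARY KEY (`pid`)
) DEFAULT CHARSET=utf8mb4;
"""

if __name__ == '__main__':
    conn = pymysql.connect(**settings.MYSQL_CONFIG)
    with conn.cursor() as cur:
        cur.execute(DDL)
    conn.commit()
    conn.close()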
Spider (name 'discovery'), scrapy_redis RedisSpider version — start URLs are fed in through Redis:

import re
import json

import scrapy
import scrapy_redis
from scrapy_redis.spiders import RedisSpider
from scrapy import Request
from scrapy.shell import inspect_response

from xpc.items import PostItem, CommentItem, ComposerItem, CopyrightItem


def strip(s):
    if s:
        return s.strip().replace(' ', '')
    return ''


def convert_int(s):
    if not s:
        return 0
    return int(s.replace(',', ''))


ci = convert_int


class DiscoverySpider(RedisSpider):
    name = 'discovery'
    allowed_domains = ['xinpianchang.com', 'vmovier.com']
    start_urls = [
        # 'http://www.xinpianchang.com/channel/index/type-0/sort-like/duration_type-0/resolution_type-/page-21'
    ]
    cookies = {
        "Authorization": "D1FF2FD49B5914AF89B59148A39B591819E9B591B1EB12155BAA",
        # "channel_page": "apY%3D",
    }

    def start_requests(self):
        first_cookies = dict(self.cookies)
        first_cookies['channel_page'] = "b5Y%3D"
        for url in self.start_urls:
            request = Request(url, dont_filter=True, callback=self.parse,
                              cookies=first_cookies)
            request.meta['first_request'] = 1
            yield request

    def make_requests_from_url(self, url):
        # used by scrapy_redis for URLs popped from the Redis start-URL queue
        first_cookies = dict(self.cookies)
        first_cookies['channel_page'] = "b5Y%3D"
        request = Request(url, dont_filter=True, callback=self.parse,
                          cookies=first_cookies)
        return request

    # channel_page=aZec   # 21
    # b5Y%3D              # 72
    # 13994324984
    # 18041804
    # Authorization = D1FF2FD49B5914AF89B59148A39B591819E9B591B1EB12155BAA;

    def parse(self, response):
        url = 'http://www.xinpianchang.com/a%s?from=ArticleList'
        # grab the <li> nodes first
        post_list = response.xpath('//ul[@class="video-list"]/li')
        # iterate the <li> nodes and take pid and thumbnail in the same pass,
        # so the two values stay paired and the code stays short
        for post in post_list:
            pid = post.xpath('./@data-articleid').extract_first()
            # the image src is lazy-loaded, so read the _src attribute instead
            thumbnail = post.xpath('./a/img/@_src').get()
            request = Request(url % pid, callback=self.parse_post)
            request.meta['pid'] = pid
            request.meta['thumbnail'] = thumbnail
            yield request

        # other_pages = response.xpath('//div[@class="page"]/a/@href').extract()
        other_pages = response.xpath('//a[@title="下一页"]/@href').extract()
        if not other_pages:
            # inspect_response(response, self)
            tip = response.xpath('//div[@class="notice-tip"]/p/text()').get()
            if tip and '系统繁忙,请稍候再试' in tip:
                print('---' * 50, '遇到系统繁忙了')
                # the site answered with a "system busy" page:
                # retry the same request with a throw-away PHPSESSID
                response.request.dont_filter = True
                response.request.cookies['PHPSESSID'] = '123'
                # request = Request(
                #     response.url,
                #     dont_filter=True,
                #     cookies=self.cookies)
                yield response.request
        for page in other_pages:
            request = Request(page, callback=self.parse, cookies=self.cookies)
            yield request

    def parse_post(self, response):
        # strip every \t from the page source in one pass
        # (response.replace returns a new response, so the result must be re-assigned)
        response = response.replace(body=response.text.replace('\t', '').encode('utf-8'))
        pid = response.meta['pid']
        post = {
            'pid': pid,
            'thumbnail': response.meta['thumbnail'],
        }
        post = PostItem(post)
        post['title'] = response.xpath(
            '//div[@class="title-wrap"]/h3/text()').extract_first()
        cates = response.xpath(
            '//span[contains(@class, "cate")]/a/text()').extract()
        post['category'] = '-'.join([strip(cate) for cate in cates])
        post['created_at'] = response.xpath(
            '//span[contains(@class, "update-time")]/i/text()').get()
        post['play_counts'] = response.xpath(
            '//i[contains(@class, "play-counts")]/@data-curplaycounts').get()
        post['like_counts'] = response.xpath(
            '//span[contains(@class, "like-counts")]/@data-counts').get()
        post['description'] = response.xpath(
            '//p[contains(@class, "desc")]/text()').get()

        url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource,resource_origin?'
        # use a regex to pull a key variable out of the page's inline JS,
        # matching a string like:  vid: "5BA5015D7FB4E",
        vid = re.findall(r'vid\:\s\"(\w+)\"\,', response.text)
        if vid:
            # request the video API for the playback info
            url = url % vid[0]
            request = Request(url, callback=self.parse_video)
            request.meta['post'] = post
            yield request

        # request the comment API; note the ajax parameter
        comment_api = 'http://www.xinpianchang.com/article/filmplay/ts-getCommentApi?id=%s&ajax=0&page=1'
        request = Request(comment_api % pid, callback=self.parse_comment)
        yield request

        # request the creator (user) pages
        user_url = 'http://www.xinpianchang.com/u%s?from=articleList'
        composer_list = response.xpath('//div[@class="user-team"]//ul[@class="creator-list"]/li')
        for composer in composer_list:
            cid = composer.xpath('./a/@data-userid').get()
            request = Request(user_url % cid, callback=self.parse_composer)
            request.meta['cid'] = cid
            yield request
            # mapping between creator and video
            cr = CopyrightItem()
            cr['pcid'] = '%s_%s' % (cid, pid)
            cr['cid'] = cid
            cr['pid'] = pid
            cr['roles'] = composer.xpath('.//span[contains(@class, "roles")]/text()').get()
            yield cr

    def parse_video(self, response):
        # the API returns JSON, so no HTML parsing is needed; json.loads directly
        post = response.meta['post']
        result = json.loads(response.text)
        post['video'] = result['data']['resource']['default']['url']
        post['preview'] = result['data']['video']['cover']
        post['duration'] = result['data']['video']['duration']
        yield post

    def parse_comment(self, response):
        # the API returns JSON, so no HTML parsing is needed; json.loads directly
        result = json.loads(response.text)
        comments = result['data']['list']
        for comment in comments:
            c = CommentItem()
            c['commentid'] = comment['commentid']
            c['pid'] = comment['articleid']
            c['content'] = comment['content']
            c['created_at'] = comment['addtime_int']
            c['like_counts'] = comment['count_approve']
            c['cid'] = comment['userInfo']['userid']
            c['avatar'] = comment['userInfo']['face']
            c['uname'] = comment['userInfo']['username']
            # reply holds the comment this one is replying to; empty if it is not a reply
            if comment['reply']:
                # only the parent commentid needs to be stored
                c['reply'] = comment['reply']['commentid']
            yield c
        next_page = result['data']['next_page_url']
        if next_page:
            yield Request(next_page, callback=self.parse_comment)

    def parse_composer(self, response):
        composer = ComposerItem()
        composer['cid'] = response.meta['cid']
        banner = response.xpath('//div[@class="banner-wrap"]/@style').get()
        # slice out the value inside background-image:url(...)
        composer['banner'] = banner[21:-1]
        composer['avatar'] = response.xpath(
            '//span[@class="avator-wrap-s"]/img/@src').get()
        composer['name'] = response.xpath(
            '//p[contains(@class, "creator-name")]/text()').get()
        composer['intro'] = response.xpath(
            '//p[contains(@class, "creator-desc")]/text()').get()
        composer['like_counts'] = ci(response.xpath(
            '//span[contains(@class, "like-counts")]/text()').get())
        composer['fans_counts'] = ci(response.xpath(
            '//span[contains(@class, "fans-counts")]/@data-counts').get())
        composer['follow_counts'] = ci(response.xpath(
            '//span[@class="follow-wrap"]/span[2]/text()').get())
        # text of the span immediately following the span with class icon-location
        composer['location'] = strip(response.xpath(
            '//span[contains(@class, "icon-location")]/'
            'following-sibling::span[1]/text()').get())
        composer['career'] = response.xpath(
            '//span[contains(@class, "icon-career")]/'
            'following-sibling::span[1]/text()').get()
        yield composer
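The RedisSpider version has an empty start_urls and waits for URLs pushed into Redis. With the scrapy_redis defaults the list key is '<spider name>:start_urls', i.e. discovery:start_urls here. A small seeding sketch, reusing the channel URL that is commented out in start_urls:

# Seed the distributed spider by pushing a start URL into Redis.
# Assumes the scrapy_redis default key pattern '%(name)s:start_urls'
# and the REDIS_URL from settings (redis://127.0.0.1:6379).
import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)
r.lpush(
    'discovery:start_urls',
    'http://www.xinpianchang.com/channel/index/'
    'type-0/sort-like/duration_type-0/resolution_type-/page-21'
)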
Standalone (non-Redis) variant of the same spider: a plain scrapy.Spider with the channel URL hard-coded in start_urls. In this copy the detail-page request yield is commented out, and parse_post / parse_video / parse_comment / parse_composer are identical to the RedisSpider version above, so they are noted rather than repeated.

# -*- coding: utf-8 -*-
import re
import json

import scrapy
from scrapy import Request
from scrapy.shell import inspect_response

from xpc.items import PostItem, CommentItem, ComposerItem, CopyrightItem


def strip(s):
    if s:
        return s.strip().replace(' ', '')
    return ''


def convert_int(s):
    if not s:
        return 0
    return int(s.replace(',', ''))


ci = convert_int


class DiscoverySpider(scrapy.Spider):
    name = 'discovery'
    allowed_domains = ['xinpianchang.com', 'vmovier.com']
    start_urls = [
        'http://www.xinpianchang.com/channel/index/'
        'type-0/sort-like/duration_type-0/resolution_type-/page-21']
    cookies = {
        "Authorization": "D1FF2FD49B5914AF89B59148A39B591819E9B591B1EB12155BAA",
        # "channel_page": "apY%3D",
    }

    def start_requests(self):
        first_cookies = dict(self.cookies)
        first_cookies['channel_page'] = "b5Y%3D"
        for url in self.start_urls:
            request = Request(url, dont_filter=True, callback=self.parse,
                              cookies=first_cookies)
            request.meta['first_request'] = 1
            yield request

    # channel_page=aZec   # 21
    # b5Y%3D              # 72
    # 13994324984
    # 18041804
    # Authorization = D1FF2FD49B5914AF89B59148A39B591819E9B591B1EB12155BAA;

    def parse(self, response):
        url = 'http://www.xinpianchang.com/a%s?from=ArticleList'
        # grab the <li> nodes first
        post_list = response.xpath('//ul[@class="video-list"]/li')
        # iterate the <li> nodes and take pid and thumbnail in the same pass,
        # so the two values stay paired and the code stays short
        for post in post_list:
            pid = post.xpath('./@data-articleid').extract_first()
            # the image src is lazy-loaded, so read the _src attribute instead
            thumbnail = post.xpath('./a/img/@_src').get()
            request = Request(url % pid, callback=self.parse_post)
            request.meta['pid'] = pid
            request.meta['thumbnail'] = thumbnail
            # yield request

        # other_pages = response.xpath('//div[@class="page"]/a/@href').extract()
        other_pages = response.xpath('//a[@title="下一页"]/@href').extract()
        if not other_pages:
            # inspect_response(response, self)
            tip = response.xpath('//div[@class="notice-tip"]/p/text()').get()
            print('===' * 50, tip)
            if tip and '系统繁忙,请稍候再试' in tip:
                print('---' * 50, '遇到系统繁忙了')
                # retry the same request with a throw-away PHPSESSID
                response.request.dont_filter = True
                response.request.cookies['PHPSESSID'] = '123'
                # request = Request(
                #     response.url,
                #     dont_filter=True,
                #     cookies=self.cookies)
                yield response.request
        for page in other_pages:
            request = Request(page, callback=self.parse, cookies=self.cookies)
            yield request

    # parse_post, parse_video, parse_comment and parse_composer are
    # byte-for-byte identical to the RedisSpider version above.
Taobao slider captcha
https://blog.csdn.net/u012067766/article/details/79793264
https://www.cnblogs.com/garvicker/p/8947121.html
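Both articles above describe simulating the drag with Selenium. A rough sketch of that approach follows; the page URL, the #nc_1_n1z selector and the track length are assumptions (a human-looking, uneven drag track matters more than the exact selector), and Taobao also fingerprints the browser, so this alone may not pass verification:

# Rough Selenium sketch for dragging a slider-style captcha (assumptions noted above).
import random
import time

from selenium import webdriver
from selenium.webdriver import ActionChains

driver = webdriver.Chrome()
driver.get('https://login.taobao.com/')                   # placeholder page
knob = driver.find_element_by_css_selector('#nc_1_n1z')   # assumed slider knob selector

ActionChains(driver).click_and_hold(knob).perform()
moved = 0
while moved < 260:                                         # assumed track length in px
    step = random.randint(10, 40)                          # uneven steps look more human
    ActionChains(driver).move_by_offset(step, random.randint(-2, 2)).perform()
    moved += step
    time.sleep(random.uniform(0.05, 0.2))
ActionChains(driver).release().perform()
driver.quit()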
Taobao listing gevent crawler
Full source of the crawler (gevent coroutine pool + Taobao mtop h5 search API):

# coding: utf-8
from gevent import monkey
monkey.patch_all()

from gevent.pool import Pool
import gevent
from queue import Queue
import random
import re
import sys
import time
import datetime
import json
import copy
import hashlib
import requests
from urllib.parse import quote

numbers = 0
is_in = []
goods_all_q = Queue()
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN',
    'referer': 'https://s.m.taobao.com/'
}


# pick a random User-Agent for each request
def set_ua():
    # ua=['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    #     'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    #     'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16']
    ua = [
        'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
        'Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4',
        'Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0',
        'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0',
        'Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01',
        'Opera/9.80 (Windows NT 6.1; U; cs) Presto/2.7.62 Version/11.01',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0',
        'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
        'Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36',
        'Opera/9.80 (X11; Linux x86_64; U; bg) Presto/2.8.131 Version/11.10',
        'Opera/9.80 (Windows NT 6.1 x64; U; en) Presto/2.7.62 Version/11.00',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/4.0; GTB7.4; InfoPath.3; SV1; .NET CLR 3.1.76908; WOW64; en-US)',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4',
        'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0',
        'Mozilla/5.0 (Windows NT 6.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0',
        'Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-TW) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5',
        'Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17',
        'Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)',
        'Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5',
        'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0',
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts)',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0',
        'Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; Media Center PC 4.0; SLCC1; .NET CLR 3.0.04320)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        'Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; tr-TR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36',
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.8.36217; WOW64; en-US)',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; es-es) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101213 Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01',
        'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36',
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; zh-cn) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5',
        'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0',
        'Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0',
        'Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
        'Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0',
        'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.0; Trident/4.0; FBSMTWB; .NET CLR 2.0.34861; .NET CLR 3.0.3746.3218; .NET CLR 3.5.33652; msn OptimizedIE8;ENUS)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.1; SV1; .NET CLR 2.8.52393; WOW64; en-US)',
        'Opera/9.80 (Windows NT 6.1; U; en-US) Presto/2.7.62 Version/11.01',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36',
        'Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; da-dk) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-HK) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; .NET CLR 2.7.58687; SLCC2; Media Center PC 5.0; Zune 3.4; Tablet PC 3.6; InfoPath.3)',
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; SLCC1; .NET CLR 1.1.4322)',
        'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0',
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0',
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 3.0.04506.30)',
        'Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-fr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
        'Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14',
        'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0',
        'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ko-kr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
        'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
        'Opera/9.80 (Windows NT 6.1; U; ko) Presto/2.7.62 Version/11.00',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; en-us) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0',
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4',
        'Mozilla/5.0 (Android 2.2; Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4',
        'Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.7.62 Version/11.00',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0',
        'Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.7.62 Version/11.01',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0',
        'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
        'Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0',
        'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)']
    return random.choice(ua)


# fetch an initial cookie jar (contains the _m_h5_tk token used for signing)
def get_cookies():
    url = 'https://h5api.m.taobao.com/h5/mtop.taobao.wireless.home.load/1.0/?jsv=2.5.0&appKey=12574478&t=1545354577214&sign=2e7e0d72845d57e773df67565fb5e949&api=mtop.taobao.wireless.home.load&v=1.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22containerId%22%3A%22main%22%2C%22ext%22%3A%22%7B%5C%22h5_platform%5C%22%3A%5C%22h5%5C%22%2C%5C%22h5_ttid%5C%22%3A%5C%2260000%40taobao_h5_1.0.0%5C%22%7D%22%7D'
    s = requests.Session()
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'referer': 'https://h5.m.taobao.com/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    s.get(url=url, headers=headers)
    cookies = s.cookies.get_dict()
    if cookies:
        return cookies
    else:
        print('未获取到cookie')
        raise NameError


# get the maximum page count for one city
def get_page(cookies, params, city, headers, retry=0, keywords=''):
    t1 = time.time()
    headers['User-Agent'] = set_ua()
    if retry >= 4:
        # print('获取最大城市页码重试3次后仍错误,任务失败')
        raise NameError
    stop = 1
    try:
        t2 = time.time()
        # #print('设置请求头时长',t2-t1)
        response = requests.get(
            url='https://acs.m.taobao.com/h5/mtop.taobao.wsearch.h5search/1.0/',
            headers=headers,
            cookies=cookies,
            params=params,
            timeout=5
        )
        # t3=time.time()
        # print('*' * 100)
        # print(response.text)
        # print('*' * 100)
        if str(response.status_code).startswith('2'):
            if 'callback' in response.text and response.text:
                res_text = re.findall('callback\((.*)\)', response.text)[0]
                res_json = json.loads(res_text)
                pages_num = int(res_json.get('data', {}).get('totalPage', 0))
                if pages_num and pages_num != 0:
                    print({city: pages_num})
                    return {city: pages_num}
                else:
                    # print('认证错误,重新请求',response.text)
                    retry += 2
                    res = get_page(cookies, params, city, headers, retry)
                    return res
        else:
            # print('请求失败,状态码', response.status_code)
            retry += 1
            # #print('城市页码的页面失败,重新请求', city, retry)
            res = get_page(cookies, params, city, headers, retry)
            return res
    except requests.exceptions.ConnectTimeout:
        retry += 1
        # print('获取城市最大页码超时,重新请求', retry)
        res = get_page(cookies, params, city, headers, retry)
        return res
    except requests.exceptions.ConnectionError:
        # print('请求城市最大页码网络异常,3秒后重新请求...', '?' * 20)
        # print('第{}次重试'.format(retry))
        retry += 1
        time.sleep(3)
        res = get_page(cookies, params, city, headers, retry)
        return res
    except Exception as e:
        print('未知异常,跳过当前页面', e)
        if stop == 0:
            raise ValueError


# parse the product list for one city and push the results onto the queue
def parse_citys(cookies, params, city, headers, retry=1):
    if retry == 4:
        # print('解析列表重试多次仍错误,任务失败')
        raise NameError
    try:
        response = requests.get(
            url='https://acs.m.taobao.com/h5/mtop.taobao.wsearch.h5search/1.0/',
            headers=headers,
            cookies=cookies,
            params=params,
            timeout=5
        )
        if str(response.status_code).startswith('2'):
            if 'ERROR' not in response.text and response.text:
                # html_encode = response.apparent_encoding
                # #print('当前页面编码方式为:', html_encode)
                # response.encoding = response.apparent_encoding
                res_text = re.findall('callback\((.*)\)', response.text)[0]
                # #print(res_text)
                res_json = json.loads(res_text)
                goods_list = res_json.get('data', '没有数据').get('listItem', None)
                # t = time.time()
                # pool=Pool(3)
                # print('resjson', res_json)
                for goods in goods_list:
                    # t=time.time()
                    # print('循环列表中...')
                    tb = {}
                    goods_id = goods.get('item_id', '')
                    act = goods.get('act') if goods.get('act', '') else 0
                    area = goods.get('area', '')
                    price = goods.get('price', '')
                    user_type = goods.get('userType', '')
                    tb['productId'] = goods_id
                    tb['productListPrice'] = price
                    tb['shipAddress'] = area
                    # userType: 0 = Taobao, 1 = Tmall
                    if user_type == '0':
                        tb['productUrl'] = 'https://item.taobao.com/item.htm?id=' + goods_id
                        if tb:
                            # print(tb)
                            goods_all_q.put({json.dumps(tb): int(act)})
                    if user_type == '1':
                        tb['productUrl'] = 'https://detail.tmall.com/item.htm?id=' + goods_id
                        if tb:
                            # print(tb)
                            goods_all_q.put({json.dumps(tb): int(act)})
                # #print('单个城市的数据',city_all)
                # return city_all
            else:
                # print('请求成功,但商品列表数据为空,重新请求中...')
                retry += 1
                parse_citys(cookies, params, city, headers, retry)
        else:
            # print('商品列表请求不成功,状态码:',response.status_code)
            retry += 1
            parse_citys(cookies, params, city, headers, retry)
    except requests.exceptions.ConnectTimeout:
        retry += 1
        # print('商品列表页请求超时,重新请求中...')
        parse_citys(cookies, params, city, headers, retry)
    except requests.exceptions.ConnectionError:
        # requests.exceptions.ConnectionError
        # print('请求商品列表页网络异常,3秒后重新请求', '?' * 20)
        # print('第{}次重试'.format(retry))
        retry += 1
        time.sleep(3)
        parse_citys(cookies, params, city, headers, retry)
    except Exception as e:
        print('请求商品列表数据异常,跳过当前商品,错误:', e)


# aggregate the collected data and re-number it into pages
def g_sort(goods_list, taskId, webcrawlerTaskId, webcrawlerSubTaskId, taskList, taskChannel, keywords):
    try:
        goods_sort = []
        # print(taskList)
        for city in goods_list:
            # print(city, type(city))
            index = goods_list.index(city)
            for i, k in enumerate(city):
                v = city.get(k)
                dic_k = json.loads(k)
                new_city = copy.deepcopy(dic_k)
                new_city['taskId'] = taskId
                new_city['webcrawlerTaskId'] = webcrawlerTaskId
                new_city['webcrawlerSubTaskId'] = webcrawlerSubTaskId
                new_city['spiderRunTaskId'] = taskList[index // 44].split(':')[1]
                new_city['taskChannel'] = taskChannel
                new_city['keywords'] = keywords
                new_city['pageIndex'] = (index + 1) % 44 + 1
                new_city['pageNo'] = (index // 44) + 1
                goods_sort.append(new_city)
        # print('&' * 20, '数据爬完了,共计:{},最终数据开始发送给服务器...', len(goods_sort))
        return goods_sort
    except Exception as e:
        print('商品切片后的排序异常', e, '?' * 20)
        raise NameError


# crawl succeeded: POST the final result back to our server
def req(goods_sort, localhost, webcrawlerSubTaskId):
    print('最后一次的请求了...')
    data = {
        "pyStatus": 0,
        "proUrlList": '',
        "webcrawlerSubTaskId": webcrawlerSubTaskId,
    }
    try:
        if goods_sort:
            data['proUrlList'] = json.dumps(goods_sort)
        res = requests.post(
            url=localhost,
            data=data,
            timeout=5,
        )
        print(data, '以上为发送的数据,接口已经调用,response为:', res.text)
    except Exception as e:
        print('我方服务器接口错误,错误{},接口为{}'.format(e, localhost))


# crawl failed: report the failure to our server
def error_req(goods_sort, localhost, webcrawlerSubTaskId):
    print('本次爬虫任务失败,最后一次的请求了...')
    data = {
        "pyStatus": 1,
        "proUrlList": '',
        "webcrawlerSubTaskId": webcrawlerSubTaskId,
    }
    try:
        res = requests.post(
            url=localhost,
            data=data,
            timeout=5,
        )
        print(goods_sort[-100:], '以上为发送的数据,接口已经调用,response为:', res.text)
    except Exception as e:
        print('我方服务器接口错误,错误:', e)


# main entry point
def main():
    search_citys = ['上海', '北京', '天津', '重庆市', '河北', '山西', '辽宁', '吉林', '黑龙江',
                    '江苏', '浙江省', '安徽省', '福建', '江西', '山东', '河南', '湖北', '湖南',
                    '广东', '海南', '四川', '贵州', '云南', '陕西', '甘肃', '青海', '台湾',
                    '内蒙古', '广西', '西藏', '宁夏', '新疆', '香港', '澳门', '海外']
    try:
        search_data_list = sys.argv[1:-1]
        search_data = ' '.join(search_data_list)
        # print('接收的参数为:', search_data)
        localhost = sys.argv[-1] + 'taoBaoListPythonToDB.action'  # endpoint the aggregated result is POSTed to
        # print('接收的请求连接为:', localhost)
        search_data = search_data.replace('\'', '\"')
        # print('修改后的参数为:', search_data, type(search_data))
        search_data = json.loads(search_data)
        # print('转换json后的为', search_data, type(search_data))
        keywords = search_data.get('keywords', None)              # search keyword
        taskList = search_data.get('taskList', None)              # level-4 task ids, "<page>:<spiderRunTaskId>"
        # print('四级任务列表:', taskList)
        startPage = int(taskList[0].split(':')[0])                # first page of the requested slice
        spider_startPage = ''
        endPage = int(taskList[-1].split(':')[0])                 # last page of the requested slice
        spider_endPage = ''
        startPrice = search_data.get('startPrice', 0)             # minimum price filter
        endPrice = search_data.get('endPrice', 0)                 # maximum price filter
        spider_endPrice = '' if endPrice == 0 else int(endPrice)
        sortType = search_data.get('sortType', None)              # sort order
        urlPathId = search_data.get('urlPathId', None)
        platFormId = search_data.get('platFormId', None)          # platform type (3 = Tmall only)
        spider_platFormId = 'tab_mall' if platFormId == 3 else ''
        webcrawlerSubTaskId = search_data.get('webcrawlerSubTaskId', None)
        taskId = search_data.get('taskId', None)
        webcrawlerTaskId = search_data.get('webcrawlerTaskId', None)
        taskChannel = search_data.get('taskChannel', None)
        try:
            cookies = get_cookies()
            # get_page(search_citys,keywords,cookie_list,headers,spider_platFormId=spider_platFormId)
            # cookies=random.choice(cookie_list)
            token = cookies.get('_m_h5_tk').split('_')[0]
            appKey = 12574478
            t = int(time.time() * 1000)
            city_andpage_list = []
            for city in search_citys:
                data = {"q": keywords, "search": "提交", "tab": "all", "sst": "1", "n": 20,
                        "buying": "buyitnow", "m": "api4h5", "token4h5": "", "style": "list",
                        "closeModues": "nav,selecthot,onesearch", "sort": "_sale", "loc": city}
                if spider_platFormId:
                    data['filter'] = 'tab_mall'
                data = '{}'.format(data)
                # mtop h5 signature: md5 of "token&t&appKey&data"
                m2 = hashlib.md5()
                m2.update('{}&{}&{}&{}'.format(token, t, appKey, data).encode('utf-8'))
                sign = m2.hexdigest()
                params = {
                    'jsv': '2.3.16',
                    'v': '1.0',
                    'H5Request': 'true',
                    'ecode': '1',
                    'appKey': appKey,
                    't': t,
                    'sign': sign,
                    'api': 'mtop.taobao.wsearch.h5search',
                    'type': 'jsonp',
                    'dataType': 'jsonp',
                    'data': data
                }
                # print(city)
                res = get_page(cookies, params, city, headers, retry=0, keywords=keywords)
                # print('res',res)
                if res:
                    city_andpage_list.append(res)
            print('*' * 10, '城市个数共计', len(city_andpage_list), '获取了城市页面,如下:', city_andpage_list)
            pool = Pool(5)
            # goods_all_q = Queue()
            for city_andpage in city_andpage_list:
                for city, pages in city_andpage.items():
                    # print(city, pages)
                    if int(pages) >= endPage * 2:
                        pages = endPage * 2
                    # print('{}需要爬取{}页'.format(city,pages))
                    print('pages', pages)
                    for page in range(1, pages + 1):
                        data = {"q": keywords, "sst": "1", "n": 20, "buying": "buyitnow", "m": "api4h5",
                                "token4h5": "", "sort": "_sale", "loc": city, "page": page}
                        # data = {"q": keywords, "sst": "1", "n": 20, "buying": "buyitnow", "m": "api4h5", "token4h5": "",
                        #         "style": "list", "closeModues": "nav,selecthot,onesearch", "loc": city,"page": page}
                        data = {"q": keywords, "search": "提交", "tab": "all", "sst": "1", "n": 20,
                                "buying": "buyitnow", "m": "api4h5", "token4h5": "", "style": "list",
                                "closeModues": "nav,selecthot,onesearch", "sort": "_sale", "loc": city, "page": page}
                        # data = {"event_submit_do_new_search_auction": "1", "_input_charset": "utf-8", "topSearch": "1",
                        #         "atype": "b", "searchfrom": "1", "action": "home:redirect_app_action", "from": "1",
                        #         "q": keywords, "sst": "1", "n": 20, "buying": "buyitnow", "m": "api4h5", "token4h5": "",
                        #         "style": "list", "closeModues": "nav,selecthot,onesearch",
                        #         "sort": "_sale", "loc": city, "page": page}
                        if '全国' == city:
                            del data['loc']
                        if spider_platFormId:
                            data['filter'] = 'tab_mall'
                        data = '{}'.format(data)
                        m2 = hashlib.md5()
                        m2.update('{}&{}&{}&{}'.format(token, t, appKey, data).encode('utf-8'))
                        sign = m2.hexdigest()
                        params = {
                            # 'jsv': '2.3.16',
                            # 'v': '1.0',
                            # 'H5Request': 'true',
                            # 'ecode': '1',
                            'appKey': appKey,
                            't': t,
                            'sign': sign,
                            'api': 'mtop.taobao.wsearch.h5search',
                            'type': 'jsonp',
                            'dataType': 'jsonp',
                            'data': data
                        }
                        pool.add(gevent.spawn(parse_citys, cookies, params, city, headers))
            goods_all = []
            time.sleep(5)
            if goods_all_q.empty():
                # print('goods_all为空')
                raise NameError
            while not goods_all_q.empty():
                goods_data = goods_all_q.get()
                goods_all.append(goods_data)
            print('总共获取数据的有{}个'.format(len(goods_all)))
            if not goods_all:
                req('', localhost, webcrawlerSubTaskId)
            sort_a = sorted({j: i[j] for i in goods_all for j in i.keys()}.items(),
                            key=lambda a: a[1], reverse=True)
            sort_b = [{i[0]: i[1]} for i in sort_a]
            goods_list = sort_b[(startPage - 1) * 44:endPage * 44]
            print('发送的数据共计', len(goods_list))
            goods_sort = g_sort(goods_list, taskId, webcrawlerTaskId, webcrawlerSubTaskId,
                                taskList, taskChannel, keywords)
            req(goods_sort, localhost, webcrawlerSubTaskId)
        except ValueError:
            print('采集的关键字淘宝数据为空,停止爬虫')
            req('', localhost, webcrawlerSubTaskId)
        except NameError:
            error_req('', localhost, webcrawlerSubTaskId)
        except Exception as e:
            print('爬虫出错,有可能传参有问题', e, search_data)
            response = error_req('', localhost, webcrawlerSubTaskId)
            # print('响应的结果为', response)
        # print('爬虫执行完毕,最终的发送地址为', search_data, localhost)
    except Exception as e:
        print('参数异常,无法发送最终的请求,错误:', e)


if __name__ == '__main__':
    start_t = time.time()
    print('开始启动爬虫任务,当前时间为:', datetime.datetime.now())
    main()
    print('爬虫结束,当前时间为:', datetime.datetime.now())
    print('本次爬虫exe程序执行时长为:', time.time() - start_t)
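The script is driven entirely by command-line arguments: everything before the last argument is joined and parsed as a JSON parameter blob, and the last argument is the base URL that taoBaoListPythonToDB.action is appended to. A hedged invocation sketch; the script file name, all ids, the keyword and the callback URL below are made up, only the key names come from the code above:

# Example of how the crawler expects to be launched; every value here is made up.
# sys.argv[1:-1] is joined and parsed as JSON, sys.argv[-1] is the callback base URL.
import json
import subprocess

params = {
    "keywords": "手机壳",
    "taskList": ["1:1001", "2:1002"],   # "<page>:<spiderRunTaskId>"
    "startPrice": 0,
    "endPrice": 0,
    "sortType": 1,
    "urlPathId": 1,
    "platFormId": 1,                    # 3 would restrict results to Tmall (tab_mall filter)
    "webcrawlerSubTaskId": 1,
    "taskId": 1,
    "webcrawlerTaskId": 1,
    "taskChannel": 1,
}

subprocess.run([
    'python', 'taobao_list.py',          # hypothetical file name for the script above
    json.dumps(params, ensure_ascii=False),
    'http://127.0.0.1:8080/',            # base URL; 'taoBaoListPythonToDB.action' is appended
])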