具体搭建步骤不再赘述,这里主要用到了 fake_useragent、PhantomJS 和代理池(proxy pool)。
pyspider 的爬取相当智能,在获取不到图片时会适当地暂停一段时间,再试探性地重新爬取;配合 fake_useragent、代理池和 PhantomJS,爬取成功率在 90% 以上。
代码是扒的别人的然后修改提高速度和成功率的,数据总量在百G左右,磁盘大的可以扒一扒。
代码如下:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-03-25 00:59:45
# Project: taobaomm
from pyspider.libs.base_handler import *
from fake_useragent import UserAgent
import base64
import requests
import random
import sys
# Python 2-only idiom: reload sys so that setdefaultencoding is callable again,
# then make UTF-8 the interpreter-wide default for implicit str/unicode
# conversions. This file will not run unchanged on Python 3.
reload(sys)
sys.setdefaultencoding('UTF-8')
# First and last (inclusive) listing-page numbers requested by Handler.on_start.
PAGE_START = 1
PAGE_END = 4301
# Root directory under which Deal creates the per-model folders and files.
DIR_PATH = '/root/images/tbmm'
class Handler(BaseHandler):
    """pyspider handler that walks the mm.taobao.com model listing and saves images.

    Crawl chain: on_start -> index_page -> detail_page -> domain_page -> save_img.
    Files are written through a ``Deal`` helper, one directory per model.
    """

    # The proxy list is fetched once, when the class body is executed
    # (i.e. when the project script is loaded), from a local proxy-pool service.
    r = requests.get(u'http://127.0.0.1:5010/get_all/')
    # SECURITY NOTE(review): eval() executes whatever the proxy-pool endpoint
    # returns. This is only acceptable while that service is local and trusted;
    # consider json.loads / ast.literal_eval once the response format is confirmed.
    proxy = random.choice(eval(r.text))
    ua = UserAgent()
    # A single proxy and a single random User-Agent are picked here and then
    # shared by every request issued through this handler class.
    crawl_config = {
        "proxy": proxy,
        "headers": {
            "User-Agent": ua.random
        },
    }

    def __init__(self):
        """Set the listing URL prefix, the inclusive page range and the file helper."""
        self.base_url = 'https://mm.taobao.com/json/request_top_list.htm?page='
        self.page_num = PAGE_START
        self.total_num = PAGE_END
        self.deal = Deal()

    def on_start(self):
        """Schedule one crawl per listing page in [page_num, total_num].

        A local loop variable is used so that ``self.page_num`` is not consumed:
        calling ``on_start`` again on the same instance schedules the same
        range again instead of silently doing nothing.
        """
        for page in range(self.page_num, self.total_num + 1):
            self.crawl(self.base_url + str(page), callback=self.index_page)

    def index_page(self, response):
        """Follow every ``.lady-name`` link on a listing page (fetched with JS)."""
        for each in response.doc('.lady-name').items():
            self.crawl(each.attr.href, callback=self.detail_page, fetch_type='js')

    def detail_page(self, response):
        """Read the model's personal-domain text and crawl it as an https URL."""
        domain = response.doc('.mm-p-domain-info li > span').text()
        if domain:
            # The page text is prefixed with the scheme as-is; presumably it is
            # protocol-relative ("//...") -- TODO confirm against the live page.
            page_url = 'https:' + domain
            self.crawl(page_url, callback=self.domain_page)

    def domain_page(self, response):
        """Create the model's directory, save the brief text and queue its images.

        The model name is base64-encoded to obtain an ASCII directory/file name.
        The URL-safe alphabet is required: standard base64 can emit '/' which
        would be interpreted as a path separator and break every path built
        from the name.
        """
        raw_name = response.doc('.mm-p-model-info-left-top dd > a').text()
        # Encode explicitly instead of relying on the process-wide default encoding.
        name = base64.urlsafe_b64encode(raw_name.encode('utf-8')).decode('ascii')
        dir_path = self.deal.mkDir(name)
        brief = response.doc('.mm-aixiu-content').text()
        if not dir_path:
            return
        self.deal.saveBrief(brief, dir_path, name)
        count = 1
        for img in response.doc('.mm-aixiu-content img').items():
            url = img.attr.src
            if not url:
                # <img> without a src attribute: nothing to download.
                continue
            extension = self.deal.getExtension(url)
            file_name = name + str(count) + '.' + extension
            count += 1
            # The target location travels with the task via ``save`` and is
            # read back from ``response.save`` in save_img.
            self.crawl(url, callback=self.save_img,
                       save={'dir_path': dir_path, 'file_name': file_name})

    def save_img(self, response):
        """Write the downloaded image bytes to the path carried in ``response.save``."""
        content = response.content
        dir_path = response.save['dir_path']
        file_name = response.save['file_name']
        file_path = dir_path + '/' + file_name
        self.deal.saveImg(content, file_path)
import os
class Deal(object):
    """File-system helper: creates directories under DIR_PATH and writes files."""

    def __init__(self):
        """Normalise DIR_PATH to end with '/' and make sure the directory exists."""
        self.path = DIR_PATH
        if not self.path.endswith('/'):
            self.path = self.path + '/'
        self._ensure_dir(self.path)

    @staticmethod
    def _ensure_dir(dir_path):
        """Create dir_path (with parents); an already existing directory is fine."""
        try:
            os.makedirs(dir_path)
        except OSError:
            # Either the directory already exists (possibly created concurrently
            # between a check and the call) or creation genuinely failed.
            if not os.path.isdir(dir_path):
                raise

    def mkDir(self, path):
        """Create the sub-directory ``path`` under the root and return its full path.

        Surrounding whitespace in ``path`` is stripped. The same path is
        returned whether the directory was just created or already existed.
        """
        dir_path = self.path + path.strip()
        self._ensure_dir(dir_path)
        return dir_path

    def saveImg(self, content, path):
        """Write the bytes ``content`` to the file at ``path``, replacing it."""
        # ``with`` guarantees the handle is closed even if write() raises.
        with open(path, 'wb') as f:
            f.write(content)

    def saveBrief(self, content, dir_path, name):
        """Write the text ``content`` as UTF-8 to ``<dir_path>/<name>.txt``."""
        file_name = dir_path + "/" + name + ".txt"
        # Binary mode: the payload is already encoded bytes. The original never
        # closed this handle; the context manager flushes and closes it.
        with open(file_name, "wb") as f:
            f.write(content.encode('utf-8'))

    def getExtension(self, url):
        """Return the text after the last '.' of ``url``, ignoring query/fragment."""
        # Drop "?query" and "#fragment" so they do not end up in the file name.
        path = url.split('?', 1)[0].split('#', 1)[0]
        return path.split('.')[-1]
/**
*
* __ (__`\
* (__`\ \\`\
* `\\`\ \\ \
* `\\`\ \\ \
* `\\`\#\\ \#
* \_ ##\_ |##
* (___)(___)##
* (0) (0)`\##
* |~ ~ , \##
* | | \##
* | /\ \## __..---'''''-.._.._
* | | \ `\## _.--' _ `.
* Y | \ `##' \`\ \
* / | \ | `\ \
* /_...___| \ | `\\
* / `. | / ##
* | | | / ####
* | | | / ####
* | () () | \ | | _.-' ##
* `. .' `._. |______..| |-'|
* `------' | | | | | || |
* | | | | | || |
* | | | | | || |
* | | | | | || |
* _____ | | | |____| || |
* / `` |-`/ ` |` |
* \________\__\_______\__\
* """"""""" """""""'"""
* Don't be a fucking stupid donkey! No, this is a fucking mule!
*/