Spider/spider/kuwo.py爬虫代码:
# -*- coding: utf-8 -*-
import scrapy
import demjson
import re
import os
from ..items import MusicItem, SingerItem
from bloomfilter import Bloomfilter #布隆过滤
class KuwoSpider(scrapy.Spider):
# Identifier Scrapy uses to refer to this spider (e.g. `scrapy crawl kuwo`).
name = 'kuwo'
# Domains this spider is meant to stay within.
allowed_domains = ['kuwo.cn']
# Single base URL of the artist-list endpoint. It deliberately ends with an
# empty `prefix=` parameter: start_requests appends one lowercase letter to
# it for each request it issues.
start_urls = [
'http://artistlistinfo.kuwo.cn/mb.slist?stype=artistlist&category=0&order=dict&pn=0&rn=100&encoding=utf8&prefix='
]
def __init__(self, name=None, **kwargs):
    """Initialise the spider and the Bloom filter used to skip known singers.

    Args:
        name: Optional spider name, forwarded to ``scrapy.Spider``.
        **kwargs: Spider arguments, forwarded to ``scrapy.Spider`` so that
            each one becomes an attribute on the spider instance.
    """
    # Unpack the arguments. Passing ``kwargs=kwargs`` would hand the base
    # class a single keyword called "kwargs", so every spider argument
    # would end up inside ``self.kwargs`` instead of being set as its own
    # attribute.
    super(KuwoSpider, self).__init__(name=name, **kwargs)
    state_file = "singer.state"
    if os.path.exists(state_file):
        # A state file from an earlier run exists: build the filter from it.
        # The file extension is arbitrary.
        # NOTE(review): presumably Bloomfilter(<path>) restores saved state --
        # confirm against the bloomfilter package.
        self.bloom = Bloomfilter(state_file)
    else:
        # No saved state: start with a fresh filter.
        # NOTE(review): 10000000 is presumably the filter capacity -- confirm.
        self.bloom = Bloomfilter(10000000)
def start_requests(self):
    """Yield one artist-list request per lowercase ASCII letter.

    Each request URL is the first entry of ``start_urls`` with the letter
    appended (completing the trailing ``prefix=`` parameter). The letter is
    also carried in ``meta['prefix']``, and the response is handled by
    ``parse``.
    """
    # Code points 97..122 are 'a'..'z'.
    for letter in map(chr, range(97, 123)):
        request = scrapy.Request(
            url=self.start_urls[0] + letter,
            callback=self.parse,
            dont_filter=True,
            meta={'prefix': letter},
        )
        yield request
def parse(self, response):
meta = response.meta
json_obj = demjson.decode(response.text)
total = json_obj.get("total", "0")
total = int(total) if total.isdigit() else 0
rn = json_obj.get("rn", "100")
rn = int(rn) if rn.isdigit() else 100
total_page = total//rn if total % rn == 0 else total//rn+1
# 处理数据并存储
artistlist = json_obj.get('artistlist', [])
for artist in artistlist:
pic = artist.get('pic')
if not self.bloom.test(pic):
item = SingerItem()
item['singer_id'] = artist.get("id")
url = "http://search.kuwo.cn/r.s?stype=artist2music&artistid={}&pn=0&rn=100&sortby=0&show_copyright_off=1&alflac=1&pcmp4=1&encoding=utf8&vipver=MUSIC_8.7.7.0_PQ&plat=pc&devid=51016591&thost=search.kuwo.cn".format(item['singer_id'])
yield scrapy.Request(
url=url,
callback=self.parse_music,
dont_filter=True,
)
item['singer_name'] = artist.