python -- JD Books (京东图书)

# -*- coding: utf-8 -*-
import scrapy
import urllib.request
import re
import random
from jdgoods.items import JdgoodsItem
from lxml import etree
from scrapy.http import Request


class GoodsSpider(scrapy.Spider):
    name = 'goods'
    allowed_domains = ['jd.com']
    url_lst = []
    pd_lst = []
    pd_pages = {}
    # start_urls = ['http://jd.com/']
    # Pool of User-Agent strings; one is chosen at random for each request.
    ua = ['Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
          'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
          'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
          'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
          'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
          'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
          ]

    def start_requests(self):
        # Fetch the JD books landing page with a random User-Agent.
        req1 = urllib.request.Request("https://book.jd.com/")
        user_agent = random.choice(self.ua)
        req1.add_header("User-Agent", user_agent)
        all_data = urllib.request.urlopen(req1).read().decode('gbk', 'ignore')
        # print(all_data)
        # Pull the category URLs out of the page's embedded JSON-like data.
        pat1 = '"URL":"(.*?)","ANCHOR":'
        all_html_data = re.compile(pat1).findall(all_data)
        # print(all_html_data)
        for i in all_html_data:
            a = i.split(',')
            # URLs are JSON-escaped ("\/"), so unescape them and add the scheme.
            self.url_lst.append("http:" + a[len(a) - 1].replace('"URL":"', "").replace('\\/', '/'))
            # print("http:" + a[len(a) - 1].replace('"URL":"', "").replace('\\/', '/'))
        # Deduplicate the collected category URLs.
        self.url_lst = list(set(self.url_lst))

        for j in self.url_lst:
            try:
                req2 = urllib.request.Request(j)
                user_agent = random.choice(self.ua)
                req2.add_header("User-Agent", user_agent)
                sub_data = urllib.request.urlopen(req2).read().decode('gbk', 'ignore')
                # Collect the "cat=..." parameter of every list.jd.com link.
                pat2 = 'href="//list.*cat=(.*?)[&"]'
                all_html_addr = re.compile(pat2).findall(sub_data)
                for lst_num in all_html_addr:
                    self.pd_lst.append(lst_num)
            except Exception:
                pass

        x = 0
        for a in self.pd_lst:
            this_url = 'https://list.jd.com/list.html?cat=' + a
            req3 = urllib.request.Request(this_url)
            user_agent = random.choice(self.ua)
            req3.add_header("User-Agent", user_agent)
            html_data = urllib.request.urlopen(req3).read().decode('utf-8', 'ignore')
            # The page count sits between two CJK characters, e.g. 共<b>100</b>页.
            pat3 = u"[\u4e00-\u9fa5]<b>(.*?)</b>[\u4e00-\u9fa5]"
            pages = re.compile(pat3).findall(html_data)
            self.pd_pages[a] = "".join(pages)
            x += 1
            if x > 1:
                break

        y = 0
        for key in self.pd_pages:
            # print(key + ":" + str(self.pd_pages[key]))
            for p in range(1, int(self.pd_pages[key])):
                thispage = 'https://list.jd.com/list.html?cat=' + key + '&page=' + str(p)
                # print(thispage)
                y += 1
                if y > 2:
                    break
                # yield Request(thispage, callback=self.parse)

        # For demonstration, only a single category list page is actually crawled.
        yield Request("https://list.jd.com/list.html?cat=1713,3260,3339", callback=self.parse)

    def parse(self, response):
        item = JdgoodsItem()
        try:
            # Current category breadcrumb.
            content_lst = response.xpath('//span[@class="curr"]/text()').extract()
            p_content = "---".join(content_lst)
            print(p_content)

            book_name_lst = response.xpath('//div[@class="p-name"]/a/em/text()').extract()
            # Product links; the SKU id is embedded in each item URL.
            book_price_html = response.xpath('//div[@class="p-img"]/a/@href').extract()
            book_pub_lst = response.xpath('//span[@class="p-bi-store"]/a/text()').extract()
            book_seller = response.xpath('//span[@class="curr-shop"]/text()').extract()

            # Price API: https://p.3.cn/prices/mgets?&skuIds=J_11481255
            print("书名--出版社----销售商---已下载")  # "title--publisher--seller--downloaded"
            skuIds = []
            price = []
            comment = []
            # Extract the numeric SKU id from each item URL, e.g. //item.jd.com/11481255.html.
            for n in range(len(book_price_html)):
                pat = r'//.*/(\d+)\.html'
                tmp = re.compile(pat).findall(book_price_html[n])
                skuIds.append("".join(tmp))

            for i in range(0, len(book_name_lst)):
                req4 = urllib.request.Request('https://p.3.cn/prices/mgets?&skuIds=J_' + str(skuIds[i]))
                user_agent = random.choice(self.ua)
                req4.add_header("User-Agent", user_agent)
                p = urllib.request.urlopen(req4).read().decode()
                # The response is a small JSON array; the "p" field holds the current price.
                pat = '"p":"(.*?)"'
                p1 = re.compile(pat).findall(p)
                price.append("".join(p1))
            print("书价格---已下载")  # "prices downloaded"
            # Comment counts: https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds=11338771
            for i in range(0, len(book_name_lst)):
                req5 = urllib.request.Request(
                    'https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds='
                    + str(skuIds[i]))
                user_agent = random.choice(self.ua)
                req5.add_header("User-Agent", user_agent)
                c = urllib.request.urlopen(req5).read().decode("utf-8", 'ignore')
                pat = '"CommentCount":(.*?),'
                c1 = re.compile(pat).findall(c)
                comment.append("".join(c1))
            print("书评论---已下载")  # "comment counts downloaded"
            for n in range(len(book_name_lst)):
                print(book_name_lst[n] + ':' + str(price[n]) + ':' + book_seller[n]
                      + ':' + book_pub_lst[n] + ':' + str(comment[n]))
        except Exception:
            pass

        # Note: no fields are ever assigned above, so an empty item is yielded.
        yield item
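
The spider imports JdgoodsItem from jdgoods.items, but the post never shows that file, and parse() yields the item without filling it in. A minimal sketch of what items.py could look like, with field names that are purely assumptions based on the data the spider prints (title, price, publisher, seller, comment count):

# jdgoods/items.py -- hypothetical sketch; the original file is not shown in the post
import scrapy


class JdgoodsItem(scrapy.Item):
    name = scrapy.Field()       # book title (assumed field name)
    price = scrapy.Field()      # price fetched from p.3.cn
    publisher = scrapy.Field()  # publisher
    seller = scrapy.Field()     # seller / shop name
    comment = scrapy.Field()    # comment count from club.jd.com

With the project's settings in place, the spider runs with the standard Scrapy command:

scrapy crawl goods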
posted @ 2017-12-10 23:28 沧海一粒水