JD.com product info and price scraper
Python 2.7
# -*- coding:utf-8 -*-
# import modules
import urllib2, re, urllib
from bs4 import BeautifulSoup
import json, time
import sys
reload(sys)
sys.setdefaultencoding('utf8')

# output file for the scraped results
fout = open(r'res.txt', "wb")
tot = 0
# scraper class
class JD:
    # number of products scraped so far
    prodNum = 1

    # initialise with the list URL and a page number
    def __init__(self, baseurl, page):
        self.baseurl = baseurl
        self.page = page
        # assemble the full list-page URL
        self.url = self.baseurl + '&' + 'page=' + str(self.page)
    # fetch a URL and return the page source
    def getHtml(self, url):
        try:
            # build the request object
            request = urllib2.Request(url)
            # open the response
            response = urllib2.urlopen(request)
            # read the page source
            html = response.read()
        except:
            # brief back-off, then retry
            time.sleep(0.1)
            return self.getHtml(url)
        return html
    # total number of list pages
    def getNum(self, html):
        # wrap the source in a BeautifulSoup object
        soup = BeautifulSoup(html)
        # locate the node that carries the page total
        items = soup.find_all('span', class_='p-skip')
        # pull the total page count out of the first match
        for item in items:
            pagenum = item.find('em').find('b').string
            return pagenum
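    # NOTE (assumption): the selectors above imply list-page markup roughly
    # like <span class="p-skip">...<em>共<b>120</b>页</em>...</span>, i.e. the
    # page total sits in a <b> inside an <em> inside span.p-skip; this is
    # inferred from the code itself, not re-verified against the live page.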
    # ids of every product on a list page
    def getIds(self, html):
        # compile the matching pattern
        pattern = re.compile('<a target="_blank" href="//item.jd.com/(.*?).html".*?>')
        # collect all matches
        items = re.findall(pattern, html)
        return items
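    # NOTE (assumption): the pattern expects product links of the form
    # <a target="_blank" href="//item.jd.com/<sku>.html" ...> and captures
    # the sku from the href; if JD changes this anchor markup, getIds
    # silently returns an empty list.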
    # ids of same-model variants for a given product id
    def getIdByItems(self, id):
        # assemble the product URL
        url = basePd + str(id) + '.html'
        # fetch the product page source
        html = self.getHtml(url)
        # wrap the source in a BeautifulSoup object
        soup = BeautifulSoup(html)
        # locate the variant block
        items = soup.find('div', class_='dd clearfix')
        l = []
        # build the id list
        for item in items:
            pattern = re.compile('href="//item.jd.com/(.*?).html".*?>')
            id = re.findall(pattern, str(item))
            if id:
                l += id
        return l
    # product price via JD's price API
    def getPrice(self, id):
        url = 'http://p.3.cn/prices/mgets?skuIds=J_' + str(id)
        jsonString = self.getHtml(url)
        jsonObject = json.loads(jsonString.decode())
        price_jd = jsonObject[0]['p']
        price_mk = jsonObject[0]['m']
        fout.write('jd price:' + str(price_jd) + '\n')
        fout.write('market price:' + str(price_mk) + '\n')
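    # NOTE (assumption): this expects the p.3.cn endpoint to answer with a
    # JSON array shaped like [{"id": "J_1234567", "p": "99.00", "m": "199.00"}],
    # where "p" is JD's own price and "m" the market/list price; if the API
    # replies in a different shape, jsonObject[0]['p'] will raise.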
    # product image download (currently disabled)
    def getImg(self, html, subid):
        '''
        pattern = re.compile(r'<img id=.*?data-origin="(.*?)" alt=.*?', re.S)
        items = re.findall(pattern, html)
        for item in items:
            imgurl = 'http:%s' % (item)
            urllib.urlretrieve(imgurl, 'd:/temp/jdimg/%s.jpg' % (str(subid)))
        '''
    # product details
    def getContent(self, html, subid):
        soup = BeautifulSoup(html)
        title = soup.find('div', class_='sku-name')
        fout.write('\n-----------------' + str(JD.prodNum) + '--------------------\n')
        try:
            for t in title:
                fout.write('name:' + t.string + '\n')
        except:
            return
        time.sleep(1)
        # price
        self.getPrice(subid)
        # first parameter list: blank out the <p> labels
        items1 = soup.find_all('ul', class_='parameter1 p-parameter-list')
        for item in items1:
            p = item.findAll('p')
            for i in p:
                i.string = ""
        # second parameter list: basic product info
        items2 = soup.find_all('ul', class_='parameter2 p-parameter-list')
        for item in items2:
            p = item.findAll('li')
            if len(str(p[0].string)) > 0:
                fout.write(str(p[0].string))
                fout.write('\n')
            '''
            for i in p:
                if len(str(i.string)) > 0:
                    fout.write(str(i.string))
                    fout.write('\n')
            '''
        # specs and packaging (currently disabled)
        '''
        items3 = soup.find_all('div', class_='Ptable-item')
        for item in items3:
            contents1 = item.findAll('dt')
            contents2 = item.findAll('dd')
            for i in range(len(contents1)):
                if len(str(contents1[i].string)) > 0 and len(str(contents2[i].string)) > 0:
                    fout.write(contents1[i].string)
                if len(str(contents2[i].string)) > 0:
                    fout.write(str(contents2[i].string))
                    fout.write('\n')
        '''
        JD.prodNum += 1
        print JD.prodNum
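    # NOTE (assumption): the detail-page extraction relies on markup with
    # <div class="sku-name"> for the product title and <ul class="parameter2
    # p-parameter-list"> <li> entries for the spec summary, as implied by the
    # selectors above; a JD layout change will break it silently.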
    # kick off the crawl
    def start(self):
        # first request, to learn the total page count
        html = self.getHtml(self.url)
        pageNum = self.getNum(html)
        print 'doing............'
        #time.sleep(3)
        print 'finish. all', pageNum, 'pages'
        #time.sleep(1)
        print 'doing.........'
        # loop 1 -- list pages
        for page in range(1, int(pageNum) + 1):
            url = self.baseurl + '&' + 'page=' + str(page)
            html = self.getHtml(url)
            ids = self.getIds(html)
            # loop 2 -- products on the page
            for id in ids:
                urlprod = basePd + str(id) + '.html'
                htmlprod = self.getHtml(urlprod)
                '''
                subids = self.getIdByItems(id)
                '''
                self.getContent(htmlprod, id)
                self.getImg(htmlprod, id)
                '''
                # loop 3 -- same-model variant list
                for subid in subids:
                    urlsubprod = basePd + str(subid) + '.html'
                    subhtml = self.getHtml(urlsubprod)
                    time.sleep(1)
                    self.getContent(subhtml, subid)
                    self.getImg(subhtml, subid)
                '''
# base URL for product detail pages
basePd = 'http://item.jd.com/'
# entry URL for the crawl
baseURL = 'http://list.jd.com/list.html?cat=9987,653,655'
# create the scraper object
spider = JD(baseURL, 1)
# start scraping
spider.start()
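
For a quick sanity check of the price lookup on its own, the sketch below hits the same endpoint for a single SKU. The URL and the response shape are taken from getPrice above; whether p.3.cn still serves this format, and the SKU id used, are assumptions.

# minimal standalone price lookup (Python 2.7); assumes the p.3.cn endpoint
# still returns [{"id": "J_<sku>", "p": "...", "m": "..."}]
import json
import urllib2

def fetch_price(sku_id):
    url = 'http://p.3.cn/prices/mgets?skuIds=J_%s' % sku_id
    data = json.loads(urllib2.urlopen(url).read())
    # 'p' = JD price, 'm' = market/list price
    return data[0]['p'], data[0]['m']

if __name__ == '__main__':
    print fetch_price('1234567')  # hypothetical SKU id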