Scraping Tmall product information with Scrapy
spider
# -*- coding: utf-8 -*-
from urllib.parse import urlencode
import requests
import scrapy
import re
import json
from ..items import TmallItem

# Cookies captured from a logged-in Taobao browser session; Taobao search
# requires a login, so replace these with cookies from your own session.
cookie = {'thw': 'cn', 'hng': 'CN%7Czh-CN%7CCNY%7C156', 'tracknick': 'yzhy1372', 'tg': '0',
          'miid': '813697773983481206',
          'x': 'e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0',
          '_cc_': 'UIHiLt3xSw%3D%3D',
          'enc': '52fRsc7qpI96LDqf%2FkMA7AfWwN0%2BYmGMXsa4AdC3He4jEbrP%2BRbmYwz%2Bn3xwMrIk4fqBuRCR6BYtQvI%2FP7UBRw%3D%3D',
          'UM_distinctid': '165c600d3903a8-0dc9190eb920d3-c343567-100200-165c600d39319',
          'cna': 'iSbqEnsQrkoCAXM7KlL0pQWu', 't': '8489c373deedc2a297ebe4c4ad6debb5',
          '_uab_collina': '153991002330679083015734',
          '_umdata': '6AF5B463492A874D05644EF9A3CE888C0BB3EC8395620198BCCF71C40733CB6AAB98C444C566382ECD43AD3E795C914C010C8EDA083E64FAFA9E46E3CF4DEA41',
          '_m_h5_tk': 'bf46d22c8564ad537f01664eb002112c_1539921942514',
          '_m_h5_tk_enc': 'f2a1bff4b69d2c036314c66504744070', 'v': '0',
          'cookie2': '2b9488dea40dbe840f20ea5f14836ef7', '_tb_token_': 'fb83ee7ebeed7',
          'alitrackid': 'www.taobao.com', 'lastalitrackid': 'www.taobao.com',
          'JSESSIONID': '9787B4CF4D2812E2BA1E407B224AE53A',
          'isg': 'BOfnzJhvcDexNPXcxwaGYkk8dhtxxJBNn5b9BrlUMnacqAVqyz-ynoHpzuiTQJPG',
          'Hm_lvt_dde6ba2851f3db0ddc415ce0f895822e': '1539912803,1539913323,1539944839,1539944853',
          'Hm_lpvt_dde6ba2851f3db0ddc415ce0f895822e': '1539944853', 'unb': '624984624',
          'uc1': 'cookie16=VFC%2FuZ9az08KUQ56dCrZDlbNdA%3D%3D&cookie21=WqG3DMC9FxUx&cookie15=VT5L2FSpMGV7TQ%3D%3D&existShop=false&pas=0&cookie14=UoTfItnW5e2f1g%3D%3D&tag=8&lng=zh_CN',
          'sg': '244', '_l_g_': 'Ug%3D%3D', 'skt': '5c93ad4f47f0c1ca',
          'cookie1': 'U%2BTs5qAQHjB1CoYPMJcEQ4UfC6zh%2FdhqLG66mPjcz38%3D', 'csg': 'e312c3a6',
          'uc3': 'vt3=F8dByRmq%2Bp63ob4wR7I%3D&id2=VW3j%2BbmcVcIV&nk2=GhETDBFSx%2Fs%3D&lg2=VT5L2FSpMGV7TQ%3D%3D',
          'existShop': 'MTUzOTk0NTUzNw%3D%3D', 'lgc': 'yzhy1372', 'dnk': 'yzhy1372',
          '_nk_': 'yzhy1372', 'cookie17': 'VW3j%2BbmcVcIV', 'mt': 'np='}


class MianbaoSpider(scrapy.Spider):
    name = "mianbao"
    # allowed_domains = ["https://www.taobao.com"]

    def start_requests(self):
        url = 'https://s.taobao.com/search'
        pars = {
            'q': '女士上衣',                      # search keyword ("women's tops")
            'initiative_id': 'staobaoz_20181019',
            'ie': 'utf8',
            'tab': 'mall',                        # market: all = Taobao + Tmall, mall = Tmall only, old = second-hand
            # 's': '0',                           # result offset, grows by 44 per page (appended below)
            'sort': 'sale-desc'                   # sort order: default / credit-desc (seller rating) / price-asc / price-desc
        }
        data = urlencode(pars)
        # Paginate: offset s grows by 44 per page, so this covers the first 11 result pages.
        urls = [url + '?' + data + '&s=' + str(page) for page in range(0, 450, 44)]
        for u in urls:
            yield scrapy.Request(u, self.mianbao, cookies=cookie)

    def mianbao(self, response):
        # The search results are embedded in the page as the g_page_config JS object.
        res = re.compile(r'g_page_config = (.*?);\s*g_srp_loadCss', re.S)
        datas = json.loads(res.findall(response.text)[0])['mods']['itemlist']['data']['auctions']
        for i in datas:
            title = i['raw_title']                    # product name
            pic_url = 'http:' + i['pic_url']          # list-page image link
            # view_price = i['view_price']            # product price (taken from the detail page instead)
            detail_url = 'https:' + i['detail_url']   # product detail-page URL
            nick = i['nick']                          # shop name
            view_sales = i['view_sales']              # number of payers
            item_loc = i['item_loc']                  # item location
            comment_count = i['comment_count']        # comment count
            user_id = i['user_id']                    # seller id, needed to fetch comments later
            yield scrapy.Request(detail_url, self.detail_info,
                                 meta={'title': title, 'nick': nick, 'view_sales': view_sales,
                                       'item_loc': item_loc, 'comment_count': comment_count,
                                       'pic_url': pic_url, 'user_id': user_id})

    def detail_info(self, response):
        item = TmallItem()
        res = re.compile(r'"defaultItemPrice":"(.*?)",', re.S)
        price = res.findall(response.text)[0]                                        # unit price
        good_imgs = response.xpath('//*[@id="J_UlThumb"]/li/a/img/@src').extract()   # detail-page images
        good_info = response.xpath('//*[@id="J_AttrUL"]/li/text()').extract()        # product attributes
        if len(good_info) == 0:
            good_infos = '暂无'    # "not available"
        else:
            good_infos = good_info
        item_id = re.findall(r'id=(.*?)&', response.url)[0]   # product id, taken from the URL
        user_id = response.meta['user_id']                    # seller id
        url = 'https://rate.tmall.com/list_detail_rate.htm'
        data = {
            'itemId': item_id,        # product id
            'sellerId': user_id,      # seller id
            'callback': 'jsonp128'    # request the jsonp128(...) wrapper the regex below expects
        }
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
        }
        try:
            rote_response = requests.get(url=url, params=data, headers=headers)  # fetch the comments
            rote_json = json.loads(re.findall(r'jsonp128\((.*?)\)', rote_response.text)[0])['rateDetail']['rateList']
            rote_list = []                                    # comment list
            for i in rote_json:
                rote_dict = {}
                rote_dict['auctionSku'] = i['auctionSku']     # purchased SKU
                rote_dict['rateContent'] = i['rateContent']   # comment text
                rote_dict['pics'] = i['pics']                 # comment images
                if len(rote_list) < 5:                        # keep at most 5 comments per product
                    rote_list.append(rote_dict)
        except Exception:
            print('Comments for this product could not be fetched')
            rote_list = []
        item['title'] = response.meta['title']
        item['nick'] = response.meta['nick']
        item['price'] = price
        item['view_sales'] = response.meta['view_sales']
        item['item_loc'] = response.meta['item_loc']
        item['comment_count'] = response.meta['comment_count']
        item['pic_url'] = response.meta['pic_url']
        item['good_infos'] = good_infos
        item['good_imgs'] = good_imgs
        item['rote_list'] = rote_list
        return item
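The spider imports TmallItem from ..items, but the post never shows items.py. Below is a minimal sketch of that file, assuming nothing beyond the field names the spider and pipeline actually use:

# -*- coding: utf-8 -*-
# items.py (sketch, not shown in the original post): declares every field
# the spider assigns and the pipeline reads.
import scrapy


class TmallItem(scrapy.Item):
    title = scrapy.Field()          # product name
    nick = scrapy.Field()           # shop name
    price = scrapy.Field()          # unit price
    view_sales = scrapy.Field()     # number of payers
    item_loc = scrapy.Field()       # item location
    comment_count = scrapy.Field()  # comment count
    pic_url = scrapy.Field()        # list-page image link
    good_infos = scrapy.Field()     # product attribute list
    good_imgs = scrapy.Field()      # detail-page image links
    rote_list = scrapy.Field()      # sampled comments

With this in place, the project runs the usual way: scrapy crawl mianbao.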
pipelines
# -*- coding: utf-8 -*-
import csv
import os

import pymongo
import pymysql
import requests

# MongoDB: database "tmall", collection "good_info"
mongo = pymongo.MongoClient('127.0.0.1', 27017)
mongodb = mongo['tmall']
mongocoll = mongodb['good_info']

# MySQL connection
db = pymysql.connect(
    db='test',
    user='root',
    port=3306,
    host='localhost',
    password='mysql',
    charset='utf8'
)
cursor = db.cursor()

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class TmallPipeline(object):
    def process_item(self, item, spider):
        good_imgs = item['good_imgs']
        title = item['title']
        path = 'tmalls/' + title          # one directory per product
        if not os.path.exists(path):
            os.makedirs(path)             # makedirs also creates the tmalls/ parent on first run
        img = []                          # rewritten image links
        count = 0
        for i in good_imgs:
            count += 1
            # Drop the 13-character thumbnail suffix (e.g. '_60x60q90.jpg') to get the full-size image.
            url = 'https:' + i[:-13]
            img.append(url)
            with open(os.path.join(path, str(count) + '.jpg'), 'wb') as f:   # save the image
                response = requests.get(url)
                f.write(response.content)
        item['good_imgs'] = img
        # Write every field of the item as a key/value row in a per-product CSV ("商品信息" = product info).
        with open(os.path.join(path, '商品信息.csv'), 'w+', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            for k, j in dict(item).items():
                writer.writerow([k, j])
        mongocoll.insert_one(dict(item))  # insert_one; the old insert() was removed in PyMongo 4
        # Mirror the core fields into MySQL; the leading 0 is the auto-increment id placeholder.
        sql = 'insert into tmall values (0,%s,%s,%s,%s,%s,%s,%s)'
        cursor.execute(sql, [title, item['price'], str(item['good_infos']), item['view_sales'],
                             item['comment_count'], item['item_loc'], item['nick']])
        db.commit()
        return item
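The boilerplate comment in the pipeline is a reminder that it must be registered in settings.py. A sketch of the relevant lines, assuming the project module is named tmall (the priority 300 is an arbitrary example):

# settings.py (sketch): register the pipeline; the module path assumes the
# Scrapy project is named "tmall".
ITEM_PIPELINES = {
    'tmall.pipelines.TmallPipeline': 300,
}
ROBOTSTXT_OBEY = False  # Scrapy obeys robots.txt by default, which would block these pages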
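The INSERT statement also assumes an existing tmall table whose first column is an auto-increment id (filled by the 0 placeholder) followed by seven value columns. A hypothetical one-off setup script; the column types are guesses inferred from the values the pipeline stores:

# One-off setup sketch for the MySQL table the pipeline inserts into.
# Column names and types are assumptions derived from the INSERT above.
import pymysql

db = pymysql.connect(host='localhost', port=3306, user='root',
                     password='mysql', db='test', charset='utf8')
with db.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS tmall (
            id INT AUTO_INCREMENT PRIMARY KEY,  -- receives the 0 placeholder
            title VARCHAR(255),
            price VARCHAR(64),
            good_infos TEXT,
            view_sales VARCHAR(64),
            comment_count VARCHAR(64),
            item_loc VARCHAR(64),
            nick VARCHAR(255)
        ) DEFAULT CHARSET = utf8
    """)
db.commit()
db.close()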
Category: Web crawlers