简易天猫爬虫
天猫商品数据爬取代码分享
虽然很简陋,但是写这个程序我学到了一些新的技术,比如 openpyxl 库的使用、Python 程序的打包、设置图标等,还是收获很多。
闲话不多说,直接上代码
#导入需要的库
import re
import urllib.parse
import requests
from openpyxl import Workbook
from openpyxl.styles import Font,Alignment
import os
# Create the workbook and use its default active worksheet.
wb = Workbook()
ws = wb.active
# Header row: product name, price, origin, monthly sales volume, product link.
ws['A1'] = '商品名称'
ws['B1'] = '商品价格'
ws['C1'] = '产地'
ws['D1'] = '月成交量'
ws['E1'] = '商品链接'
# Prompt for the search keyword, the number of pages and the output file name.
print('----欢迎使用----')
keyword = input("请输入你要查找的商品名称:")
frequency = int(input("请输入你要下载的页数(1~100):"))
name = input("请输入你要保存的文件名:")+'.xlsx'
# Baseline for the auto-fit column-width logic used while writing rows.
width = 0
# Keep the header row visible while scrolling.
ws.freeze_panes = 'A2'
# Style the five header cells: centered both ways, 20pt bold.
header_alignment = Alignment(horizontal='center', vertical='center')
header_font = Font(size=20, bold=True)
for col in ('A', 'B', 'C', 'D', 'E'):
    ws[col + '1'].alignment = header_alignment
    ws[col + '1'].font = header_font
# Fixed widths for the price / origin / sales columns.
for col in ('B', 'C', 'D'):
    ws.column_dimensions[col].width = 20
# URL-encode the (possibly Chinese) keyword.
keyword = urllib.parse.quote(keyword)
# Request headers.
# BUG FIX: the original used the dict key 'user_agent', which sends a bogus
# "user_agent" header instead of the real User-Agent header, so the server
# never saw the browser identification at all.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.10 Safari/537.36',
           'cookie': 'cna=dzhnFJcvPFYCAcplZsR6KtPL; hng=CN%7Czh-CN%7CCNY%7C156; lid=su%E3%80%81%E9%9F%A9; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; tk_trace=1; t=cf98821548a85be3261b5d3e02dfc50c; _tb_token_=53317e736e697; cookie2=171ec1fa788a97042140b1dc23ea8cbd; _m_h5_tk=ce5a4b969c5fee0ce43fbecb1c8b5698_1544366646609; _m_h5_tk_enc=08f79f8dd140168b3e95d192521140f3; x=__ll%3D-1%26_ato%3D0; whl=-1%260%260%260; uc1=cookie16=V32FPkk%2FxXMk5UvIbNtImtMfJQ%3D%3D&cookie21=UIHiLt3xTIkz&cookie15=UtASsssmOIJ0bQ%3D%3D&existShop=false&pas=0&cookie14=UoTYMh2PRIzYCw%3D%3D&tag=8&lng=zh_CN; uc3=vt3=F8dByR1fSHC9rqvO0Hw%3D&id2=UUGk2VnYR7N9og%3D%3D&nk2=saDewT4jhA05glO9&lg2=V32FPkk%2Fw0dUvg%3D%3D; tracknick=%5Cu4E00%5Cu5207%5Cu968F%5Cu7F18%5Cu4E28%5Cu4E36; _l_g_=Ug%3D%3D; ck1=""; unb=2967018108; lgc=%5Cu4E00%5Cu5207%5Cu968F%5Cu7F18%5Cu4E28%5Cu4E36; cookie1=AiVdFlFBrPvLkxJuDQ%2FIWWWqMYV30iZYcqUsqvmxAjc%3D; login=true; cookie17=UUGk2VnYR7N9og%3D%3D; _nk_=%5Cu4E00%5Cu5207%5Cu968F%5Cu7F18%5Cu4E28%5Cu4E36; uss=""; csg=da3a7af3; skt=22f8e8af802abead; enc=P0JAHDOULky9KTinsCWQ4Ib6YVG7q7qPW5KKCJd4YWKlwiYOGGRObgbMOWOpxn4w12VNH34hJK%2FVCxsPmDqs%2FQ%3D%3D; pnm_cku822=098%23E1hvc9vUvbpvUvCkvvvvvjiPR2FWljlUn2qw6jEUPmPZ1jrERFdO1jYUnLS9zjtUiQhvCvvvpZptvpvhvvCvpvGCvvpvvPMMvphvC9mvphvvvvyCvhQv7sg%2FjNpBKBh78BoxfXkXdiYso%2BLpjXe4Vc3Z0f06W3vOJ1kHsfUpeB6AxYjxRLwprj6OfwoKjd8rJm7g%2BfUz%2BsIIHYFpeiQa5javuphvmvvvpoX8LTuKkphvC9hvpyPw1byCvm9vvhCvvvvvvvvvBfIvvvjivvCVB9vv9LvvvhXVvvmCjvvvByOvvUhw; cq=ccp%3D0; swfstore=199766; isg=BG5uugvevlR2SM3ay3guf99Uv8ScezhtaUhC2pg36XEsew_VAP0keVBxNqcy_yqB'}
# Tmall search URL, split around the pagination offset (the s= parameter).
url1 = "https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.36105702i4oQH9&s="
url2 = "&q=" + keyword + "&sort=s&style=g&from=..pc_1_searchbutton&active=2&type=pc#J_Filte"
# Crawl every requested results page; regexes pull each product's link,
# name, price and monthly sales out of the raw HTML.
# NOTE(review): the patterns assume Tmall's 2018-era list-page markup and
# 60 products per page (offset s = 60 * (page - 1)) — confirm before reuse.
name_width = width  # widest product name written so far (column A)
url_width = width   # widest product link written so far (column E)
for i in range(1, frequency + 1):
    # Any page-level failure (network error, parse error) is reported and skipped.
    try:
        print("----正在爬取第%d页----" % i)
        url = url1 + str((i - 1) * 60) + url2
        r = requests.get(url, headers=headers)
        names = re.compile('target="_blank" title="(.*?)"', re.S).findall(r.text)
        if len(names):
            print('访问成功')
        else:
            # An empty result usually means the hard-coded cookie has expired.
            print("访问失败,请更改代码的里cookie,或者明天再使用")
        urls = re.compile('<div class="productImg-wrap">\n<a href="(.*?)" class="productImg" target="_blank" data-p="', re.S).findall(r.text)
        chengjiaoliangs = re.compile('<span>月成交 <em>(.*?)笔', re.S).findall(r.text)
        moneys = re.compile('<em title="(.*?)"><b>¥', re.S).findall(r.text)
        # Visit every product detail page to scrape its place of origin.
        wheres = []
        for x in range(len(urls)):
            # A failure on one detail page is reported and skipped.
            try:
                wurl = 'http:' + urls[x]
                w = requests.get(wurl, headers=headers)
                wheres.append(re.compile('name="region" value="(.*?)"', re.S).findall(w.text)[0])
                if len(wheres[x]):
                    print('第%d页%d条商品信息爬取成功' % (i, x + 1))
            except Exception as er:
                print(er)
        if len(names):
            print("----第%d页爬取成功----" % i)
        print("----第%d页开始写入----" % i)
        # Write one worksheet row per scraped product.
        # BUG FIX: the original looped range(1, len(urls)), silently dropping
        # the last product of every page.
        for y in range(1, len(urls) + 1):
            # A failure on one row (e.g. the regex result lists have
            # different lengths) is reported and skipped.
            try:
                row = (i - 1) * 60 + y + 1  # +1 keeps row 1 for the headers
                # Auto-fit columns A and E to the longest entry seen so far.
                # BUG FIX: the original compared against a never-updated
                # `width` of 0, so the column width tracked whatever item
                # happened to be written last instead of the longest one.
                if len(names[y - 1]) > name_width:
                    name_width = len(names[y - 1])
                    ws.column_dimensions['A'].width = 2 * name_width
                if len(urls[y - 1]) > url_width:
                    url_width = len(urls[y - 1])
                    ws.column_dimensions['E'].width = url_width
                ws['A%d' % row] = names[y - 1]
                ws['B%d' % row] = moneys[y - 1] + '元'
                ws['C%d' % row] = wheres[y - 1]
                ws['D%d' % row] = chengjiaoliangs[y - 1] + '笔'
                ws['E%d' % row] = 'http:' + urls[y - 1]
                # BUG FIX: the original printed y+1, reporting every item
                # under the next item's number.
                print('----第%d页第%d条写入成功----' % (i, y))
            except Exception:
                print('----第%d页第%d条写入失败----' % (i, y))
        print("----第%d页写入成功----" % i)
    except Exception as err:
        print(err)
# Make sure the output directory exists, then save the workbook into it.
path = './天猫数据爬取excel文件/'
# makedirs(..., exist_ok=True) replaces the original os.path.exists()/os.mkdir
# pair, avoiding its check-then-create race and an extra stat call.
os.makedirs(path, exist_ok=True)
wb.save(path + name)
print('----已经全部写入----')
print('----感谢使用----')
# Windows-only: keep the console window open until a key is pressed.
os.system('pause')