Code for crawling the comments of all JD.com products
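The script below works in two stages: it first pulls product SKUs from JD's home-page feed endpoint (floor.jd.com), then pages through each product's reviews via the club.jd.com comment endpoint and writes everything to comment1.xlsx with openpyxl. Requests are throttled with random sleeps to reduce the chance of being blocked.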
# -*- coding: utf-8 -*-
# @Time    : 2022/3/14 12:49
# @Author  : shuaichao
# @File    : .py
# @Software: PyCharm
import json
import random
import time
import urllib.error
import urllib.request

import openpyxl as op
from bs4 import BeautifulSoup  # used to parse the fetched page/JSON text

list_goodid = []         # product SKU
list_id = []             # comment ID
list_content = []        # comment text
list_time = []           # comment timestamp
list_score = []          # star rating
list_name = []           # reviewer nickname
list_mobileVersion = []  # client: phone model, or "pc"
list_plusAvailable = []  # JD PLUS membership status
list_days = []           # days between purchase and review


def ask_url(url, encoding="utf-8"):
    """Fetch a URL with a browser User-Agent and return the decoded body."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    request = urllib.request.Request(url, headers=headers)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode(encoding, errors="ignore")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def get_info(baseurl, encoding="utf-8"):
    """Fetch a page and wrap the decoded text in BeautifulSoup."""
    html = ask_url(baseurl, encoding)
    return BeautifulSoup(html, "html.parser")


def transport(bs, info):
    """Collect all nodes with the given class, returned both as a list and as a
    string (not used in the main flow below)."""
    ex_info = bs.find_all(class_=info)
    return ex_info, str(ex_info)
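Before launching the full crawl, a quick smoke test of the two endpoints with the helpers above can save time. This is only a minimal sketch: both endpoints are undocumented, the query strings are copied from the main script below and were valid when it was written, and the callback name jsonpMore2Goods comes from the feed URL itself.

# Minimal smoke test for the two JD endpoints used by the main script below.
# These are undocumented endpoints; parameters may change at any time.
feed_url = ('https://floor.jd.com/user-v20/feed/get?page=1&pagesize=10'
            '&area=1_2802_0_0&source=pc-home&callback=jsonpMore2Goods&_=1647233103435')
raw = ask_url(feed_url)  # UTF-8 JSONP text: jsonpMore2Goods({...})
feed = json.loads(raw.strip().replace('jsonpMore2Goods(', '')[:-1])
sku = feed['data'][0]['sku']
print('first sku:', sku)

comment_url = ('https://club.jd.com/comment/productPageComments.action?callback='
               '&productId=' + str(sku) + '&score=0&sortType=10&page=0&pageSize=10')
pages = json.loads(ask_url(comment_url, 'gbk'))['maxPage']
print('comment pages:', pages)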
if __name__ == '__main__':
    pagesize = 100     # products per feed page
    list_Goods = []    # SKUs of every product found

    # Stage 1: collect product SKUs from the JD home-page feed (a JSONP endpoint).
    print("Collecting product SKUs")
    for i in range(1, 2):
        url = 'https://floor.jd.com/user-v20/feed/get?' \
              'page=' + str(i) + '&pagesize=' + str(pagesize) + \
              '&area=1_2802_0_0&source=pc-home&callback=jsonpMore2Goods&_=1647233103435'
        time.sleep(random.randint(2, 5))
        res = get_info(url)
        time.sleep(random.randint(2, 5))
        # Strip the "jsonpMore2Goods(...)" wrapper, then parse the JSON inside.
        response_data = json.loads(res.text.strip().replace('jsonpMore2Goods(', '')[:-1])['data']
        for v in response_data:
            list_Goods.append(v['sku'])

    # Stage 2: page through each product's comments and record reviewer info.
    print("Collecting comments and user info")
    for v in list_Goods:
        try:
            time.sleep(random.randint(2, 5))
            print("Starting one product")
            # The first request only reads maxPage, the number of comment pages;
            # page=0 is enough for that.
            url_comment = 'https://club.jd.com/comment/productPageComments.action?callback=' \
                          '&productId=' + str(v) + '&score=0&sortType=10&page=0&pageSize=100'
            res = get_info(url_comment, "gbk")  # the comment endpoint replies GBK-encoded
            time.sleep(random.randint(2, 5))
            pageCount = json.loads(res.text)['maxPage']
            for i in range(1, pageCount):
                try:
                    print("Fetched one comment page")
                    url_comment = 'https://club.jd.com/comment/productPageComments.action?callback=' \
                                  '&productId=' + str(v) + '&score=0&sortType=10&page=' + str(i) + '&pageSize=100'
                    time.sleep(random.randint(2, 5))
                    res = get_info(url_comment, "gbk")
                    time.sleep(random.randint(2, 5))
                    response_data = json.loads(res.text)['comments']
                except Exception:
                    print("Comment page failed, retrying")
                    try:
                        time.sleep(random.randint(2, 5))
                        res = get_info(url_comment)  # retry, decoding as UTF-8
                        time.sleep(random.randint(2, 5))
                        response_data = json.loads(res.text)['comments']
                    except Exception:
                        print("Retry failed, skipping this page")
                        continue
                for value in response_data:
                    list_goodid.append(v)
                    list_id.append(value['id'])
                    list_content.append(value['content'])
                    list_time.append(value['creationTime'])
                    list_score.append(value['score'])
                    list_days.append(value['days'])
                    if value['mobileVersion'] != "":
                        list_mobileVersion.append(value['mobileVersion'])
                    else:
                        list_mobileVersion.append("pc")
                    list_name.append(value['nickname'])
                    list_plusAvailable.append(value['plusAvailable'])
        except Exception:
            print("Product failed, skipping")
            continue
        finally:
            # Checkpoint: rewrite the workbook after every product so collected
            # rows survive a crash or a ban mid-run.
            wb = op.Workbook()
            ws = wb['Sheet']  # default sheet
            for i in range(len(list_id)):
                d = (list_goodid[i], list_id[i], list_name[i], list_content[i],
                     list_time[i], list_mobileVersion[i], list_plusAvailable[i],
                     list_score[i], list_days[i])
                ws.append(d)
            wb.save("./comment1.xlsx")
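For reference, each record in the comments array carries at least the fields read in the loop above. The sketch below shows the same row layout as a standalone function; the sample values are invented for illustration, and only the field names come from the parsing code.

# Illustrative record from the 'comments' array; values are made up, field names
# match what the loop above reads.
sample = {
    'id': 10000000001,
    'content': 'Arrived quickly, works as described.',
    'creationTime': '2022-03-10 18:23:45',
    'score': 5,
    'days': 3,
    'mobileVersion': '',          # empty string means the review came from PC
    'nickname': 'j***o',
    'plusAvailable': 201,
}

def flatten_comment(sku, value):
    """One raw comment dict -> the row layout written to comment1.xlsx."""
    return (sku, value['id'], value['nickname'], value['content'],
            value['creationTime'], value['mobileVersion'] or 'pc',
            value['plusAvailable'], value['score'], value['days'])

print(flatten_comment(100012345678, sample))  # hypothetical SKU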
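One optional tweak, assuming the column order used in the checkpoint block: write a header row once so comment1.xlsx is self-describing. The English column names here are suggestions, not part of the original output.

# Optional: label the columns of comment1.xlsx before appending data rows.
import openpyxl as op

wb = op.Workbook()
ws = wb['Sheet']
ws.append(('sku', 'comment_id', 'nickname', 'content', 'creation_time',
           'client', 'plus_status', 'score', 'days_to_review'))
# ...append the data rows exactly as in the checkpoint block above, then:
wb.save('./comment1.xlsx')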