# 爬取京东所有商品的评论代码 (Scrape the reviews of all JD.com products)

#-*-coding:utf-8-*-
#@Time :2022/3/14 12:49
#@Author:shuaichao
#@File :.py
#@Software: PyCharm

import openpyxl as op
import urllib.request
from bs4 import BeautifulSoup  # 网页解析,获悉数据.231
import urllib.request, urllib.error  # 制定URL,获取网页数据
import time
import random
import json


# Column buffers for the scraped comments. The main script appends to all of
# them once per comment, so index i across the lists describes one comment,
# which becomes one spreadsheet row when the workbook is saved.
list_goodid = []  # SKU of the product the comment was fetched for
list_id = []  # 'id' field of the comment record
list_content = []  # 'content' field: the comment text
list_time = []  # 'creationTime' field of the comment
list_score = []  # 'score' field: the rating
list_name = []  # 'nickname' field of the commenter
list_mobileVersion = []  # 'mobileVersion' field, or "pc" when that field is empty
list_plusAvailable = []  # 'plusAvailable' field (original note: membership level)
list_days = []          # 'days' field (original note: interval until receipt of goods)
def askUrl(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address to request.

    Returns:
        The decoded body, or an empty string when the request fails with
        ``urllib.error.URLError`` (its status code and/or reason are printed).

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8; it is not caught
            here and propagates to the caller.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    request = urllib.request.Request(url, headers=headers)
    html = ""
    try:
        # The context manager closes the connection even if read() or
        # decode() raises.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries both attributes; a plain URLError only has reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


# Fetch a page and parse it.
def get_info(baseurl):
    """Download ``baseurl`` via ``askUrl`` and parse the text with html.parser.

    Returns the resulting ``BeautifulSoup`` object.
    """
    return BeautifulSoup(askUrl(baseurl), "html.parser")


# Query the soup by CSS class and also render the result as a string.
def transport(bs, info):
    """Find every element of ``bs`` whose class matches ``info``.

    Returns a pair: the ``find_all`` result itself and its ``str()`` form.
    """
    matches = bs.find_all(class_=info)
    return matches, str(matches)


def askUrl2(url):
    """Fetch ``url`` and return the response body decoded as gb2312.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address to request.

    Returns:
        The decoded body, or an empty string when the request fails with
        ``urllib.error.URLError`` (its status code and/or reason are printed).

    Raises:
        UnicodeDecodeError: If the body cannot be decoded as gb2312; it is
            not caught here and propagates to the caller.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    request = urllib.request.Request(url, headers=headers)
    html = ""
    try:
        # The context manager closes the connection even if read() or
        # decode() raises.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("gb2312")
    except urllib.error.URLError as e:
        # HTTPError carries both attributes; a plain URLError only has reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html

def get_info2(baseurl):
    """Download ``baseurl`` via ``askUrl2`` and parse the text with html.parser.

    Returns the resulting ``BeautifulSoup`` object.
    """
    return BeautifulSoup(askUrl2(baseurl), "html.parser")

def parse_jsonp(text, callback='jsonpMore2Goods'):
    """Decode the JSON document wrapped in a JSONP response.

    Surrounding whitespace and a trailing ``;`` are tolerated; text that is
    not wrapped in ``callback(...)`` is decoded as plain JSON.

    Args:
        text: Raw response text, e.g. ``jsonpMore2Goods({...})``.
        callback: Name of the JSONP callback wrapping the payload.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the unwrapped text is not valid JSON.
    """
    body = text.strip().rstrip(';').rstrip()
    prefix = callback + '('
    if body.startswith(prefix) and body.endswith(')'):
        body = body[len(prefix):-1]
    return json.loads(body)


def comment_row(sku, comment):
    """Build one spreadsheet row from a comment record.

    All fields are read before anything is stored, so a record with a missing
    key raises without leaving a half-written row behind.

    Args:
        sku: Product id the comment was fetched for.
        comment: Mapping holding one comment as returned by the endpoint.

    Returns:
        Tuple ``(sku, id, nickname, content, creationTime, mobileVersion,
        plusAvailable, score, days)``; an empty ``mobileVersion`` is
        reported as ``"pc"``.

    Raises:
        KeyError: If ``comment`` lacks one of the fields above.
    """
    mobile_version = comment['mobileVersion']
    if mobile_version == "":
        mobile_version = "pc"
    return (
        sku,
        comment['id'],
        comment['nickname'],
        comment['content'],
        comment['creationTime'],
        mobile_version,
        comment['plusAvailable'],
        comment['score'],
        comment['days'],
    )


def store_row(row):
    """Append a ``comment_row`` tuple to the module-level column lists."""
    columns = (list_goodid, list_id, list_name, list_content, list_time,
               list_mobileVersion, list_plusAvailable, list_score, list_days)
    for column, field in zip(columns, row):
        column.append(field)


def save_comments(path="./comment1.xlsx"):
    """Write every collected comment to a fresh workbook saved at ``path``."""
    wb = op.Workbook()  # new workbook
    ws = wb['Sheet']  # worksheet named 'Sheet' of the new workbook
    for row in zip(list_goodid, list_id, list_name, list_content, list_time,
                   list_mobileVersion, list_plusAvailable, list_score, list_days):
        ws.append(row)
    wb.save(path)


if __name__ == '__main__':
    pagesize = 100      # number of products requested per feed page
    list_Goods = []     # SKUs of every product found

    # Step 1: collect product SKUs from the feed endpoint.
    print("开始")
    for i in range(1, 2):
        url = 'https://floor.jd.com/user-v20/feed/get?' \
              'page=' + str(i) + '&pagesize=' + str(pagesize) + \
              '&area=1_2802_0_0&source=pc-home&callback=jsonpMore2Goods&_=1647233103435'
        time.sleep(random.randint(2, 5))
        res = get_info(url)
        time.sleep(random.randint(2, 5))
        response_data = parse_jsonp(res.text)['data']
        list_Goods.extend(v['sku'] for v in response_data)

    # Step 2: fetch the comments of every product.
    print("开始获取评论等用户信息")

    for v in list_Goods:
        try:
            time.sleep(random.randint(2, 5))
            print("完成一个商品购买信息收集")
            # Probe request: only its 'maxPage' field is used.
            url_comment = 'https://club.jd.com/comment/productPageComments.action?callback=' \
                          '&productId=' + str(v) + '&score=0&sortType=10&page=10&pageSize=100'
            res = get_info2(url_comment)
            time.sleep(random.randint(2, 5))
            pageCount = json.loads(res.text)['maxPage']
            # The previous range(1, pageCount) skipped one page whichever way
            # the endpoint numbers its pages.
            # NOTE(review): this assumes 0-based page numbers - confirm against
            # the live endpoint.
            for i in range(pageCount):
                try:
                    print("中循环成功运行一次")
                    url_comment = 'https://club.jd.com/comment/productPageComments.action?callback=' \
                                  '&productId='+str(v)+'&score=0&sortType=10&page='+str(i)+'&pageSize=100'
                    time.sleep(random.randint(2, 5))
                    res = get_info2(url_comment)
                    time.sleep(random.randint(2, 5))
                    response_data = json.loads(res.text)['comments']
                # Exception, not a bare except, so Ctrl-C still stops the run.
                except Exception:
                    print("中循环报错一次")
                    try:
                        # Second attempt for the same URL, this time through
                        # get_info instead of get_info2.
                        time.sleep(random.randint(2, 5))
                        res = get_info(url_comment)
                        time.sleep(random.randint(2, 5))
                        response_data = json.loads(res.text)['comments']
                    except Exception:
                        print("中循环二次报错")
                        continue
                for value in response_data:
                    store_row(comment_row(v, value))
        except Exception:
            print("大循环报错一次")
        finally:
            # Checkpoint after every product (also on interruption) so that
            # already collected comments are not lost.
            save_comments()

  

# posted @ 2022-03-15 10:56  帅超007  阅读(214)  评论(1)  编辑  收藏  举报