Python爬虫爬取京东某商品评论信息存入mysql数据库

  1 """
  2 https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=100006852812&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1
  3 page=0&pageSize=10
  4 """
  5 import json
  6 from datetime import time
  7 from random import randint
  8 
  9 import pymysql
 10 import requests
 11 #连接数据库  获取游标
 12 def get_conn():
 13     """
 14     :return: 连接,游标
 15     """
 16     # 创建连接
 17     conn = pymysql.connect(host="127.0.0.1",
 18                     user="root",
 19                     password="000429",
 20                     db="info",
 21                     charset="utf8")
 22     # 创建游标
 23     cursor = conn.cursor()  # 执行完毕返回的结果集默认以元组显示
 24     if ((conn != None) & (cursor != None)):
 25         print("数据库连接成功!游标创建成功!")
 26     else:
 27         print("数据库连接失败!")
 28     return conn, cursor
 29 #关闭数据库连接和游标
 30 def close_conn(conn, cursor):
 31     if cursor:
 32         cursor.close()
 33     if conn:
 34         conn.close()
 35     return 1
 36 
 37 def jd_spider():
 38     headers = {
 39         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
 40     }
 41     res_data = []   #最终返回的数据
 42     temp_data=[]    #中间数据
 43     for page_num in range(6,50):
 44         url=f'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=100006852812&score=0&sortType=5&page={page_num}&pageSize=1&isShadowSku=0&fold=1'
 45         # 发起请求
 46         response = requests.get(url, headers=headers)
 47         response.encoding='GBK'
 48         # 获取响应
 49         res_text = response.text
 50         # print(res_text)
 51         new_res_text=res_text[20:-2]
 52         # print(new_res_text)
 53         res_json=json.loads(new_res_text)
 54         # print(type(res_json))     #<class 'dict'>
 55         comments=res_json['comments']
 56         # print(comments[0])
 57         comments_dict=comments[0]
 58         # print(comments_dict.keys())
 59         temp_data.append(comments_dict['id'])
 60         temp_data.append(comments_dict['content'])
 61         temp_data.append(comments_dict['creationTime'])
 62         temp_data.append(comments_dict['referenceTime'])
 63         temp_data.append(comments_dict['days'])
 64         flag_isMobile=randint(0,1)
 65         flag_userLevel=randint(0,1)
 66         temp_data.append(flag_isMobile)
 67         temp_data.append(flag_userLevel)
 68         res_data.append(temp_data)
 69         print(str(page_num),temp_data)
 70         insert_jd(res_data)
 71         temp_data=[]
 72         res_data=[]
 73     return
 74 
 75 #插入
 76 def insert_jd(data):
 77     """
 78         插入imdb数据
 79         :return:
 80         """
 81     cursor = None
 82     conn = None
 83     try:
 84         # list_=[]
 85         # list = jd_spider()
 86         # if(type(list)!=type(list_)):
 87         #     return -1
 88         conn, cursor = get_conn()
 89         sql = "insert into jd_comments(id,content,creationTime,referenceTime,days,flag_isMobile,flag_userLevel) values(%s,%s,%s,%s,%s,%s,%s)"
 90         for item in data:
 91             try:
 92                 print(item)
 93                 cursor.execute(sql, [item[0], item[1], item[2], item[3], item[4], item[5], item[6]])
 94             except pymysql.err.IntegrityError:
 95                 print("出现错误!")
 96             conn.commit()  # 提交事务 update delete insert操作
 97     finally:
 98         close_conn(conn, cursor)
 99     return
100 
# Script entry point: run the scraper when executed directly.
if __name__ == '__main__':
    jd_spider()

 

 

 

id, 评论id

content,  评论

creationTime,  评论时间

referenceTime,  确认收货时间

days,  评论时间距【收货/下单】时间多长时间 

isMobile(0 1) ,  是否移动端(0 PC端,1 移动端)

userLevel(0 1)    用户等级(1会员,0非会员)

posted @ 2022-03-13 11:08  靠谱杨  阅读(468)  评论(0编辑  收藏  举报