Python爬虫爬取京东某商品评论信息存入mysql数据库
"""
Scrape review comments for one JD.com product via the public JSONP comment
endpoint and store them in a local MySQL table ``jd_comments``.

Endpoint shape (JSONP-wrapped JSON):
https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98
    &productId=...&score=0&sortType=5&page=N&pageSize=K&isShadowSku=0&fold=1
"""
import json
import time  # BUGFIX: original did `from datetime import time`, which imports the
             # datetime.time CLASS, not the time module — time.sleep would crash.
from random import randint

import pymysql
import requests


def get_conn():
    """Open a MySQL connection and cursor.

    :return: (connection, cursor) tuple; caller must close via close_conn().
    """
    conn = pymysql.connect(host="127.0.0.1",
                           user="root",
                           password="000429",
                           db="info",
                           charset="utf8")
    # Default cursor: result rows come back as tuples.
    cursor = conn.cursor()
    # BUGFIX: original used bitwise `&` and `!= None`; use `and` / `is not None`.
    if conn is not None and cursor is not None:
        print("数据库连接成功!游标创建成功!")
    else:
        print("数据库连接失败!")
    return conn, cursor


def close_conn(conn, cursor):
    """Close the cursor and connection, tolerating None/already-closed handles."""
    if cursor:
        cursor.close()
    if conn:
        conn.close()
    return 1


def jd_spider(product_id="100006852812", start_page=6, end_page=50, page_size=1):
    """Fetch comment pages [start_page, end_page) for *product_id* and insert
    each page's rows via insert_jd().

    Parameters default to the original hard-coded values, so existing callers
    (``jd_spider()``) behave exactly as before.

    Row layout per comment:
        [id, content, creationTime, referenceTime, days, flag_isMobile, flag_userLevel]
    The last two flags are random 0/1 placeholders, as in the original script.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }
    for page_num in range(start_page, end_page):
        url = (f'https://club.jd.com/comment/productPageComments.action'
               f'?callback=fetchJSON_comment98&productId={product_id}&score=0'
               f'&sortType=5&page={page_num}&pageSize={page_size}'
               f'&isShadowSku=0&fold=1')
        response = requests.get(url, headers=headers)
        # JD serves this endpoint GBK-encoded; requests would otherwise guess wrong.
        response.encoding = 'GBK'
        res_text = response.text
        # Strip the JSONP wrapper `fetchJSON_comment98( ... );` by locating the
        # outermost parentheses instead of the original brittle slice [20:-2].
        payload = res_text[res_text.index('(') + 1:res_text.rindex(')')]
        res_json = json.loads(payload)
        comments = res_json.get('comments') or []
        if not comments:
            # BUGFIX: original did comments[0] unconditionally and raised
            # IndexError once JD returned an empty page; stop cleanly instead.
            print(f"第{page_num}页无评论数据,停止抓取")
            break
        # BUGFIX: original only kept comments[0]; keep every comment on the page
        # (identical result for the default page_size=1).
        rows = []
        for c in comments:
            rows.append([
                c['id'],
                c['content'],
                c['creationTime'],
                c['referenceTime'],
                c['days'],
                randint(0, 1),  # flag_isMobile placeholder, as in original
                randint(0, 1),  # flag_userLevel placeholder, as in original
            ])
        print(str(page_num), rows)
        insert_jd(rows)
        # Polite random delay so the scraper is less likely to be rate-limited.
        time.sleep(randint(1, 3))
    return


def insert_jd(data):
    """Insert scraped comment rows into ``jd_comments``.

    :param data: list of 7-element rows
                 [id, content, creationTime, referenceTime, days,
                  flag_isMobile, flag_userLevel]
    Duplicate-key rows are skipped (IntegrityError is caught per row) so one
    bad row does not abort the whole batch; the batch is committed once.
    """
    conn = None
    cursor = None
    try:
        conn, cursor = get_conn()
        sql = ("insert into jd_comments(id,content,creationTime,referenceTime,"
               "days,flag_isMobile,flag_userLevel) values(%s,%s,%s,%s,%s,%s,%s)")
        for item in data:
            try:
                print(item)
                cursor.execute(sql, item[:7])
            except pymysql.err.IntegrityError:
                # Duplicate primary key (comment already stored) — skip row.
                print("出现错误!")
        conn.commit()  # commit once for the whole batch
    finally:
        # Always release DB resources, even if the request/parse above raised.
        close_conn(conn, cursor)
    return


if __name__ == '__main__':
    jd_spider()
id, 评论id
content, 评论
creationTime, 评论时间
referenceTime, 确认收货时间
days, 评论时间距【收货/下单】时间多长时间
isMobile(0 1) , 是否移动端(0 PC端,1 移动端)
userLevel(0 1) 用户等级(1会员,0非会员)
好看请赞,养成习惯:) 本文来自博客园,作者:靠谱杨, 转载请注明原文链接:https://www.cnblogs.com/rainbow-1/p/15999845.html
欢迎来我的51CTO博客主页踩一踩 我的51CTO博客
文章中的公众号名称可能有误,请统一搜索:靠谱杨的秘密基地