Python爬取数据并输出到数据库
代码如下:
1 import pymysql 2 import requests 3 import json 4 import time 5 6 7 def daorumysql(items): 8 # 连接数据库 9 db = pymysql.connect(host='localhost', port=3306, user='root', password='123456', db='jingdong', charset='utf8') 10 cursor = db.cursor() 11 # 测试,打印一下mysql版本 12 cursor.execute("select version()") 13 14 sql = 'insert into shishi(id,guid,content,creationTime,isTop,referenceTime,firstCategory,secondCategory,thirdCategory,replyCount,score,nickname,userClient,productColor,productSize,plusAvailable,productSales,days,afterDays) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)' 15 16 index = 1 # 行数 17 for data in items: # items是十条数据 data是其中一条(一条下有三个内容) 18 # 执行sql语句 19 values = ( 20 data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8], data[9], data[10], data[11], 21 data[12], data[13], data[14], data[15], data[16], data[17], data[18]) 22 cursor.execute(sql, values) 23 db.commit() 24 25 cursor.close() 26 db.close() 27 28 29 def start(page): 30 # 获取URL 31 # score 评价等级 page=0 第一页 producitid 商品类别 32 # url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=100014348492&score=0&sortType=5&page='+str(page)+'&pageSize=3100&isShadowSku=0&fold=1' 33 34 url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=100034710036&score=1&sortType=5&page=' + str( 35 page) + '&pageSize=10&isShadowSku=0&fold=1' 36 37 headers = { 38 "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36" 39 } 40 time.sleep(2) 41 test = requests.get(url=url, headers=headers) 42 t = test.text 43 data = json.loads(t.lstrip('fetchJSON_comment98vv12345(').rstrip(');')) 44 return data 45 46 47 def parse(data): 48 items = data['comments'] 49 for i in items: 50 yield ( 51 i['id'], # id 52 i['guid'], 53 i['content'], # 内容 54 i['creationTime'], # 时间 55 i['isTop'], 56 i['referenceTime'], 57 i['firstCategory'], 58 i['secondCategory'], 59 i['thirdCategory'], 60 i['replyCount'], 61 i['score'], 62 i['nickname'], 63 i['userClient'], 64 i['productColor'], 65 i['productSize'], 66 i['plusAvailable'], 67 i['productSales'], 68 i['days'], 69 i['afterDays'] 70 ) 71 72 73 def create(): 74 # 连接数据库 75 db = pymysql.connect(host='localhost', port=3306, user='root', password='ym123', db='jingdong', charset='utf8') 76 cursor = db.cursor() 77 # 测试,打印一下mysql版本 78 cursor.execute("select version()") 79 data = cursor.fetchone() 80 print("Database Version:%s" % data) 81 82 # 执行创建分词表的SQL语句 83 sql1 = """ 84 CREATE TABLE shishi ( 85 id varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 86 guid varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 87 content varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 88 creationTime varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 89 isTop varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 90 referenceTime varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 91 firstCategory varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 92 secondCategory varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 93 thirdCategory varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 94 replyCount varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 95 score varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 96 nickname varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 97 userClient varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 98 productColor varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 99 productSize varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 100 plusAvailable varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 101 productSales varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 102 days varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 103 afterDays varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL 104 )ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic; 105 """ 106 # 执行SQL语句 107 cursor.execute(sql1) 108 print("创建shishi表成功") 109 110 111 def main(): 112 create() 113 114 j = 1 # 页面数 115 116 for i in range(0, 100): 117 time.sleep(1.5) 118 # 记得time反爬 其实我在爬取的时候没有使用代理ip也没给我封 不过就当这是个习惯吧 119 first = start(j) 120 test = parse(first) 121 122 daorumysql(test) 123 124 print('第' + str(j) + '页抓取完毕\n') 125 j = j + 1 126 127 128 if __name__ == '__main__': 129 main()