返回顶部

一缕半夏微光

温柔半两,从容一生

导航

Python爬取数据并输出到数据库

代码如下:

  1 import pymysql
  2 import requests
  3 import json
  4 import time
  5 
  6 
  7 def daorumysql(items):
  8     # 连接数据库
  9     db = pymysql.connect(host='localhost', port=3306, user='root', password='123456', db='jingdong', charset='utf8')
 10     cursor = db.cursor()
 11     # 测试,打印一下mysql版本
 12     cursor.execute("select version()")
 13 
 14     sql = 'insert into shishi(id,guid,content,creationTime,isTop,referenceTime,firstCategory,secondCategory,thirdCategory,replyCount,score,nickname,userClient,productColor,productSize,plusAvailable,productSales,days,afterDays) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
 15 
 16     index = 1  # 行数
 17     for data in items:  # items是十条数据 data是其中一条(一条下有三个内容)
 18         # 执行sql语句
 19         values = (
 20         data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8], data[9], data[10], data[11],
 21         data[12], data[13], data[14], data[15], data[16], data[17], data[18])
 22         cursor.execute(sql, values)
 23         db.commit()
 24 
 25     cursor.close()
 26     db.close()
 27 
 28 
 29 def start(page):
 30     # 获取URL
 31     # score 评价等级 page=0 第一页 producitid 商品类别
 32     # url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=100014348492&score=0&sortType=5&page='+str(page)+'&pageSize=3100&isShadowSku=0&fold=1'
 33 
 34     url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=100034710036&score=1&sortType=5&page=' + str(
 35         page) + '&pageSize=10&isShadowSku=0&fold=1'
 36 
 37     headers = {
 38         "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36"
 39     }
 40     time.sleep(2)
 41     test = requests.get(url=url, headers=headers)
 42     t = test.text
 43     data = json.loads(t.lstrip('fetchJSON_comment98vv12345(').rstrip(');'))
 44     return data
 45 
 46 
 47 def parse(data):
 48     items = data['comments']
 49     for i in items:
 50         yield (
 51             i['id'],  # id
 52             i['guid'],
 53             i['content'],  # 内容
 54             i['creationTime'],  # 时间
 55             i['isTop'],
 56             i['referenceTime'],
 57             i['firstCategory'],
 58             i['secondCategory'],
 59             i['thirdCategory'],
 60             i['replyCount'],
 61             i['score'],
 62             i['nickname'],
 63             i['userClient'],
 64             i['productColor'],
 65             i['productSize'],
 66             i['plusAvailable'],
 67             i['productSales'],
 68             i['days'],
 69             i['afterDays']
 70         )
 71 
 72 
 73 def create():
 74     # 连接数据库
 75     db = pymysql.connect(host='localhost', port=3306, user='root', password='ym123', db='jingdong', charset='utf8')
 76     cursor = db.cursor()
 77     # 测试,打印一下mysql版本
 78     cursor.execute("select version()")
 79     data = cursor.fetchone()
 80     print("Database Version:%s" % data)
 81 
 82     # 执行创建分词表的SQL语句
 83     sql1 = """
 84     CREATE TABLE shishi (
 85     id varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
 86     guid varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
 87     content varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
 88     creationTime varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
 89     isTop varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
 90     referenceTime varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
 91     firstCategory varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
 92     secondCategory varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
 93     thirdCategory varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
 94     replyCount varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
 95     score varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
 96     nickname varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
 97     userClient varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
 98     productColor varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
 99     productSize varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
100     plusAvailable varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
101     productSales varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
102     days varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
103     afterDays varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL
104     )ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
105     """
106     # 执行SQL语句
107     cursor.execute(sql1)
108     print("创建shishi表成功")
109 
110 
111 def main():
112     create()
113 
114     j = 1  # 页面数
115 
116     for i in range(0, 100):
117         time.sleep(1.5)
118         # 记得time反爬 其实我在爬取的时候没有使用代理ip也没给我封 不过就当这是个习惯吧
119         first = start(j)
120         test = parse(first)
121 
122         daorumysql(test)
123 
124         print('' + str(j) + '页抓取完毕\n')
125         j = j + 1
126 
127 
128 if __name__ == '__main__':
129     main()

 

posted on 2022-03-17 20:15  一缕半夏微光  阅读(310)  评论(0编辑  收藏  举报