Python 实例:抓取网易云课堂搜索数据(POST 请求、JSON 格式响应)并保存到数据库
本实例抓取网易云课堂中以“java”为关键字的搜索结果。经详细查看可知,该搜索接口的请求方式为 POST,返回的结果为 JSON 数据。
具体实现代码如下:
import json

import pymysql
import requests

# Search endpoint of NetEase Cloud Classroom; it expects a JSON body sent via POST.
url = 'http://study.163.com/p/search/studycourse.json'
headers = {'content-type': 'application/json'}

totlePage = 0  # total number of result pages (spelling follows the API field name)
test = 0  # running count of saved rows; also used as the row id


def getData(count):
    """Fetch one page of search results for the keyword 'java'.

    Args:
        count: Page index sent to the server as ``pageIndex``.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        ValueError: If the response body is not valid JSON.
    """
    payload = {
        'pageIndex': count,  # page number is the only variable field
        'pageSize': '50',
        'keyword': 'java',
        'searchTimeType': '-1',
        'orderType': '5',
        'priceType': '-1',
    }
    # requests has no default timeout; without one a stalled server hangs the script.
    req = requests.post(url, data=json.dumps(payload), headers=headers, timeout=10)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    req.raise_for_status()
    return json.loads(req.text)


# Connect to the database.
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='123456', db='test', charset='utf8')
cur = conn.cursor()

try:
    # Recreate the target table from scratch on every run.
    cur.execute("DROP TABLE IF EXISTS neteasy")
    sqlc = (
        "create table neteasy("
        "id int(5),"
        "title varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci,"
        "provider varchar(30) CHARACTER SET utf8 COLLATE utf8_general_ci,"
        "price float(10),"
        "learnercount int(5)"
        ") CHARACTER SET utf8 COLLATE utf8_general_ci"
    )
    cur.execute(sqlc)

    # Fetch page 1 once and reuse it both for the "any results?" check and
    # as the first page of data, instead of requesting it three times.
    first_result = getData(1)['result']
    final = first_result['list']

    if final is not None:
        totlePage = first_result['query']['totlePageCount']
        sqli = 'insert into neteasy values(%s,%s,%s,%s,%s)'

        for j in range(1, totlePage + 1):
            if j > 1:
                final = getData(j)['result']['list']

            # A later page may come back with a null list; skip it rather than crash.
            for i, item in enumerate(final or [], start=1):
                rt = item['productName']
                rp = item['provider']

                # Prefer the discounted price when the course has one.
                rn = item['discountPrice']
                if rn is None:
                    rn = item['originalPrice']

                # A missing learner count is stored as 0.
                rd = item['learnerCount']
                if rd is None:
                    rd = 0

                print(f'当前正在读取第{j}页的第{i}条数据...')
                test += 1
                cur.execute(sqli, (test, rt, rp, rn, rd))

        # Commit before announcing success so the message is never printed
        # for data that failed to persist.
        conn.commit()
        print('保存完毕!共' + str(test) + '条数据')
    else:
        print('没有查询结果,请换个关键词试试!')
finally:
    # Release DB resources on every path, including "no results" and errors.
    cur.close()
    conn.close()