Web crawler: scraping Bilibili search results by keyword
The full script is below: it queries Bilibili's web search API for a keyword, walks through the result pages, and appends each video's metadata to a local text file.

import requests
import json
import time

# Search API endpoint
url = "https://api.bilibili.com/x/web-interface/search/all/v2"

# Browser headers (user-agent plus a referer) so the request looks like it came from a browser
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36",
    "referer": "https://api.bilibili.com/x/web-interface/search/all/v2?context=&page=1&order=&keyword=%E7%BC%96%E7%A8%8B&duration=&tids_1=&tids_2=&__refresh__=true&highlight=1&single_column=0&jsonp=jsonp&callback=__jp2"
}

# Query-string parameters appended to the URL
parameters = {
    "context": "",
    "page": "3",
    "order": "",
    "keyword": "Django",
    "duration": "",
    "tids_1": "",
    "tids_2": "",
    "__refresh__": "true",
    "search_type": "video",
    "highlight": "1",
    "single_column": "0",
    "jsonp": "jsonp",
    "callback": "__jp1",
}


# Convert a Unix timestamp into a readable date string
def transformation_time(pub_time):
    time_array = time.localtime(pub_time)
    return time.strftime("%Y-%m-%d %H:%M:%S", time_array)


# Strip stray characters and highlight markup from a field
def data_format(data):
    # Spaces are removed first, so the <em class="keyword"> highlight tag
    # has already collapsed to <emclass="keyword"> by the time it is matched.
    limit = ["#", ";", ",", " ", "【", "】", "\n", "\t", "\r", '<emclass="keyword">', "</em>"]
    for i in limit:
        data = data.replace(i, "")
    return data


# Pick out the fields we care about and append them to a text file
def screening_data(data):
    # All video results for the page sit in the ninth entry of "result"
    information = data["data"]["result"][8]["data"]
    with open("d:/bilibili_programming11.txt", "a", encoding="utf-8") as file:
        for item in information:
            file.write("uploader:" + item["author"] + "\t")
            file.write("url:" + item["arcurl"] + "\t")
            # data_format() removes highlight tags and stray characters
            file.write("title:" + data_format(item["title"]) + "\t")
            file.write("description:" + data_format(item["description"]) + "\t")
            file.write("plays:" + str(item["play"]) + "\t")
            file.write("danmaku:" + str(item["video_review"]) + "\t")
            file.write("favorites:" + str(item["favorites"]) + "\t")
            file.write("tags:" + item["tag"] + "\t")
            file.write("comments:" + str(item["review"]) + "\t")
            # Convert the Unix publish timestamp into a date string
            file.write("published:" + transformation_time(item["pubdate"]) + "\t")
            file.write("duration:" + str(item["duration"]) + "\n")


if __name__ == '__main__':
    for i in range(1, 51):
        # From page 2 onward, point the referer at the previous results page
        if i >= 2:
            headers["referer"] = ("https://api.bilibili.com/x/web-interface/search/all/v2?context=&page="
                                  + str(i - 1)
                                  + "&order=&keyword=%E7%BC%96%E7%A8%8B&duration=&tids_1=&tids_2=&__refresh__=true"
                                  + "&highlight=1&single_column=0&jsonp=jsonp&callback=__jp2")
        parameters["page"] = i
        # The body comes back wrapped in a JSONP callback, __jp1({...});
        # strip only the outer wrapper rather than every ")" in the payload
        raw = requests.get(url, params=parameters, headers=headers).text
        datas = json.loads(raw[raw.index("(") + 1: raw.rindex(")")])
        screening_data(datas)
        # Pause between pages to avoid hammering the API
        time.sleep(3)
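One fragile point in the script is the hard-coded data["data"]["result"][8], which silently picks the wrong group if Bilibili reorders the result blocks. Below is a minimal sketch of a more defensive lookup; it assumes the response is the requested __jp1(...) JSONP wrapper and that each group in "result" carries a "result_type" field naming its category ("video", etc.), which is how the all/v2 response was shaped when this was written. Treat both as assumptions to verify against a live response, not as a guaranteed API contract.

import json
import requests

def fetch_video_results(url, parameters, headers):
    # Assumption: the body is the requested JSONP wrapper, e.g. __jp1({...});
    # slice out only the outer wrapper before parsing.
    raw = requests.get(url, params=parameters, headers=headers, timeout=10).text
    payload = json.loads(raw[raw.index("(") + 1 : raw.rindex(")")])

    # Assumption: each group in "result" is tagged with a "result_type" key;
    # pick the "video" group instead of relying on index 8.
    for group in (payload.get("data") or {}).get("result") or []:
        if group.get("result_type") == "video":
            return group.get("data", [])
    return []

With that helper, screening_data() could iterate over fetch_video_results(url, parameters, headers) directly, so a change in the order of the result groups would not corrupt the output file.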