64爬取b站,微博,ai问答等数据写入excel
# 功能1:获取手机号归属地
# 功能2:查询天气
# 功能3:查询百度热搜
# 功能4:查询微博热搜
# 功能5:查询b站
# 功能6 ai问答(在这用不了 涉及网站逆向写在另外一个py模块,没写入到这里)
# coding=gbk
# -*- coding: utf-8 -*-
import requests
import time
import os
import re
import pandas as pd
from lxml import etree
import io
import sys
import datetime
# Environment tweak applied as soon as this module is executed.
# NOTE(review): NO_PROXY is conventionally a comma-separated list of host
# names / suffixes; the value here is a full URL -- confirm it has the
# intended effect.
os.environ["NO_PROXY"] = "https://cc-api.sbaliyun.com/v1/completions"

# Default HTTP headers (a desktop Chrome User-Agent), referenced through the
# global name `headers` by the request code in this file.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/109.0.0.0 Safari/537.36"
    ),
}
# 手机号
def phone_number():
    """Interactively look up mobile numbers on ip138.com and print the result.

    Prompts until the user enters ``q``. For every other input the lookup
    page is requested and four table rows are printed (home location,
    carrier, area code and postal code). Any exception raised while fetching
    or parsing the page is reported with one fixed "invalid number" message,
    after which the prompt is shown again.
    """
    lookup_url = 'https://www.ip138.com/mobile.asp'
    while True:
        phoneNumber = input("查询的手机号(按q退出手机号查询):")
        if phoneNumber == 'q':
            return
        try:
            query = {'mobile': phoneNumber, 'action': 'mobile'}
            response = requests.get(url=lookup_url, headers=headers, params=query)
            page = etree.HTML(response.text)

            def first(xpath):
                # First node matched by `xpath`; raises IndexError when the
                # page has no such node, which lands in the handler below.
                return page.xpath(xpath)[0]

            # Each row is read twice: the cell's own text and the text of the
            # element nested inside it.
            region_cell = first('//div[@class="table"]/table/tbody/tr[2]/td/text()')  # home location
            region_span = first('//div[@class="table"]/table/tbody/tr[2]/td/span/text()')  # home location
            carrier_cell = first('//div[@class="table"]/table/tbody/tr[3]/td/text()')  # carrier
            carrier_link = first('//div[@class="table"]/table/tbody/tr[3]/td/a/text()')  # carrier
            area_cell = first('//div[@class="table"]/table/tbody/tr[5]/td/text()')  # area code
            area_link = first('//div[@class="table"]/table/tbody/tr[5]/td/a/text()')  # area code
            post_cell = first('//div[@class="table"]/table/tbody/tr[6]/td/text()')  # postal code
            post_link = first('//div[@class="table"]/table/tbody/tr[6]/td/a/text()')  # postal code

            print("查询结果如下:")
            print(
                f'{region_cell}-->{region_span}\n'
                f'{carrier_cell}--->{carrier_link}\n'
                f'{area_cell}------>{area_link}\n'
                f'{post_cell}------>{post_link}'
            )
        except Exception:
            print("输入的手机号格式不正确,请重新输入!")
# 天气
def get_weather():
    """Interactively search wentian123.com for a place and print its weather.

    Prompts until the user enters ``q``. For every other input the search
    page is fetched, each result row whose place name contains the entered
    text is printed (place, condition, temperature, wind), and a line with
    the first weekday cell plus the local time follows. Any exception raised
    along the way is reported with one fixed message and the prompt repeats.
    """
    search_url = 'https://www.wentian123.com/search/'
    while True:
        location = input("输入查询的地区(按q退出天气查询):")
        if location == 'q':
            return
        try:
            response = requests.get(url=search_url, headers=headers, params={'location': location})
            page = etree.HTML(response.text)
            places = page.xpath('//div[@class="table-inner"]/table//td/a/text()')  # place name
            weekdays = page.xpath('//div[@class="table-inner"]/table/tbody/tr/td[1]/p[1]/text()')  # day of week
            conditions = page.xpath('//div[@class="table-inner"]/table/tbody/tr/td[2]/p[2]/span/text()')  # weather condition
            temperatures = page.xpath('//div[@class="table-inner"]/table/tbody/tr/td[3]/p/text()')  # temperature
            winds = page.xpath('//div[@class="table-inner"]/table/tbody/tr/td[4]/p/text()')  # wind direction / force
            for place, condition, temperature, wind in zip(places, conditions, temperatures, winds):
                if location not in place:
                    continue
                print(place, condition, temperature, wind)
            # weekdays[0] raises IndexError when the page had no result rows;
            # that is reported by the handler below like any other failure.
            print(f'现在是:\t{weekdays[0]}\t{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}')
        except Exception:
            print("输入的地区有误,请重写输入!")
# 百度热搜
def bai_du_rei_sou():
    """Interactively print one of Baidu's hot-search boards.

    Shows a numbered menu, reads the user's choice and prints every
    title / hot-index pair scraped from ``https://top.baidu.com/board`` for
    the chosen tab, followed by the local time. Choice ``7`` leaves the loop.
    Non-numeric or out-of-range choices, and any failure while fetching or
    parsing the page, print the same retry message and show the menu again.
    """
    # (value of the `tab` query parameter, display name) in menu order;
    # menu number N maps to index N - 1.
    boards = [
        ('realtime', '热搜榜'),
        ('novel', '小说'),
        ('movie', '电影'),
        ('teleplay', '电视剧'),
        ('car', '汽车'),
        ('game', '游戏'),
    ]
    url = 'https://top.baidu.com/board'
    while True:
        print('1.热搜榜\t 2.小说\t 3.电影\t 4.电视剧\t 5.汽车\t 6.游戏\t 7.退出百度热榜单查询')
        try:
            num = int(input("输入你要查询的榜单:"))
        except ValueError:
            print("输入格式错误,请重新输入!")
            continue
        if num == 7:
            break
        if not 1 <= num <= len(boards):
            # Reject 0 and negative numbers explicitly: used as list indexes
            # they would wrap around and silently pick a board from the end.
            print("输入格式错误,请重新输入!")
            continue
        tab, label = boards[num - 1]
        try:
            print(f'正在查询{label}的榜单:')
            res = requests.get(url=url, headers=headers, params={'tab': tab})
            e = etree.HTML(res.text)
            titles = e.xpath('//div[@class="c-single-text-ellipsis"]/text()')  # title
            hot_nums = e.xpath('//div[@class="hot-index_1Bl1a"]/text()')  # hot-search index
            for t, h in zip(titles, hot_nums):
                print(f'标题:{t}------>热度:{h}')
            print(f'北京时间:{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}')
        except Exception:
            # Request or parsing failure; reported with the same retry
            # message so the menu loop keeps running.
            print("输入格式错误,请重新输入!")
# 微博热搜
def weibo_search():
    """Interactively print one of Weibo's hot-search boards.

    Shows a numbered menu, reads the user's choice and prints the entries
    scraped from ``https://s.weibo.com/top/summary`` for the chosen category,
    followed by the local time. Choice ``4`` leaves the loop. Non-numeric or
    out-of-range choices, and any failure while fetching or parsing the page,
    print the same retry message and show the menu again.
    """
    url = 'https://s.weibo.com/top/summary'
    # Values of the `cate` query parameter, in menu order (menu N -> index N - 1).
    categories = ['realtimehot', 'socialevent', 'entrank']
    # Request headers specific to this site, including a hard-coded cookie.
    # Kept under its own name so the module-level `headers` is not shadowed.
    # NOTE(review): the cookie is presumably a captured visitor session and
    # may expire -- confirm.
    request_headers = {
        'cookie': 'SINAGLOBAL=690519784757.2731.1671192419517; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WFB2MFg.53.mACIaAgd8wTi5JpVF020e05Neh5XSoMp; SUB=_2AkMUwi8HdcPxrAZZnPoTymngb49H-jynF0bxAn7uJhMyAxh87nwzqSVutBF-XMKjNdhFviACxIXacTNM_j5vca_y; _s_tentry=www.google.com; UOR=,,www.google.com; Apache=8260187671478.501.1675384443714; ULV=1675384443775:3:1:1:8260187671478.501.1675384443714:1671340035730',
        'referer': 'https://www.google.com/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
    }
    while True:
        print('1.热搜榜\t 2.要闻榜\t 3.文娱榜\t 4.退出微博热榜单查询')
        try:
            num = int(input("输入你要查询的榜单:"))
        except ValueError:
            # Non-numeric input is reported instead of aborting the program.
            print("存在响应或输入问题!重新查询!")
            continue
        if num == 4:
            break
        if not 1 <= num <= len(categories):
            # Reject 0 and negative numbers explicitly: used as list indexes
            # they would wrap around and silently pick another category.
            print("存在响应或输入问题!重新查询!")
            continue
        cate = categories[num - 1]
        try:
            res = requests.get(url=url, headers=request_headers, params={'cate': cate})
            e = etree.HTML(res.text)
            if cate != 'socialevent':
                xh = e.xpath('//div[@id="pl_top_realtimehot"]//tbody/tr/td[1]/text()')  # rank number
                biao_t = e.xpath('//div[@id="pl_top_realtimehot"]//tbody/tr/td[2]/a/text()')  # pinned entry + titles
                # The first link is printed as the pinned entry; the rest are
                # paired with the rank numbers.
                print(f"置顶:------->{biao_t[0]}")
                for x, b in zip(xh, biao_t[1:]):
                    print(f"{x}-------->{b}")
            else:
                biaot = e.xpath('//div[@id="pl_top_realtimehot"]/table/tbody/tr/td[2]/a/text()')  # titles
                for i in biaot:
                    # Drop the first and last character of each title.
                    print(f'o------>{i[1:-1]}')
            print(f'北京时间:{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}')
        except Exception:
            # Request or parsing failure; keep the menu loop running.
            print("存在响应或输入问题!重新查询!")
# b站类
class Bili:
    """Interactive console views over bilibili's ranking / popular listings.

    Every public method prints its results to stdout. ``get_bili`` and
    ``zong_he_rm`` can additionally export what they printed to an ``.xlsx``
    file, written relative to the current working directory.
    """

    # Runs once, when this class body is executed: switch stdout to GB18030.
    # NOTE(review): presumably so that titles with characters outside the
    # console's legacy code page can be printed -- confirm.
    # `reconfigure` changes the encoding of the existing stream in place
    # instead of layering a second TextIOWrapper over the same buffer (which
    # would close that buffer when the extra wrapper is garbage-collected);
    # the guard skips replacement streams that cannot be reconfigured.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding='gb18030')

    # Headers sent with every request made by this class.
    headers = {
        'accept': 'application/json, text/plain, */*',
        'origin': 'https://www.bilibili.com',
        'referer': 'https://www.bilibili.com/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
    }

    # (ranking-page URL slug, display name) in menu order; menu number N in
    # get_bili() maps to index N - 1.
    _RANKS = (
        ('all', '全站'), ('bangumi', '番剧'), ('guochan', '国产动画'),
        ('guochuang', '国创相关'), ('documentary', '纪录片'), ('douga', '动画'),
        ('music', '音乐'), ('dance', '舞蹈'), ('game', '游戏'),
        ('knowledge', '知识'), ('tech', '科技'), ('sports', '运动'),
        ('car', '汽车'), ('life', '生活'), ('food', '美食'),
        ('animal', '动物圈'), ('kichiku', '鬼畜'), ('fashion', '时尚'),
        ('ent', '娱乐'), ('cinephile', '影视'), ('movie', '电影'),
        ('tv', '电视剧'), ('variety', '综艺'), ('origin', '原创'),
        ('rookie', '新人'),
    )

    def _fetch_list(self, url, params=None):
        """GET *url* with the class headers and return ``data.list`` from the JSON reply."""
        res = requests.get(url, headers=self.headers, params=params)
        return res.json()['data']['list']

    @staticmethod
    def _print_videos(datas):
        """Print one summary line (followed by a blank line) per video record."""
        for data in datas:
            print(
                f'{data["tname"]}-------标题:{data["title"]}-----up:{data["owner"]["name"]}------播放量:{data["stat"]["view"]}-----评论数:{data["stat"]["reply"]}------投币数:{data["stat"]["coin"]}-------点赞数{data["stat"]["like"]}')
            print()

    @staticmethod
    def _save_excel(rows, label):
        """Write *rows* (a list of dicts) to ``<YYYY_MM_DD><label>.xlsx``."""
        today = datetime.datetime.today()
        print(today)
        filename = today.strftime('%Y_%m_%d') + f'{label}.xlsx'
        # Pass the path straight to to_excel: its `encoding` argument and
        # ExcelWriter.save() no longer exist in pandas 2.x.
        pd.DataFrame(rows).to_excel(filename, index=False)

    # Ranking boards
    def get_bili(self):
        """Interactively print (and optionally save) one ranking board.

        Shows the numbered list of boards, scrapes the chosen ranking page and
        prints title / uploader / plays / comments for each entry, then offers
        to save the rows to an Excel file. Choice ``100`` leaves the loop.
        Non-numeric or out-of-range choices, and any failure while fetching,
        parsing or saving, print the same retry message and show the menu
        again.
        """
        while True:
            print('1.全站 2.番剧 3.国产动画 4.国创相关 5.纪录片 6.动画 7.音乐 8.舞蹈 9.游戏 10.知识 11.科技 12.运动 13.汽车 14.生活 15.美食 '
                  '16.动物圈 17.鬼畜 18.时尚 19.娱乐 20.影视 21.电影 22.电视剧 23.综艺 24.原创 25.新人 100.退出当前查询')
            try:
                num = int(input("你要查询的类型榜单:"))
            except ValueError:
                # Non-numeric input is reported instead of aborting the program.
                print("输入格式有误或响应错误,重新输入")
                continue
            if num == 100:
                break
            if not 1 <= num <= len(self._RANKS):
                # Reject 0 and negative numbers explicitly: used as indexes
                # they would wrap around and silently pick another board.
                print("输入格式有误或响应错误,重新输入")
                continue
            slug, name = self._RANKS[num - 1]
            try:
                print(f'正在查询{name}的榜单:')
                url = f'https://www.bilibili.com/v/popular/rank/{slug}'
                # `headers` must be passed by keyword: the second positional
                # parameter of requests.get is `params`.
                res = requests.get(url, headers=self.headers)
                e = etree.HTML(res.text)
                total = e.xpath('//div[@class="detail"]//span/text()')  # uploader, plays, comments
                bt = e.xpath('//div[@class="info"]/a/text()')  # titles
                lis = [i.strip().replace('\n', '') for i in total]
                # Regroup the flat span texts into chunks of three per video.
                lis2 = [lis[i:i + 3] for i in range(0, len(lis), 3)]
                print(e.xpath('//div[@id="app"]//ul[@class="rank-tab"]/li/text()'))
                for i, b in zip(lis2, bt):
                    print(f'标题:{b}------------up:{i[0]}------------播放:{i[1]}------------评论:{i[2]}')
                print(f'查询{name}榜单完毕!')
                save = input("是否保存到本地? 'y/n':")
                if save == 'y':
                    rows = [
                        {"标题": b, "up": i[0], "播放量": i[1], "评论": i[2]}
                        for i, b in zip(lis2, bt)
                    ]
                    self._save_excel(rows, name)
                    print(f'{name}已保存!')
            except Exception:
                print("输入格式有误或响应错误,重新输入")

    # Overall popular videos
    def zong_he_rm(self):
        """Print the first 50 entries of the popular feed and offer to save them."""
        url = 'https://api.bilibili.com/x/web-interface/popular'
        params = {
            'ps': '50',  # number of entries requested
            'pn': '1'
        }
        datas = self._fetch_list(url, params)
        self._print_videos(datas)
        save = input("是否保存到本地? 'y/n':")
        if save == 'y':
            rows = [
                {
                    "板块": data["tname"],
                    "标题": data["title"],
                    "up": data["owner"]["name"],
                    "播放量": data["stat"]["view"],
                    "评论数": data["stat"]["reply"],
                    "投币数": data["stat"]["coin"],
                    "点赞数": data["stat"]["like"]
                }
                for data in datas
            ]
            self._save_excel(rows, 'b站综合热门')
            print('b站综合热门已保存!')

    # Weekly must-watch, one issue
    def weekly(self, num_page):
        """Print the videos of weekly issue *num_page*.

        *num_page* is sent as the ``number`` query parameter. A failed request
        or a reply without a video list prints a message instead of raising,
        matching ``music_``.
        """
        url = 'https://api.bilibili.com/x/web-interface/popular/series/one'
        try:
            datas = self._fetch_list(url, {'number': num_page})
            self._print_videos(datas)
        except Exception:
            print("响应超时或改期数不存在! 请查询输入!")

    # Weekly must-watch, list of issues
    def week2(self):
        """Print every weekly issue and return the list of issue numbers."""
        url = 'https://api.bilibili.com/x/web-interface/popular/series/list'
        lists = self._fetch_list(url)
        num_lis = []
        for i in lists:
            print(f'期数:{i["number"]}----------{i["subject"]}-----------{i["name"]}')
            num_lis.append(i["number"])
        return num_lis

    # Must-see classics
    def r_z(self):
        """Print the first 100 entries of the "precious" popular list."""
        url = 'https://api.bilibili.com/x/web-interface/popular/precious'
        params = {
            'page_size': '100',
            'page': '1'
        }
        self._print_videos(self._fetch_list(url, params))

    # Site-wide music chart
    def music_(self, num):
        """Print the music chart whose ``list_id`` is *num*.

        A failed request or a reply without a list prints a message instead of
        raising.
        """
        url = 'https://api.bilibili.com/x/copyright-music-publicity/toplist/music_list'
        try:
            datas = self._fetch_list(url, {'list_id': num})
            for data in datas:
                print(
                    f'歌名:{data["music_title"]}-----歌手:{data["singer"]}-----热度:{data["heat"]}-----播放量:{data["creation_play"]}------up:{data["creation_nickname"]}------成就:{data["achievements"]}')
                print()
        except Exception:
            print("响应超时或改期数不存在! 请查询输入!")
def _bili_menu():
    """Run the bilibili sub-menu until the user chooses option 6."""
    bl = Bili()
    print("----进入查询b站界面----")
    while True:
        print("1.综合热门 2.每周必看 3.入站必刷 4.排行榜 5.全站音乐榜 6.退出当前查询")
        n5 = input("输入你要查询的板块:")
        if n5 == '1':
            bl.zong_he_rm()
        elif n5 == '2':
            # List every issue once, then let the user open issues one by one.
            qi_shu = bl.week2()
            while True:
                print(qi_shu)
                num = input("输入你要查看的期数(按q退出):")
                if num == 'q':
                    break
                bl.weekly(num)
        elif n5 == '3':
            bl.r_z()
        elif n5 == '4':
            bl.get_bili()
        elif n5 == '5':
            while True:
                num1 = input('输入查询期号(按q退出!):')
                if num1 == 'q':
                    break
                bl.music_(num=num1)
        elif n5 == '6':
            return


def main():
    """Top-level interactive menu of the tool.

    Prints the menu, reads one choice and dispatches to the matching query
    function; this repeats until the user enters ``q``, which terminates the
    program by raising ``SystemExit``. Unknown choices print a message and
    show the menu again.
    """
    rule = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
    while True:
        print(rule)
        print('->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ???Welcome to into???')
        print('->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 1:查询手机号')
        print('->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 2:查询天气')
        print('->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 3:查询百度热搜')
        print('->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 4:查询微博热搜')
        print('->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 5:查询b站页面')
        print('->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 6:ai问答区')
        print('->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t q:退出查询')
        print(rule)
        order = input("输入查询的序号:")
        if order == '1':
            print("----进入查询手机号界面----")
            phone_number()
        elif order == '2':
            print("----进入查询天气界面----")
            get_weather()
        elif order == '3':
            print("----进入查询百度热搜界面----")
            bai_du_rei_sou()
        elif order == '4':
            print("----进入查询微博热搜界面----")
            weibo_search()
        elif order == '5':
            _bili_menu()
        elif order == '6':
            # Imported on demand: the AI Q&A code lives in a separate module.
            from re_Ai import input_get_info
            input_get_info()
        elif order == 'q':
            # sys.exit() rather than the exit() helper: that one is injected
            # by the site module for interactive sessions and is not
            # guaranteed to exist in every run mode.
            sys.exit()
        else:
            print("输入的格式有误!请重新输入")
# Start the interactive menu only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
本文来自博客园,作者:__username,转载请注明原文链接:https://www.cnblogs.com/code3/p/17091119.html