爬虫练习一

# -*-coding:utf-8-*-
import requests
import re
from bs4 import BeautifulSoup


def get_encoding(response):
    """Pick the text encoding to use when decoding a response body.

    The starting point is ``response.apparent_encoding``.  When that guess is
    "ISO-8859-5" or "ptcp154", the ``charset=...`` declaration found in
    ``response.text`` replaces it, if the page has one.  GB2312 and GBK
    (in any letter case) are widened to GB18030.

    Args:
        response: Object exposing ``apparent_encoding`` and ``text``
            attributes (in this file, the result of ``requests.get``).

    Returns:
        The encoding name as a string; "utf-8" when no encoding was detected.
    """
    encoding = response.apparent_encoding
    if encoding in ("ISO-8859-5", "ptcp154"):
        # Capture only the charset token itself: optional opening quote, then
        # the name.  This handles both `charset=gb2312"` and `charset="utf-8"`
        # and cannot run on to a later quote on the same line.
        declared = re.search(
            r'charset\s*=\s*["\']?([\w.:-]+)', response.text, re.IGNORECASE
        )
        if declared:
            encoding = declared.group(1)
        # No declaration in the page: keep the detected value rather than fail.
    if not encoding:
        # Detection produced nothing usable; give the caller a decodable default.
        return "utf-8"
    # Pages usually declare the charset in lower case, so compare
    # case-insensitively.  GB18030 covers both GB2312 and GBK.
    if encoding.upper() in ("GB2312", "GBK"):
        encoding = "GB18030"
    return encoding


def get_data(url, timeout=10):
    """Download one chart page and extract the songs listed on it.

    The page is fetched with browser-like headers, decoded with the encoding
    chosen by ``get_encoding``, and parsed for ``<ul class="song-list">``.
    Every ``<li>`` after the first contributes one entry built from its
    ``div.song-tit`` and ``div.art-name`` elements.

    Args:
        url: Address of the chart page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{"song_name": ..., "art_name": ...}`` dicts; empty when
        the page has no song list.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    # NOTE(review): these headers look like a captured browser session. The
    # Cookie value is hard-coded and has presumably expired - confirm whether
    # the site needs it at all.
    # NOTE(review): "br" is advertised in Accept-Encoding; the body is only
    # decompressed if a brotli decoder is installed - confirm.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6",
        "Cache-Control": "max-age=0",
        "Host": "www.xmusic.io",
        "Cookie": "PHPSESSID=ln2a8hdjm5u8m94te8gpjjk1r6; __gads=ID=15a6e42d58fd3651-2261f4c943d300e5:T=1653040308:RT=1653040308:S=ALNI_MaqWHa2FiTrTRv6PsmJmYenFEvZVA; __gpi=UID=0000059a0ae692a3:T=1653040308:RT=1653040308:S=ALNI_MbgmfdzvyWFPyflOG1KicvIH5jhJw; usercode=1001180AZE212ECCZ842FDF5AFZ",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36",
    }

    # Without a timeout a stalled server would block the crawl forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as chart data.
    response.raise_for_status()

    encoding = get_encoding(response) or "utf-8"
    # A single undecodable byte should not discard the whole page.
    text = response.content.decode(encoding, errors="replace")

    soup = BeautifulSoup(text, "lxml")
    song_list = soup.find("ul", attrs={"class": "song-list"})
    if song_list is None:
        return []

    songs = []
    # The first <li> is skipped - presumably the table header row; confirm.
    for row in song_list.find_all("li")[1:]:
        title = row.find("div", attrs={"class": "song-tit"})
        artist = row.find("div", attrs={"class": "art-name"})
        if title is None or artist is None:
            # Row without the expected cells (e.g. an ad or spacer): skip it.
            continue
        songs.append({"song_name": title.text, "art_name": artist.text})
    return songs



def main():
    """Crawl every configured chart issue and write the songs to a text file.

    For each year/issue pair the chart page is fetched through ``get_data``
    and one tab-separated line per song is written to ``排行榜.txt``:
    issue label, song name, artist name.
    """
    # Issue numbers to fetch for each year.
    weeks_by_year = {
        "2017": list(range(1, 53)),
        "2018": list(range(1, 53)),
        "2019": list(range(1, 53)),
        "2020": list(range(1, 53)),
        "2021": list(range(1, 53)),
        # Issue 19 is absent from the original list; kept as-is.
        "2022": [*range(1, 19), 20],
    }

    # Explicit UTF-8 so the output does not depend on the platform default
    # encoding; the context manager closes the file even if a fetch fails.
    with open("排行榜.txt", "w", encoding="utf-8") as f:
        for year, weeks in weeks_by_year.items():
            for week in weeks:
                # NOTE(review): "xxxx" looks like a redacted placeholder for
                # the site's base URL - replace with the real scheme and host.
                url = "xxxx/charts/lists/10/%s/%s" % (year, week)
                print("当前解析url:%s" % url)
                for item in get_data(url):
                    # Fields are separated by a real tab character.
                    line = "\t".join(
                        ["%s第%s期" % (year, week), item["song_name"], item["art_name"]]
                    )
                    print("正在解析:%s第%s期%s" % (year, week, item["song_name"]))
                    f.write(line + "\n")


if __name__ == "__main__":
    main()
posted @ 2022-05-23 12:27  你看起来真的很好吃  阅读(30)  评论(0)  编辑  收藏  举报