Python 简单的动漫排名爬虫

前两天刚看了两部动漫,找了找动漫排名,发现网上的排名有点老了,于是自己简单写了一点儿代码。代码非常简单,没有用多线程或多进程。


import json

from bs4 import BeautifulSoup
import requests

# Desktop Chrome 99 User-Agent string sent along with the scraping requests.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36'
headers = {'User-Agent': _USER_AGENT}

# Local SOCKS5 proxy endpoint, shared by both URL schemes.
# NOTE(review): `proxies` is not passed to any request in the code shown here.
_SOCKS5_PROXY = 'socks5://127.0.0.1:10808'
proxies = {
    'http': _SOCKS5_PROXY,
    'https': _SOCKS5_PROXY,
}


def MAL(links: list[str], limit: int = 100) -> dict[str, str]:
    """Collect anime titles and scores from MyAnimeList ranking pages.

    Each page is fetched in the order given and every element with class
    ``ranking-list`` is read as one entry. Each entry is also printed as it
    is found.

    Args:
        links: URLs of the ranking pages to fetch, in order.
        limit: Maximum number of entries to read before stopping. A
            non-positive value returns an empty result without fetching.

    Returns:
        A dict mapping title text to score text, in the order encountered.
        If the pages hold fewer than ``limit`` entries, everything that was
        found is returned (never ``None``).

    Raises:
        requests.RequestException: If a page cannot be fetched, times out,
            or answers with an HTTP error status.
    """
    values: dict[str, str] = {}
    if limit <= 0:
        return values

    num = 0
    for link in links:
        # Send the shared headers like the sibling scrapers do, and bound the
        # wait so a stalled connection cannot hang the script forever.
        res = requests.get(link, headers=headers, timeout=30)
        res.raise_for_status()
        soup = BeautifulSoup(res.content, 'lxml')
        for item in soup.find_all(class_='ranking-list'):
            # The rank itself lives under class 'top-anime-rank-text'
            # (per the original author's note) and is not collected here.
            name = item.find(class_="anime_ranking_h3").text
            score = item.find(class_="score-label").text
            print(name, score)
            values[name] = score
            num += 1
            if num >= limit:
                return values
    # Fewer than `limit` entries were available: return what was gathered.
    return values


def BGM(links: list[str], limit: int = 100) -> dict[str, str]:
    """Collect anime titles and scores from Bangumi ranking pages.

    Each page is fetched in the order given; every ``li`` inside the element
    with class ``browserFull`` is read as one entry. Each entry is also
    printed as it is found.

    Args:
        links: URLs of the ranking pages to fetch, in order.
        limit: Maximum number of entries to read before stopping. A
            non-positive value returns an empty result without fetching.

    Returns:
        A dict mapping title text to score text, in the order encountered.
        If the pages hold fewer than ``limit`` entries, everything that was
        found is returned (never ``None``).

    Raises:
        requests.RequestException: If a page cannot be fetched, times out,
            or answers with an HTTP error status.
    """
    values: dict[str, str] = {}
    if limit <= 0:
        return values

    num = 0
    for link in links:
        # Bound the wait so a stalled connection cannot hang the script.
        res = requests.get(link, headers=headers, timeout=30)
        res.raise_for_status()
        soup = BeautifulSoup(res.content, 'lxml')
        items = soup.find(class_='browserFull').find_all('li')
        for item in items:
            # The rank is available under class 'rank' as "Rank N"
            # (per the original author's note) and is not collected here.
            name = item.find(class_="l").text
            score = item.find(class_="fade").text
            print(name, score)
            values[name] = score
            num += 1
            if num >= limit:
                return values
    # Fewer than `limit` entries were available: return what was gathered.
    return values


def ANK(links: list[str], limit: int = 100) -> dict[str, str]:
    """Collect anime titles and scores from Anikore ranking pages.

    Anikore requires a login to view its rankings, so a session is used to
    log in once and keep the cookies for the page requests that follow.
    Each entry is also printed as it is found.

    Args:
        links: URLs of the ranking pages to fetch, in order.
        limit: Maximum number of entries to read before stopping. A
            non-positive value returns an empty result without logging in
            or fetching anything.

    Returns:
        A dict mapping title text to score text, in the order encountered.
        If the pages hold fewer than ``limit`` entries, everything that was
        found is returned (never ``None``).

    Raises:
        requests.RequestException: If the login request or a page request
            fails, times out, or answers with an HTTP error status.
    """
    values: dict[str, str] = {}
    if limit <= 0:
        return values

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        # The form keys are the `name` attributes of the e-mail and password
        # inputs on the login page; replace the placeholder values with your
        # own account and password.
        login = session.post(
            'https://www.anikore.jp/users/login/',
            data={
                'data[User][email]': "your_username@qq.com",
                'data[User][original_password]': 'your_password',
            },
            headers=headers,
            timeout=30,
        )
        login.raise_for_status()

        num = 0
        for link in links:
            res = session.get(link, headers=headers, timeout=30)
            res.raise_for_status()
            soup = BeautifulSoup(res.content, 'lxml')
            for unit in soup.find_all(class_='l-searchPageRanking_unit'):
                # Title and score are both looked up inside the unit's <h2>.
                heading = unit.find('h2')
                name = heading.find(class_="l-searchPageRanking_unit_title").text
                score = heading.find(class_="l-searchPageRanking_unit_score").text
                print(name, score)
                values[name] = score
                num += 1
                if num >= limit:
                    return values
    # Fewer than `limit` entries were available: return what was gathered.
    return values


if __name__ == "__main__":
    # Scrape the three sites one after another (no threads or processes).
    mal_pages = ["https://myanimelist.net/topanime.php?limit=%s" % offset
                 for offset in range(0, 500, 50)]
    mal_scores = MAL(mal_pages)

    bgm_pages = ['http://bangumi.tv/anime/browser?sort=rank&page=%s' % page
                 for page in range(1, 10)]
    bgm_scores = BGM(bgm_pages)

    ank_pages = ['https://www.anikore.jp/pop_ranking/page:%s' % page
                 for page in range(1, 10)]
    ank_scores = ANK(ank_pages)

    # Write one JSON file per site. ensure_ascii=False lets json emit
    # non-ASCII text (i.e. languages other than English) as-is instead of
    # \u escapes.
    outputs = (
        ('mal.json', mal_scores),
        ('bgm.json', bgm_scores),
        ('ank.json', ank_scores),
    )
    for filename, scores in outputs:
        with open(filename, 'w', encoding='utf8') as out:
            json.dump(scores, out, indent=4, ensure_ascii=False)

posted @ 2022-04-06 22:20  wztshine  阅读(140)  评论(0)  编辑  收藏  举报