LMJ的小爬虫

import pprint

import requests as req
from lxml import etree
import json5
import pymysql

session = req.session()
session.trust_env = False


def exe_sql(sql, *args, **kwargs):
    """Execute one SQL statement on a new connection and commit it.

    Args:
        sql: The statement text, optionally containing PyMySQL placeholders.
        *args: Forwarded positionally to ``cursor.execute`` after ``sql``
            (in practice a single parameter tuple or mapping).
        **kwargs: Accepted for call compatibility but not used.

    Returns:
        Whatever ``cursor.execute`` returns for the statement.
    """
    connection = pymysql.connect(
        user='root',
        password='数据库密码',
        host='xx',
        database='xxx',
        charset='utf8mb4',  # avoids mojibake for non-ASCII text
    )
    with connection as conn:
        with conn.cursor() as cursor:
            outcome = cursor.execute(sql, *args)
            conn.commit()
    return outcome

def create_table():
    """Create the ``anime_info2`` table unless it already exists.

    ``IF NOT EXISTS`` makes the call safe to repeat: the script invokes this
    on every start, and a plain ``CREATE TABLE`` would fail on the second
    run because the table is already there.
    """
    sql = """
        CREATE TABLE IF NOT EXISTS anime_info2 (
          id INT NOT NULL AUTO_INCREMENT,
          title VARCHAR(255) NOT NULL,
          link VARCHAR(255) NOT NULL,
          thumbnail VARCHAR(255) NOT NULL,
          description TEXT NOT NULL,
          year VARCHAR(255) NOT NULL,
          region VARCHAR(255) NOT NULL,
          type VARCHAR(255) NOT NULL,
          director VARCHAR(255) NOT NULL,
          scenarist VARCHAR(255) NOT NULL,
          performer VARCHAR(255) NOT NULL,
          updateDate DATE NOT NULL,
          status VARCHAR(255) NOT NULL,
          PRIMARY KEY (id)
        );
    """
    exe_sql(sql)



def insert_anime(data):
    """Insert one row into ``anime_info2``.

    Args:
        data: Mapping that supplies a value for every column named below;
            values are bound through named ``%(column)s`` placeholders.

    Returns:
        The result of ``exe_sql`` for the INSERT statement.
    """
    columns = (
        'title', 'link', 'thumbnail', 'description', 'year', 'region',
        'type', 'director', 'scenarist', 'performer', 'updateDate', 'status',
    )
    column_list = ', '.join(f'`{name}`' for name in columns)
    placeholder_list = ', '.join(f'%({name})s' for name in columns)
    statement = f'INSERT INTO anime_info2 ({column_list}) VALUES ({placeholder_list})'
    return exe_sql(statement, data)

def _first(node, expr, default=''):
    """Return the first result of ``node.xpath(expr)``, or ``default`` if none."""
    matches = node.xpath(expr)
    return matches[0] if matches else default


def get_anime(html, title: str, link: str, thumbnail: str):
    """Build the record for one title from its parsed detail page.

    Args:
        html: Parsed detail page; only its ``xpath`` method is used.
        title: Title taken from the listing card.
        link: Absolute URL of the detail page.
        thumbnail: Poster image URL taken from the listing card.

    Returns:
        A dict that always contains ``title``, ``link``, ``thumbnail``,
        ``description``, ``year``, ``region``, ``type``, ``director``,
        ``scenarist``, ``performer``, ``updateDate`` and ``status``.
        Any field that cannot be located on the page is ``''``, so the
        record can always be bound to the named INSERT placeholders.
        Note that an empty ``updateDate`` may still be rejected by a
        DATE column.
    """
    content = './/div[@class="module-info-item-content"]'
    result = {
        "title": title,
        "link": link,
        "thumbnail": thumbnail,
        # Ideographic spaces (U+3000) are used as indentation in the synopsis.
        "description": _first(
            html, './/div[@class="module-info-introduction-content"]/p/text()'
        ).replace('\u3000', ''),
        "year": _first(html, './/div[@class="module-info-tag"]/div[1]/a/@title'),
        "region": _first(html, './/div[@class="module-info-tag"]/div[2]/a/@title'),
        "type": _first(html, './/div[@class="module-info-tag"]/div[3]/a/text()'),
        "director": "",
        "scenarist": "",
        "performer": "",
        "updateDate": "",
        "status": "",
    }

    # The info items are positional. Pages with six items carry a scenarist
    # row after the director; pages with five items omit it. Any other count
    # is an unknown layout and keeps the defaults above.
    layouts = {
        6: ('director', 'scenarist', 'performer', 'updateDate', 'status'),
        5: ('director', 'performer', 'updateDate', 'status'),
    }
    linked_fields = {'director', 'scenarist', 'performer'}

    info_items = html.xpath('.//div[@class="module-info-item"]')
    for field, item in zip(layouts.get(len(info_items), ()), info_items):
        if field in linked_fields:
            # People are rendered as one <a> per name; store them comma-joined.
            result[field] = ','.join(item.xpath(content + '/a/text()'))
        else:
            result[field] = _first(item, content + '/text()')

    return result



if __name__ == '__main__':
    # Crawl every listing page, follow each card to its detail page, and
    # store one database row per title.
    BASE_URL = 'https://www.voflix.me'
    LAST_PAGE = 13
    # Without a timeout, requests can wait indefinitely on a stalled server.
    REQUEST_TIMEOUT = 10  # seconds

    create_table()
    index = 1  # running count across all pages, used only for progress output
    for page in range(1, LAST_PAGE + 1):
        url = f'{BASE_URL}/show/4--------{page}---.html'
        listing = etree.HTML(session.get(url, timeout=REQUEST_TIMEOUT).text)
        # The trailing space inside "module-main module-page " is part of
        # the class attribute being matched.
        cards = listing.xpath('//body//div[@class="module-main module-page "]//a[@class="module-poster-item module-item"]')
        for card in cards:
            cardLink = BASE_URL + card.get('href')
            originalImg = card.xpath(".//img/@data-original")[0]
            cardName = card.xpath('.//div[@class="module-poster-item-title"]/text()')[0]
            print(index, cardName, cardLink, originalImg)

            detail = etree.HTML(session.get(cardLink, timeout=REQUEST_TIMEOUT).text)
            insert_anime(get_anime(detail, cardName, cardLink, originalImg))
            index += 1
posted @ 2023-05-10 19:29  __username  阅读(21)  评论(1)  编辑  收藏  举报

本文作者:DIVMonster

本文链接:https://www.cnblogs.com/guangzan/p/12886111.html

版权声明:本作品采用知识共享署名-非商业性使用-禁止演绎 2.5 中国大陆许可协议进行许可。