# LMJ的小爬虫 (LMJ's little web crawler)
import pprint
import requests as req
from lxml import etree
import json5
import pymysql
# Shared HTTP session reused for all requests (keeps connections alive).
session = req.session()
# Don't pick up proxy/.netrc settings from the environment; send requests
# directly (requests' trust_env flag).
session.trust_env = False
def exe_sql(sql, *args, **kwargs):
    """Execute *sql* against the MySQL database and commit.

    Extra positional *args* are forwarded to ``cursor.execute`` as query
    parameters. ``**kwargs`` is accepted for call-site flexibility but is
    currently ignored. Returns the row count reported by ``execute``.
    """
    # NOTE(review): credentials are hard-coded placeholders — should come
    # from configuration or environment variables, not source code.
    db_settings = {
        'user': 'root',
        'password': '数据库密码',
        'host': 'xx',
        'database': 'xxx',
        'charset': 'utf8mb4',  # avoid mojibake with non-ASCII content
    }
    with pymysql.connect(**db_settings) as conn:
        with conn.cursor() as cur:
            affected = cur.execute(sql, *args)
            conn.commit()
            return affected
def create_table():
    """Create the ``anime_info2`` table if it does not already exist.

    Uses ``CREATE TABLE IF NOT EXISTS`` so the script can be re-run:
    the original plain ``CREATE TABLE`` failed on every run after the
    first, because ``main`` calls this unconditionally.
    """
    sql = """
    CREATE TABLE IF NOT EXISTS anime_info2 (
    id INT NOT NULL AUTO_INCREMENT,
    title VARCHAR(255) NOT NULL,
    link VARCHAR(255) NOT NULL,
    thumbnail VARCHAR(255) NOT NULL,
    description TEXT NOT NULL,
    year VARCHAR(255) NOT NULL,
    region VARCHAR(255) NOT NULL,
    type VARCHAR(255) NOT NULL,
    director VARCHAR(255) NOT NULL,
    scenarist VARCHAR(255) NOT NULL,
    performer VARCHAR(255) NOT NULL,
    updateDate DATE NOT NULL,
    status VARCHAR(255) NOT NULL,
    PRIMARY KEY (id)
    );
    """
    exe_sql(sql)
def insert_anime(data):
    """Insert one anime record into ``anime_info2``.

    *data* is a dict keyed by column name (as built by ``get_anime``).
    Named ``%(key)s`` placeholders keep the query parameterized, so the
    values are escaped by the driver (no SQL injection). Returns the
    affected row count from ``exe_sql``.
    """
    # Plain string literal: the original used an f-string with nothing to
    # interpolate, which is misleading next to %()s placeholders.
    sql = 'INSERT INTO anime_info2 (`title`, `link`, `thumbnail`, `description`, `year`, `region`, `type`, `director`, `scenarist`, `performer`, `updateDate`, `status`) VALUES (%(title)s, %(link)s, %(thumbnail)s, %(description)s, %(year)s, %(region)s, %(type)s, %(director)s, %(scenarist)s, %(performer)s, %(updateDate)s, %(status)s)'
    return exe_sql(sql, data)
def get_anime(html, title: str, link: str, thumbnail: str):
    """Extract one anime's detail fields from its parsed detail page.

    Parameters
    ----------
    html : object supporting ``.xpath`` (lxml element tree of the page)
    title, link, thumbnail : str
        Values already scraped from the listing page, copied through.

    Returns a dict containing every column ``insert_anime`` expects.
    All scraped fields default to '' when the page layout is unexpected:
    the original version only set director/performer/updateDate/status
    when there were exactly 5 or 6 info rows, so any other layout caused
    a KeyError in the subsequent parameterized INSERT. Bare ``[0]``
    indexing on xpath results could likewise raise IndexError.
    """
    def _first(values, default=''):
        # First element of an xpath result list, or a safe default.
        return values[0] if values else default

    # Full-width ideographic spaces (U+3000) pad the description; strip them.
    description = _first(
        html.xpath('.//div[@class="module-info-introduction-content"]/p/text()')
    ).replace('\u3000', '')

    result = {
        "title": title,
        "link": link,
        "thumbnail": thumbnail,
        "description": description,
        "year": _first(html.xpath('.//div[@class="module-info-tag"]/div[1]/a/@title')),
        "region": _first(html.xpath('.//div[@class="module-info-tag"]/div[2]/a/@title')),
        "type": _first(html.xpath('.//div[@class="module-info-tag"]/div[3]/a/text()')),
        # Defaults guarantee every column key exists for insert_anime.
        "director": "",
        "scenarist": "",
        "performer": "",
        "updateDate": "",
        "status": "",
    }

    info_items = html.xpath('.//div[@class="module-info-item"]')

    def _joined_links(item):
        # Comma-join the link texts of one info row (e.g. several actors).
        return ','.join(item.xpath('.//div[@class="module-info-item-content"]/a/text()'))

    def _plain_text(item):
        # Plain text content of one info row (e.g. a date or status).
        return _first(item.xpath('.//div[@class="module-info-item-content"]/text()'))

    if len(info_items) == 6:
        # Layout variant that includes a scenarist row.
        result['director'] = _joined_links(info_items[0])
        result['scenarist'] = _joined_links(info_items[1])
        result['performer'] = _joined_links(info_items[2])
        result['updateDate'] = _plain_text(info_items[3])
        result['status'] = _plain_text(info_items[4])
    elif len(info_items) == 5:
        # Layout variant without a scenarist row.
        result['director'] = _joined_links(info_items[0])
        result['performer'] = _joined_links(info_items[1])
        result['updateDate'] = _plain_text(info_items[2])
        result['status'] = _plain_text(info_items[3])
    return result
def main():
    """Crawl every listing page, fetch each detail page, store the records.

    The listing URL pattern covers pages 1..13 of category 4 on voflix.
    For each poster card we scrape name/link/thumbnail, fetch the detail
    page, parse it with ``get_anime`` and persist it via ``insert_anime``.
    """
    create_table()
    index = 1  # running record counter across all pages, for progress output
    for page in range(1, 14):
        url = f'https://www.voflix.me/show/4--------{page}---.html'
        listing = etree.HTML(session.get(url).text)
        cards = listing.xpath('//body//div[@class="module-main module-page "]//a[@class="module-poster-item module-item"]')
        for card in cards:
            card_link = 'https://www.voflix.me' + card.get('href')
            thumbnail = card.xpath(".//img/@data-original")[0]
            name = card.xpath('.//div[@class="module-poster-item-title"]/text()')[0]
            print(index, name, card_link, thumbnail)
            detail = etree.HTML(session.get(card_link).text)
            insert_anime(get_anime(detail, name, card_link, thumbnail))
            index += 1


if __name__ == '__main__':
    main()
# 本文来自博客园,作者:__username,转载请注明原文链接:https://www.cnblogs.com/code3/p/17389105.html