e-hentai爬虫(更新)

使用了BeautifulSoup，re和urllib模块

#! /usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Orange'
from urllib import request, parse
import re
from bs4 import BeautifulSoup
class Item(object):
    """One gallery row scraped from a search-results page.

    Attributes:
        time: Timestamp text of the row.
        img_url: URL of the row's thumbnail image.
        tittle: Gallery title. The misspelled name is kept on purpose
            because it is part of the constructor signature.
        down: Torrent download link, or a placeholder string when the row
            has no torrent.
    """

    # Class-level defaults; every instance overrides them in __init__.
    time = ''
    img_url = ''
    tittle = ''
    down = ''

    def __init__(self, time, img_url, tittle, down) -> None:
        self.time = time
        self.img_url = img_url
        self.tittle = tittle
        self.down = down

    def __repr__(self) -> str:
        # Without this, printing a list of items only shows object addresses.
        return '{}(time={!r}, img_url={!r}, tittle={!r}, down={!r})'.format(
            type(self).__name__,
            self.time,
            self.img_url,
            self.tittle,
            self.down,
        )
def getPages(html):
    """Return the numeric link labels found in *html*, in document order.

    A label is the run of digits sitting between ``onclick="return false">``
    and ``</a>`` (presumably the pager links of a results page). Each label
    is returned as a string; the list is empty when nothing matches.
    """
    numbered_link = re.compile(r'onclick="return false">(\d+)</a>')
    return [found.group(1) for found in numbered_link.finditer(html)]
def get_url(pages, key):
    """Build the search URL for one results page.

    Args:
        pages: Value sent as the ``page`` query parameter.
        key: Search keyword sent as ``f_search``.

    Returns:
        The full ``https://e-hentai.org/?...`` URL with every category
        filter switched on and the query string URL-encoded.
    """
    # Insertion order determines the order of parameters in the query string.
    query = {'page': pages}
    for category_flag in (
        'f_doujinshi',
        'f_manga',
        'f_artistcg',
        'f_gamecg',
        'f_western',
        'f_non-h',
        'f_imageset',
        'f_cosplay',
        'f_asianporn',
        'f_misc',
    ):
        query[category_flag] = 'on'
    query['f_search'] = key
    query['f_apply'] = 'Apply Filter'
    return "https://e-hentai.org/?" + parse.urlencode(query)
def gethtml(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The request carries a desktop-browser ``User-Agent`` header. Errors
    raised by ``urlopen`` or by decoding propagate to the caller.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
    }
    req = request.Request(url, headers=browser_headers)
    with request.urlopen(req) as response:
        return response.read().decode('utf-8')
def getDown(url):
    """Return the first ``https://ehtracker.org/get/`` link on the page at *url*.

    The page is downloaded with ``gethtml`` and parsed with BeautifulSoup.
    Raises ``IndexError`` when the page contains no such link.
    """
    page = BeautifulSoup(gethtml(url), 'html.parser')
    tracker_links = page.find_all(href=re.compile(r'https://ehtracker.org/get/'))
    first_link = tracker_links[0]
    return first_link['href']
def getItems(html):
    """Parse every gallery row of a results page into ``Item`` objects.

    Rows are the ``<tr>`` elements with class ``gtr0`` or ``gtr1``. Each
    parsed row is also printed as it is processed.

    Args:
        html: Markup of one search-results page.

    Returns:
        A list with one ``Item`` per matching row, in document order. The
        list is empty when the page has no matching rows.

    Raises:
        IndexError: If a row lacks one of the expected cells, images or
            ``~``-separated fields.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Compiled once instead of once (or twice) per row.
    torrent_href = re.compile(r'https://e-hentai.org/gallerytorrents.php')
    items = []
    for tr in soup.find_all('tr', class_=['gtr0', 'gtr1']):
        time = tr.find_all('td', {'style': "white-space:nowrap"})[0].string
        content = tr.find_all(class_='it2')[0].string
        if not content:
            # No text payload in the cell: read the data from the row's
            # second <img> tag instead.
            thumb = tr.find_all('img')[1]
            img_url = thumb['src']
            tittle = thumb['alt']
        else:
            # Text payload: '~'-separated fields, of which index 2 is the
            # image path and index 3 the title.
            mix = content.split('~')
            img_url = 'https://ehgt.org/' + mix[2]
            tittle = mix[3]
        torrent_links = tr.find_all(href=torrent_href)
        if torrent_links:
            down = getDown(torrent_links[0]['href'])
        else:
            # Placeholder stored when the row offers no torrent page.
            down = 'em'
        print(time, img_url, tittle, down)
        # Collect every row; previously only the last one was returned.
        items.append(Item(time, img_url, tittle, down))
    return items
def init(key):
    """Scrape all result pages for *key*, print the items and return them.

    Args:
        key: Search keyword.

    Returns:
        A flat list of ``Item`` objects gathered from every visited page.
    """
    first_html = gethtml(get_url(0, key))
    page_numbers = getPages(first_html)
    # NOTE(review): the second-to-last match is used as the page count,
    # presumably because of how the pager markup is laid out - confirm
    # against the live site. With fewer than two matches the search is
    # treated as a single page instead of raising IndexError.
    pages = int(page_numbers[-2]) if len(page_numbers) >= 2 else 1
    # Page 0 is already downloaded; parse it rather than dropping its rows.
    items = list(getItems(first_html))
    for i in range(1, pages):
        html = gethtml(get_url(i, key))
        items.extend(getItems(html))
    print(items)
    return items
if __name__ == "__main__":
    # The keyword is fixed for now; replace the literal with an input()
    # call to prompt for an (English) search keyword interactively.
    init('chinese')

可爬取图片、标题、时间和BT下载链接。
尚未编写代理池，也没有应对反爬虫的功能。

** (后续更新) **

增加了bt转磁力链接,可直接对接到mysql存储,代码之后贴出来

posted @ 2018-07-10 17:20 摇橙子阅读(91792) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

Loading

摇橙子's Blog

TECH OTAKUS SAVE THE WORLD

e-hentai爬虫(更新)

公告