1-Python - 爬取美女图片

校花网

校花网官网

pip install requests
pip install beautifulsoup4

爬取单页

import os
import requests
from bs4 import BeautifulSoup

# Directory containing this script; downloaded images are stored beneath it.
base_dir = os.path.dirname(os.path.abspath(__file__))


def spider():
    """Download the images linked from one listing page of nice.ruyile.com.

    For every entry on the listing page the detail page is fetched, and each
    image found there is written to ``<base_dir>/img_list/<entry title>/``
    under the file name taken from the last segment of the image URL.
    """
    response = requests.get(url='https://nice.ruyile.com/?f=2')
    soup = BeautifulSoup(response.text, 'html.parser')
    # Outer <div> that wraps all image entries on the listing page.
    all_content = soup.find(name='div', attrs={'class': 'm3_xhtp'})
    tag_list = all_content.find_all(name='div', attrs={'class': 'tp_list'})
    # Site root, used to turn the entries' hrefs into full URLs.
    site_root = response.url.split('/?')[0]
    for item in tag_list:
        # The second <a> of an entry supplies both the title text and the
        # link to the detail page.
        res = item.find_all(name='a')[1]
        a_content_file_path = res.text
        a_url = site_root + res.get('href')  # e.g. https://nice.ruyile.com/r16604/
        # Fetch the detail page of this entry.
        girl_details = requests.get(url=a_url)
        girl_soup = BeautifulSoup(girl_details.text, 'html.parser')
        # The second "m6_js" <div> is searched for <p> elements wrapping images.
        img_all_div = girl_soup.find_all(name='div', attrs={'class': 'm6_js'})[1]
        img_list = img_all_div.find_all(name='p')
        file_path = os.path.join(base_dir, 'img_list', a_content_file_path)
        # exist_ok lets the script be re-run without raising FileExistsError.
        os.makedirs(file_path, exist_ok=True)
        for i in img_list:
            img_tag = i.find(name='img')
            if img_tag is None or not img_tag.get('src'):
                continue  # paragraph without a usable image
            img_src = img_tag.get('src')
            img_content = requests.get(url=img_src)
            with open(os.path.join(file_path, img_src.rsplit('/', 1)[-1]), 'wb') as f:
                f.write(img_content.content)


# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    spider()

7160图片大全

http://www.7160.com/xiaohua/list_6_1.html

import os
import requests
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup

# Absolute path of the directory containing this script.
BASE_PATH = os.path.dirname(os.path.abspath(__file__))

def spider(line):
    """Download every image listed on one page of the 7160.com gallery.

    :param line: page number substituted into the ``list_6_{}.html`` URL.

    Each image is saved to ``<BASE_PATH>/a/`` under the file name taken from
    the last segment of its URL.
    """
    response = requests.get(url='http://www.7160.com/xiaohua/list_6_{}.html'.format(line))
    # Decode the page as GBK rather than the encoding guessed by requests.
    response.encoding = 'GBK'
    soup = BeautifulSoup(response.text, 'html.parser')
    div = soup.find(name='div', attrs={'class': "news_bom-left"})
    target_dir = os.path.join(BASE_PATH, 'a')
    # Create the output directory up front; exist_ok keeps this safe when the
    # function is called from several threads at once.
    os.makedirs(target_dir, exist_ok=True)
    for li in div.find_all(name='li'):
        img = li.find('img')
        if img is None or not img.get('src'):
            continue  # list item without a usable image
        a_url = img.get('src')
        print(response.url, a_url)
        # Download first so a failed request does not leave an empty file.
        res = requests.get(a_url)
        path = os.path.join(target_dir, a_url.rsplit('/', 1)[-1])
        with open(path, mode='wb') as f:
            f.write(res.content)
def run():
    """Crawl listing pages 1-10 concurrently on a pool of 10 threads.

    Blocks until every page has been processed, then prints one line for
    each page whose crawl raised an exception.
    """
    with ThreadPoolExecutor(10) as pool:
        futures = {page: pool.submit(spider, page) for page in range(1, 11)}
    # Leaving the ``with`` block waits for all tasks. Inspect each future so
    # that failures inside worker threads are not silently discarded.
    for page, future in futures.items():
        error = future.exception()
        if error is not None:
            print('page {} failed: {!r}'.format(page, error))


# Start the threaded crawl only when this file is executed as a script.
if __name__ == '__main__':
    run()

天极网

天极网明星图片:http://pic.yesky.com/c/6_243_1.shtml

import os
import requests
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup

# Absolute path of the directory containing this script.
BASE_PATH = os.path.dirname(os.path.abspath(__file__))


def worker(a_url, title):
    """Download the images found in the "overview" section of one album page.

    :param a_url: URL of the album page to fetch.
    :param title: album title; images are stored in ``<BASE_PATH>/b/<title>/``.
    """
    response = requests.get(a_url)
    # Decode the page as GBK rather than the encoding guessed by requests.
    response.encoding = 'GBK'
    soup = BeautifulSoup(response.text, 'html.parser')
    div = soup.find(name='div', attrs={'class': "overview"})
    album_dir = os.path.join(BASE_PATH, 'b', title)
    # Make sure the target directory exists even if the caller did not
    # create it; exist_ok makes this a no-op when it is already there.
    os.makedirs(album_dir, exist_ok=True)
    for item in div.find_all(name='img'):
        src = item.get('src')
        if not src:
            continue  # <img> without a source URL
        print(a_url, src)
        # The "113x113" part of the URL is swapped for "740x-" before the
        # download (presumably selecting a larger rendition -- confirm).
        # Download first so a failed request does not leave an empty file.
        res = requests.get(src.replace('113x113', '740x-'))
        path = os.path.join(album_dir, src.rsplit('/', 1)[-1])
        with open(path, 'wb') as f:
            f.write(res.content)

def spider(line):
    """Process one listing page of pic.yesky.com and download each album on it.

    :param line: page number substituted into the ``6_243_{}.shtml`` URL.

    For every ``<dd>`` entry in the "lb_box" section, the album directory
    ``<BASE_PATH>/b/<title>/`` is created and ``worker`` is called to fetch
    the album's images.
    """
    response = requests.get(url='http://pic.yesky.com/c/6_243_{}.shtml'.format(line))
    soup = BeautifulSoup(response.text, 'html.parser')
    div = soup.find(name='div', attrs={'class': "lb_box"})
    for dd in div.find_all(name='dd'):
        link = dd.find('a')
        if link is None:
            continue  # entry without a link
        a_url, title = link.get("href"), link.get("title")
        if not a_url or not title:
            continue  # cannot build a URL or a directory name
        # makedirs also creates the parent "b" directory, and exist_ok avoids
        # the check-then-create race when several threads run this function.
        os.makedirs(os.path.join(BASE_PATH, 'b', title), exist_ok=True)
        worker(a_url, title)

def run():
    """Crawl listing pages 1-10 concurrently on a pool of 10 threads.

    Blocks until every page has been processed, then prints one line for
    each page whose crawl raised an exception.
    """
    with ThreadPoolExecutor(10) as pool:
        futures = {page: pool.submit(spider, page) for page in range(1, 11)}
    # Leaving the ``with`` block waits for all tasks. Inspect each future so
    # that failures inside worker threads are not silently discarded.
    for page, future in futures.items():
        error = future.exception()
        if error is not None:
            print('page {} failed: {!r}'.format(page, error))


# Start the threaded crawl only when this file is executed as a script.
if __name__ == '__main__':
    run()
posted @ 2019-03-02 01:59  听雨危楼  阅读(489)  评论(0)  编辑  收藏  举报