Scraping umei.cc (优图) with pyquery

Example

import requests
import os
from requests.packages import urllib3
from pyquery import PyQuery as pq
import re  # used to parse page text with regular expressions
import ssl

os.chdir(r"E:/pics22223/")
def get_url1(url):
    ssl._create_default_https_context = ssl._create_unverified_context  # skip HTTPS certificate checks
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    urllib3.disable_warnings()  # silence the InsecureRequestWarning caused by verify=False
    response = requests.get(url, headers=headers, verify=False)
    response.encoding = response.apparent_encoding
    html = response.text
    doc = pq(html)
    a = doc('.TypeList .TypeBigPics')  # one link per photo gallery on the list page
    for item in a.items():
        lists = []  # page URLs belonging to this gallery
        b = item.attr('href')
        lists.append(b)
        response2 = requests.get(b, headers=headers, verify=False)
        response2.encoding = response2.apparent_encoding
        html2 = response2.text
        doc2 = pq(html2)
        # The first <li> of the pager reads "共N页:"; pull out N as the page count
        text = doc2('body > div.wrap > div.NewPages > ul > li:nth-child(1) > a').text()
        group = re.search(r'[\d]+', text).group()
        for item2 in range(2, int(group) + 1):  # pages 2..N; page 1 is the gallery URL itself
            htm_ = b.replace(".htm", '') + '_' + str(item2) + '.htm'
            lists.append(htm_)
        # Keep only the CJK part of the gallery title to use as the directory name
        title = re.search(r'[\u4e00-\u9fa5]+', doc2('body > div.wrap > div.ArticleTitle > strong').text()).group()
        for element in lists:
            response3 = requests.get(element, headers=headers, verify=False)
            response3.encoding = response3.apparent_encoding
            html3 = response3.text
            doc3 = pq(html3)
            attr = doc3('#ArticleId0 > p > a > img').attr('src')
            path = str(title) + '/' + attr.split('/')[-1]
            # Directory name plus the last '/'-separated segment of the image URL,
            # so each file keeps its original name locally
            try:
                if not os.path.exists(str(title)):  # create the gallery directory if missing
                    os.mkdir(str(title))
                if not os.path.exists(path):  # skip files that were already downloaded
                    r = requests.get(attr)
                    with open(path, 'wb') as f:
                        f.write(r.content)
                    print("File saved", '\n')
                else:
                    print("File already exists")
            except Exception:
                print("Download failed")

if __name__ == '__main__':
    url = 'https://www.umei.cc/p/gaoqing/cn/'
    for i in range(21, 26):  # list pages 21..25 of the category
        url1 = url + str(i) + '.htm'
        get_url1(url1)
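
Both scripts below rely on the same umei.cc pagination convention: page 1 of a gallery lives at <id>.htm and each following page at <id>_N.htm. A minimal sketch of that URL rewrite (the sample URL in the comment is made up for illustration):

def page_url(first_page_url, n):
    # Page 1 keeps the original URL; page N inserts "_N" before ".htm"
    if n == 1:
        return first_page_url
    return first_page_url.replace('.htm', '') + '_' + str(n) + '.htm'

# page_url('https://www.umei.cc/p/gaoqing/cn/12345.htm', 3)
# -> 'https://www.umei.cc/p/gaoqing/cn/12345_3.htm'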

Example: scrape --> clipboard --> send

# -*- coding:utf-8 -*-
import os
import re
import requests
import time
import win32api
import win32con
from PIL import Image
from io import BytesIO
import win32clipboard

os.chdir(r"E:/ntmssFile/umei/")
def paste_img(file_img):
    """
    Convert an image to binary data and write it to the clipboard as a
    bitmap. The idea: open the image with the Image module, then use
    BytesIO to hold the converted binary data.
    :param file_img: path to the image
    """
    # Load the image into the image variable;
    # after Image.open() the object is typically in RGB mode
    image = Image.open(file_img)
    # Byte buffer for the converted data
    output = BytesIO()
    # Save into the buffer in BMP (bitmap) format
    image.save(output, 'BMP')
    # A BMP stream starts with a 14-byte header that must be stripped off
    data = output.getvalue()[14:]
    # Release the buffer
    output.close()
    # DIB: device-independent bitmap, as the name suggests.
    # BMP files sometimes also carry the .DIB or .RLE extension.
    # The clipboard format must be declared before the matching data is
    # written, otherwise the clipboard cannot interpret it correctly.
    send_msg_to_clip(win32clipboard.CF_DIB, data)
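

# (Added sketch, not part of the original post.) A quick check of the
# 14-byte strip above: a BMP stream always starts with a 14-byte
# BITMAPFILEHEADER whose magic is b'BM'; CF_DIB expects the data to
# begin at the BITMAPINFOHEADER that follows it.
def check_dib_bytes(file_img):
    image = Image.open(file_img)
    output = BytesIO()
    image.save(output, 'BMP')
    raw = output.getvalue()
    assert raw[:2] == b'BM'  # BITMAPFILEHEADER magic
    return raw[14:]          # the exact slice paste_img() puts on the clipboard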


def send_msg_to_clip(type_data, msg):
    """
    Working with the clipboard takes four steps:
    1. Open it: OpenClipboard()
    2. Empty it so new data can be written: EmptyClipboard()
    3. Write the data: SetClipboardData()
    4. Close it: CloseClipboard()
    :param type_data: clipboard data format; for unicode text this is
        usually win32con.CF_UNICODETEXT
    :param msg: the data to write to the clipboard
    """
    win32clipboard.OpenClipboard()
    win32clipboard.EmptyClipboard()
    win32clipboard.SetClipboardData(type_data, msg)
    win32clipboard.CloseClipboard()
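

# (Added sketch.) As the docstring above notes, the same four-step
# routine can carry plain text: pass win32con.CF_UNICODETEXT and a str.
def paste_text(text):
    send_msg_to_clip(win32con.CF_UNICODETEXT, text)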


def pasteInfo():
    win32api.keybd_event(17, 0, 0, 0)  # 17 is the virtual-key code for Ctrl
    win32api.keybd_event(86, 0, 0, 0)  # 86 is the code for V
    win32api.keybd_event(86, 0, win32con.KEYEVENTF_KEYUP, 0)  # release V
    win32api.keybd_event(17, 0, win32con.KEYEVENTF_KEYUP, 0)  # release Ctrl
    win32api.keybd_event(13, 0, 0, 0)  # 13 is Enter
    win32api.keybd_event(13, 0, win32con.KEYEVENTF_KEYUP, 0)  # release Enter
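

# (Added sketch.) Equivalent to pasteInfo(), but using the named
# virtual-key constants from win32con instead of the magic numbers
# 17 (Ctrl), 86 (V) and 13 (Enter):
def paste_info_named():
    win32api.keybd_event(win32con.VK_CONTROL, 0, 0, 0)
    win32api.keybd_event(ord('V'), 0, 0, 0)
    win32api.keybd_event(ord('V'), 0, win32con.KEYEVENTF_KEYUP, 0)
    win32api.keybd_event(win32con.VK_CONTROL, 0, win32con.KEYEVENTF_KEYUP, 0)
    win32api.keybd_event(win32con.VK_RETURN, 0, 0, 0)
    win32api.keybd_event(win32con.VK_RETURN, 0, win32con.KEYEVENTF_KEYUP, 0)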


def crawl(start_url, req_headers):
    try:
        res = requests.get(start_url, headers=req_headers)
        content = res.content.decode("utf8")
        # Every gallery on the list page is an <a class="TypeBigPics"> link
        pattern_href = re.compile(r'<li>.*?<a href="(.*?)" class="TypeBigPics" .*?>.*?</li>', flags=re.DOTALL)
        hrefs = re.findall(pattern_href, content)
        for href in hrefs:
            res = requests.get(href, headers=req_headers)
            content_href = res.content.decode("utf8")
            pattern_title = re.compile(r'<strong>(.*)</strong>', flags=re.DOTALL)
            title = re.search(pattern_title, content_href).group(1)
            if not os.path.exists(title):
                os.makedirs(title)
            # The pager announces the total as "共N页:"
            total_compile = re.compile(r'<li><a>共(\d*)页: </a></li>', flags=re.DOTALL)
            total_page = re.search(total_compile, content_href).group(1)
            for page in range(2, int(total_page) + 1):  # pages 2..N; page 1 is handled below
                sub = re.sub(r'\.htm', '', href)
                url_page = sub + '_' + str(page) + '.htm'
                res_ = requests.get(url_page, headers=req_headers)
                content_ = res_.content.decode("utf8")
                photo_compile = re.compile(r'<p align="center">.*?<a .*?>.*?<img alt=".*" src="(.*?)" /></a>', flags=re.DOTALL)
                photo_url = re.search(photo_compile, content_).group(1)
                img_ = requests.get(photo_url)
                file_path = '{}/{}.{}'.format(title, page, 'jpg')
                with open(file_path, 'wb') as f:
                    f.write(img_.content)
                current_dir = os.getcwd()
                time.sleep(5)  # give the target window time before pasting
                abs_path = current_dir + '/' + file_path
                abs_path = abs_path.replace("\\", '/')
                paste_img(abs_path)  # copy the saved image to the clipboard
                time.sleep(5)
                pasteInfo()  # Ctrl+V then Enter in the focused window
            # Finally grab page 1's image, which lives at the gallery URL itself
            res_1 = requests.get(href, headers=req_headers)
            content_ = res_1.content.decode("utf8")
            photo_compile = re.compile(r'<p align="center">.*?<a .*?>.*?<img alt=".*" src="(.*?)" /></a>', flags=re.DOTALL)
            photo_url = re.search(photo_compile, content_).group(1)
            img_ = requests.get(photo_url)
            file_path = '{}/{}.{}'.format(title, '1', 'jpg')
            with open(file_path, 'wb') as f:
                f.write(img_.content)
    except Exception:
        print("Crawl failed")


if __name__ == '__main__':
    start_url = 'https://www.umei.cc/p/gaoqing/cn/'
    req_headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    res = requests.get(start_url, headers=req_headers)
    content = res.content.decode("utf8")
    # The 末页 ("last page") link's href carries the total number of list pages
    count_com = re.compile(r"<li><a href='(\d+).htm'>末页</a></li>", flags=re.DOTALL)
    count = re.search(count_com, content).group(1)
    for i in range(1, int(count) + 1):
        url = start_url + str(i) + '.htm'
        crawl(url, req_headers)
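
Note that pasteInfo() types Ctrl+V and Enter into whichever window currently has focus, so the target chat window must be in the foreground during the time.sleep(5) pauses; that is what the two delays around paste_img() are for.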

 
