昆仑山:眼中无形心中有穴之穴人合一

夫君子之行,静以修身,俭以养德;非澹泊无以明志,非宁静无以致远。夫学须静也,才须学也;非学无以广才,非志无以成学。怠慢则不能励精,险躁则不能冶性。年与时驰,意与岁去,遂成枯落,多不接世。悲守穷庐,将复何及!

 

爬虫:抓图片

# -*- coding: UTF-8 -*-
import feedparser
import requests
from lxml import etree
import threading
import random
import os


def get_url():
    """Fetch the site's RSS feed and return up to 20 gallery links.

    Returns:
        list[str]: the ``link`` field of the first 20 feed entries
        (fewer if the feed contains fewer entries).
    """
    rss_url = 'https://www.mzitu.com/feed/'
    feeds = feedparser.parse(rss_url)

    # Slice defensively: the original hard-coded range(20) raised
    # IndexError whenever the feed had fewer than 20 entries.
    return [entry['link'] for entry in feeds.entries[:20]]


def download(dirname, imgurl):
    """Download a single image into ``dirname``.

    Args:
        dirname: target directory, created if missing.
        imgurl: direct image URL; a Referer header is sent because
            the site rejects hot-linked requests without one.
    """
    headers = {
        'referer': 'https://www.mzitu.com/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }

    filename = imgurl.split('/')[-1]

    # makedirs(exist_ok=True) replaces the racy exists()/mkdir() check —
    # several downloader threads may target the same directory at once —
    # and removes the duplicated write logic of the original two branches.
    os.makedirs(dirname, exist_ok=True)

    # Close the response when done (stream=True keeps the connection
    # open otherwise), and fail loudly instead of saving an error page.
    with requests.get(imgurl, headers=headers, stream=True) as r:
        r.raise_for_status()
        with open(os.path.join(dirname, filename), 'wb') as f:
            # 32-byte chunks cause one write per 32 bytes; 8 KiB is sane.
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
        print('下载:%s中' % filename)


def get_img(url):
    """Scrape one gallery: read its title and page count, download every image.

    Args:
        url: gallery landing page. Page 1 is ``url`` itself; subsequent
            image pages live at ``url``/2, ``url``/3, ...
    """
    r = requests.get(url)
    page = etree.HTML(r.text)
    # Last pagination <span> holds the total page count; <h2> holds the
    # gallery title (used as the download directory name).
    span = page.xpath('/html/body/div[2]/div[1]/div[4]/a[5]/span')
    hs = page.xpath('//h2[@class="main-title"]')
    if not hs or not span:
        # Layout changed or the page failed to render — the original
        # would hit UnboundLocalError here and silently swallow it.
        return
    title = hs[-1].text
    pages = span[-1].text
    try:
        # Start at 1: the original range started at 0, requesting a
        # nonexistent "/0" page and never downloading page 1 at all.
        for i in range(1, int(pages) + 1):
            imgpage = url if i == 1 else url + '/' + str(i)
            r1 = requests.get(imgpage)
            page1 = etree.HTML(r1.text)
            for img in page1.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img'):
                imgurl = img.get('src')
                if imgurl:
                    download(title, imgurl)
    except KeyboardInterrupt:
        # Propagate Ctrl-C instead of suppressing it.
        raise
    except (requests.RequestException, ValueError, OSError):
        # Best-effort scrape: skip this gallery on network/parse/file
        # errors, but no longer hide *every* exception behind a bare
        # `except: pass`.
        pass


def main():
    """Spawn one scraper thread per gallery URL and wait for all of them."""
    workers = [
        threading.Thread(target=get_img, args=(link,))
        for link in get_url()
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

posted on 2019-04-21 20:37  Indian_Mysore  阅读(124)  评论(0编辑  收藏  举报

导航