Python web crawler

Created 2018/2/22

 

# -*- coding: utf-8 -*-
import threading
import urllib.request
import traceback
import requests
import re
import os
import time

# 正则列表
img_url_partern = '<div class="content" id="content"><a ><img src="(.*?)" alt='
max_page_partern = '<em class="info">共(\d+)页</em>'
img_name_partern = 'alt="(.*?)" /></a><span'
tag_partern = '<a href="(http:\/\/www\.mmjpg\.com\/tag\/\w+)">'
partern_1 = '<a href="(http:\/\/www\.mmjpg\.com\/\w+\/\d+)" target="_blank">'

headers = {'Referer': "http://www.mmjpg.com/mm/1013/5"}
session = requests.session()

def down_img(url_1):

html_data = session.get(url_1, headers=headers, timeout=5)
print(html_data.status_code)
html_data.encoding = 'utf-8'
if html_data.status_code !=200:
print(233)
return
html_data.encoding = 'utf-8'
html_data = html_data.text
# 读取每组照片名
href_ = list(set(re.compile(partern_1).findall(html_data)))
print((href_))
for i in href_:
headers_ = {'Referer': i}
#keys = requests.get('http://www.mmjpg.com/data.php?id={0}&page=8999'.format(i.split('/')[-1]), headers=headers_)
#keys_ = ['i' + zz for zz in (keys.text.split(','))]
#print(keys_)
ts = []
for k in range(1,50):
t = threading.Thread(target=down_pic, args=(i,k))
ts.append(t)
for kk in ts:
kk.start()
for kk in ts:
kk.join()
time.sleep(1)

def down_pic(i,k):
headers_ = {}
headers_.update({'Referer': i + '/' + str(k)})
img = session.get('http://img.mmjpg.com/2018/{0}/{1}.jpg'.format(i.split('/')[-1], k),
timeout=5, headers=headers_)
print(img.status_code,
'http://img.mmjpg.com/2018/{0}/{1}.jpg'.format(i.split('/')[-1], k))

if img.status_code != 200:
return
save_name = r"img/%d" % int(time.time()) + r'.jpg'
with open(save_name, 'wb') as f:
f.write(img.content)
time.sleep(1)

pass

def main():
basic_url = r'http://www.mmjpg.com/'
while 1:
try:
ret = session.get(basic_url, timeout=5, headers=headers)
print(ret.status_code)
if ret.status_code == 200:
ret.encoding = 'utf-8'
print(ret.text)
tag_list = (re.compile(tag_partern).findall(ret.text))
break
except:
print(traceback.format_exc())
#traceback.format_exc()
#continue
# 网站目前有最多1263组图片,for循环全部遍历
for k in tag_list:
for x in range(1, 1264):
url = k +'/'+ str(x)
try:
down_img(url)

except:
print(traceback.format_exc())
pass

#print('[-]The Url:' + url + ' Was No Found!')


if __name__ == '__main__':
main()
 # Some images on this site follow different download-URL rules and fail to download; a dedicated rule is needed — unresolved.  update by 2018/2/22
 

posted on 2018-02-22 22:48  0o0o0o0o0o000  阅读(135)  评论(0编辑  收藏  举报

导航