Python web crawler

Created 2018/2/22

 

# -*- coding: utf-8 -*-
import threading
import urllib.request
import traceback
import requests
import re
import os
import time

# 正则列表
img_url_partern = '<div class="content" id="content"><a ><img src="(.*?)" alt='
max_page_partern = '<em class="info">共(\d+)页</em>'
img_name_partern = 'alt="(.*?)" /></a><span'
tag_partern = '<a href="(http:\/\/www\.mmjpg\.com\/tag\/\w+)">'
partern_1 = '<a href="(http:\/\/www\.mmjpg\.com\/\w+\/\d+)" target="_blank">'

headers = {'Referer': "http://www.mmjpg.com/mm/1013/5"}
session = requests.session()

def down_img(url_1):

html_data = session.get(url_1, headers=headers, timeout=5)
print(html_data.status_code)
html_data.encoding = 'utf-8'
if html_data.status_code !=200:
print(233)
return
html_data.encoding = 'utf-8'
html_data = html_data.text
# 读取每组照片名
href_ = list(set(re.compile(partern_1).findall(html_data)))
print((href_))
for i in href_:
headers_ = {'Referer': i}
#keys = requests.get('http://www.mmjpg.com/data.php?id={0}&page=8999'.format(i.split('/')[-1]), headers=headers_)
#keys_ = ['i' + zz for zz in (keys.text.split(','))]
#print(keys_)
ts = []
for k in range(1,50):
t = threading.Thread(target=down_pic, args=(i,k))
ts.append(t)
for kk in ts:
kk.start()
for kk in ts:
kk.join()
time.sleep(1)

def down_pic(i,k):
headers_ = {}
headers_.update({'Referer': i + '/' + str(k)})
img = session.get('http://img.mmjpg.com/2018/{0}/{1}.jpg'.format(i.split('/')[-1], k),
timeout=5, headers=headers_)
print(img.status_code,
'http://img.mmjpg.com/2018/{0}/{1}.jpg'.format(i.split('/')[-1], k))

if img.status_code != 200:
return
save_name = r"img/%d" % int(time.time()) + r'.jpg'
with open(save_name, 'wb') as f:
f.write(img.content)
time.sleep(1)

pass

def main():
basic_url = r'http://www.mmjpg.com/'
while 1:
try:
ret = session.get(basic_url, timeout=5, headers=headers)
print(ret.status_code)
if ret.status_code == 200:
ret.encoding = 'utf-8'
print(ret.text)
tag_list = (re.compile(tag_partern).findall(ret.text))
break
except:
print(traceback.format_exc())
#traceback.format_exc()
#continue
# 网站目前有最多1263组图片,for循环全部遍历
for k in tag_list:
for x in range(1, 1264):
url = k +'/'+ str(x)
try:
down_img(url)

except:
print(traceback.format_exc())
pass

#print('[-]The Url:' + url + ' Was No Found!')


if __name__ == '__main__':
main()
 # Some images on this site follow different download-URL rules and fail to download; a dedicated rule is needed — unresolved.  update by 2018/2/22
 

posted on 2018-02-22 22:48  0o0o0o0o0o000  阅读(135)  评论(0编辑  收藏  举报

导航