汪晓康

导航

爬虫:爬大量妹子图

#!/usr/bin/python
# coding: UTF-8
import requests
import re
import os


def GetLink(link):
	"""Fetch a list page and extract the album links it contains.

	Args:
		link: URL of the list page to download.

	Returns:
		A list of ``(href, title)`` tuples, one for each ``<a>`` tag whose
		``href`` starts with ``/pn.x`` and that carries a ``title`` followed
		by a ``target`` attribute. ``href`` is the site-relative path as it
		appears in the page.
	"""
	browser_headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
	response = requests.get(link, headers=browser_headers, timeout=20)
	# Decode the body with the encoding detected from its content instead of
	# the one announced in the HTTP headers.
	response.encoding = response.apparent_encoding
	return re.findall(
		r'\<a\s+href\=\"(\/pn\.x\S+)\"\s+title\=\"(\S+)\"\s+target',
		response.text,
	)


def GetPicture(link):
	"""Fetch an album page and collect the JPEG URLs found in its HTML.

	Args:
		link: URL of the album page to download.

	Returns:
		A list of strings, each starting with ``https:`` and ending with
		``.jpg``, in the order they occur in the page text.
	"""
	browser_headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
	response = requests.get(link, headers=browser_headers, timeout=20)
	# Decode the body with the encoding detected from its content instead of
	# the one announced in the HTTP headers.
	response.encoding = response.apparent_encoding
	# Example of the markup being matched:
	#   src="https://pic.ssz91.com/d7/3643/364391-1.jpg"
	# NOTE(review): \S+ is greedy, so two URLs with no whitespace between
	# them would be returned as a single match.
	return re.findall(r'(https\:\S+\.jpg)', response.text)


# Crawl list pages 1..188 and save every album's images into a folder named
# after the album title, located next to this script.
link0 = "http://ccw27.com"
link2 = "http://ccw27.com/piclist.x?classid=6&page="
# Resolve the script directory once, as an absolute path, so the existence
# check, the mkdir and the file writes all refer to the same location no
# matter what the current working directory or path separator is.
script_dir = os.path.dirname(os.path.abspath(__file__))

for i in range(1, 189):
	try:
		rs1 = GetLink(link2 + str(i))
	except Exception:
		# Catch Exception rather than everything, so Ctrl-C and SystemExit
		# can still stop the crawl.
		print("执行出错")
		continue

	for each in rs1:
		new_link = link0 + each[0]
		new_title = each[1]
		print(new_link, new_title)
		temp_path = os.path.join(script_dir, new_title)
		# Skip albums whose folder already exists (e.g. from an earlier run).
		if os.path.exists(temp_path):
			continue
		# Errors are isolated per album: one failing album must not abort
		# the remaining albums of the same list page.
		try:
			os.mkdir(temp_path, mode=0o777)
			for j, each2 in enumerate(GetPicture(new_link), start=1):
				picture = requests.get(each2, timeout=20)
				pictureName = os.path.join(temp_path, "picture" + str(j) + ".jpg")
				with open(pictureName, 'wb') as f:
					f.write(picture.content)
				print("爬完了第%d个(第%d页)" % (j, i))
		except Exception:
			print("执行出错")

posted on 2021-08-09 23:14  汪晓康  阅读(60)  评论(0)  编辑  收藏  举报