Crawling 小草 (the 1024 forum)
import requests
from bs4 import BeautifulSoup as bs
import re
import os
import threading


def url_open(url):
    # Fetch a page with a desktop-browser User-Agent and a 20-second timeout.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}
    res = requests.get(url, headers=headers, timeout=20)
    return res


def save(url):  # url: link to a single thread (sub-page)
    res = url_open(url)
    res.encoding = 'gbk'
    soup = bs(res.text, 'lxml')
    title = soup.find('title').text.split('-')[0]  # thread title
    # os.mkdir(title)
    # os.chdir(title)
    temp = soup.find_all('tr', class_='tr3')
    # Image URLs live in the data-src attribute of the lazy-loaded <img> tags.
    imglist = re.findall(r'data-src="(.*?jpg)" type', str(temp))

    for each in imglist:
        filename = each.split('/')[-1]
        img_res = url_open(each)
        print('saving... %s' % filename)
        with open(title + filename, 'wb') as f:
            f.write(img_res.content)
    # os.chdir('..')


if __name__ == '__main__':
    os.makedirs('1024', exist_ok=True)
    os.chdir('1024')
    # Only the first listing page is crawled by default -- mind your health;
    # if you need more pages, adding a for loop is no big deal.
    url = 'https://cl.e7s.win/thread0806.php?fid=16&search=&page=1'
    # Links parsed from the listing page are relative and need this prefix.
    # Experience says the prefix changes from time to time; if the script stops
    # working some day, check whether it has changed.
    urlhead = 'https://cl.e7s.win/'
    res = url_open(url)
    res.encoding = 'gbk'

    # Find every thread link on the listing page.
    soup = bs(res.text, 'lxml')
    temp = soup.find_all('td', class_="tal")
    link = []
    for each in temp:
        link.append(urlhead + each.h3.a.get('href'))
    # del link[0:10]

    # Download each thread in its own thread, then wait for all of them to finish.
    downloads = []
    for each in link:
        print(each)
        down = threading.Thread(target=save, args=(each,))
        downloads.append(down)
        down.start()
    for each in downloads:
        each.join()
    print('Done')
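The comment in `__main__` says that crawling more than one listing page only needs a for loop. A minimal sketch of that change, assuming the `page` query parameter simply increments as the URL above suggests; `collect_links` and `num_pages` are names introduced here for illustration and are not part of the original script:

def collect_links(num_pages=3):
    # Collect thread links from the first num_pages listing pages.
    # Assumes the page parameter in the URL just counts up from 1.
    base = 'https://cl.e7s.win/thread0806.php?fid=16&search=&page=%d'
    urlhead = 'https://cl.e7s.win/'
    links = []
    for page in range(1, num_pages + 1):
        res = url_open(base % page)
        res.encoding = 'gbk'
        soup = bs(res.text, 'lxml')
        for each in soup.find_all('td', class_="tal"):
            links.append(urlhead + each.h3.a.get('href'))
    return links

With this helper dropped into the script, `__main__` would build the list with `link = collect_links(3)` and keep the threading loop unchanged.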