全自动爬取壁纸
如图
代码:
1 import requests 2 import re 3 from lxml import html 4 import _thread 5 6 7 class wallpaper: 8 def __init__(self): 9 ''' 10 初始化 11 ''' 12 self.url = "http://simpledesktops.com/" 13 self.req_header = { 14 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36', 15 } 16 self.S = requests.session() 17 18 def home(self, num): 19 ''' 20 访问首页 21 ''' 22 self.numx = num 23 X = self.S.get(self.url+f"browse/{num}") 24 self.context = X.text 25 26 def url_all(self): 27 ''' 28 获取所有子页链接 29 ''' 30 tree = html.fromstring(self.context) 31 self.res = tree.xpath('//div[@class="desktop"]/a/@href') 32 print(self.res) 33 34 def Single(self, url): 35 ''' 36 访问子页,下载图片 37 ''' 38 url = url 39 O = self.S.get(self.url+url) 40 Otree = html.fromstring(O.text) 41 res = Otree.xpath('//div[@class="desktop"]/a/@href') 42 title = Otree.xpath('//h2/a/text()') 43 print(f"第{self.numx}页_标题:"+title[0]) 44 cs = self.S.get(self.url+res[0]) 45 print("链接:"+self.url+res[0]) 46 with open(f'img/{self.numx}_{title[0]}.png', 'wb') as f: 47 f.write(cs.content) 48 49 def dowlod(self): 50 ''' 51 下载所有图片 52 ''' 53 for i in self.res: 54 self.Single(url=i) 55 56 57 def job(IN, OUT): 58 ''' 59 线程函数 60 ''' 61 for num in range(IN, OUT): 62 A = wallpaper() 63 A.home(num=num) 64 A.url_all() 65 A.dowlod() 66 67 68 try: # 多线程分段下载 69 _thread.start_new_thread(job, (5, 10, )) 70 _thread.start_new_thread(job, (14, 20, )) 71 _thread.start_new_thread(job, (23, 30, )) 72 _thread.start_new_thread(job, (32, 40, )) 73 _thread.start_new_thread(job, (41, 51, )) 74 except: 75 print("Error: 无法启动线程") 76 77 78 while 1: # 阻塞程序 79 pass