Web Scraping: Downloading Pretty-Girl Wallpapers
1. Sometimes I want some nice wallpapers, but downloading them one by one is way too slow. Rookie programmer that I am, I do know a bit of web scraping, and when it comes to scraping, Python is still the tool of choice. So let's get to it.
import os
import re
import threading

import requests
from lxml import etree

BASE_DIR = "/Volumes/HD2/downloadpic/"

def getUrl(page):
    """Collect [detail-page URL, album title] pairs from one search-result page."""
    url_list = []
    page_url = "https://www.bizhizu.cn/search/动漫/" + str(page) + ".html"
    page_res = requests.get(page_url).text
    html = etree.HTML(page_res)
    for i in range(1, 19):  # each result page holds up to 18 thumbnails
        html_data = html.xpath('//*[@class="imgcont"]/ul/li[' + str(i) + ']/a[1]/@href')
        name = html.xpath('//*[@class="imgcont"]/ul/li[' + str(i) + ']/a[2]/text()')
        if len(html_data) > 0 and len(name) > 0:
            url_list.append([html_data[0], name[0]])
    print(url_list)
    return url_list

def saveImg(url, path_name):
    # e.g. url = "https://www.bizhizu.cn/pic/62690.html"
    res = requests.get(url).text
    html = etree.HTML(res)
    # only keep albums whose title contains one of these keywords -- you know what I mean
    if ("美" in path_name) or ("性" in path_name) or ("感" in path_name) or ("少女" in path_name):
        r_name = path_name.replace("/", "-")  # titles may contain "/", which would break the file path
        if not os.path.exists(BASE_DIR + r_name):
            os.mkdir(BASE_DIR + r_name)
        for i in range(1, 10):
            html_data = html.xpath('//*[@id="thumb"]/li[' + str(i) + ']/a/img/@src')
            if len(html_data) > 0:
                # thumbnail URLs end in ".220.146.jpg"; strip that suffix to get the
                # base URL, then append ".source.jpg" to request the full-size image
                img_url = re.findall(r"(https://.*?\.jpg)\.220\.146\.jpg", html_data[0])
                if len(img_url) > 0:
                    img = requests.get(img_url[0] + ".source.jpg")
                    img_url_name = re.findall(r"https://uploadfile\.bizhizu\.cn/up/.*/.*/.*/(.*?)\.jpg\.220\.146\.jpg", html_data[0])
                    # images are binary, so mode must include "b"; "wb" (not "ab")
                    # so that a re-run overwrites instead of corrupting the file
                    with open(BASE_DIR + r_name + "/" + img_url_name[0] + ".jpg", "wb") as f:
                        f.write(img.content)
                    print("Saved:", path_name + "/" + img_url_name[0] + ".jpg")

def demo1():
    for x in range(1, 18):
        for url, name in getUrl(x):
            saveImg(url, name)

def demo2():
    for x in range(18, 37):
        for url, name in getUrl(x):
            saveImg(url, name)

if __name__ == '__main__':
    # split the 36 search pages across two threads
    t1 = threading.Thread(target=demo1)
    t2 = threading.Thread(target=demo2)
    t1.start()
    t2.start()
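Hand-splitting the pages into demo1 and demo2 stops scaling past two threads. The same fan-out can be written with the standard-library concurrent.futures module instead; here is a minimal sketch assuming the getUrl/saveImg functions above (the crawl_page helper name and the worker count of 4 are my own picks, not part of the original script):

from concurrent.futures import ThreadPoolExecutor

def crawl_page(page):
    # one unit of work: scrape a single search-result page (hypothetical helper)
    for url, name in getUrl(page):
        saveImg(url, name)

if __name__ == '__main__':
    # the pool replaces demo1/demo2; the with-block waits for all pages to finish
    with ThreadPoolExecutor(max_workers=4) as pool:
        list(pool.map(crawl_page, range(1, 37)))  # list() re-raises worker exceptions here

Consuming the map iterator with list() makes any exception raised inside a worker surface in the main thread, instead of only appearing as a traceback on stderr the way it does with bare Thread objects.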
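One caveat: the bare requests.get calls above send the default python-requests User-Agent and have no timeout, so a single stalled connection can hang a thread forever. A small hardening sketch, assuming a shared Session is acceptable here (the header value, the 10-second timeout, and the single retry are arbitrary choices of mine, not something the site is known to require):

import requests

session = requests.Session()  # reuses TCP connections across the many image downloads
session.headers.update({"User-Agent": "Mozilla/5.0"})  # some hosts reject the default UA

def fetch(url):
    # timeout so a stalled socket cannot hang a thread; one retry on transient failure
    for attempt in range(2):
        try:
            resp = session.get(url, timeout=10)
            resp.raise_for_status()
            return resp
        except requests.RequestException:
            if attempt == 1:
                raise

Swapping each requests.get(...) in getUrl and saveImg for fetch(...) would be the only change needed.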