# Learning project: crawling a small set of images (目前学习的爬取小数据图片)
import os
import re
import threading
import time

# Shared state, guarded by g_lock.
all_img_urls = []  # {name: image_url} dicts produced by Producer threads
all_urls = []      # listing-page URLs still waiting to be crawled
g_lock = threading.Lock()
producers_done = threading.Event()  # set once every Producer thread has finished

# NOTE(review): the original code also sent "Host: eclick.baidu.com", which is
# wrong for xiaohuar.com and would break the requests; `requests` fills in the
# correct Host header automatically.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
    ),
}


class Spider(object):
    """Builds the list of listing-page URLs to crawl."""

    def __init__(self, target_url, headers):
        self.target_url = target_url  # URL template with a %d page-number slot
        self.headers = headers

    def getUrls(self, start_page, page_num):
        """Append listing-page URLs for pages [start_page, page_num) to all_urls."""
        for i in range(start_page, page_num):
            all_urls.append(self.target_url % i)


class Producer(threading.Thread):
    """Pops listing pages from all_urls and extracts {name: img_url} entries."""

    def run(self):
        # Imported here so the module can be imported (and Spider tested)
        # without these third-party packages installed.
        import requests
        from lxml import etree

        while True:
            # Hold the lock only while touching the shared list, so other
            # threads never pop the same URL.
            with g_lock:
                if not all_urls:
                    break  # nothing left to crawl
                url = all_urls.pop()
            # headers must be passed by keyword: positionally it would be
            # interpreted as the `params` argument of requests.get().
            html = requests.get(url, headers=HEADERS).text
            selector = etree.HTML(html)
            for item in selector.xpath("//div[@class='item_t']"):
                links = item.xpath("div[@class='img']/a/img/@src")
                names = item.xpath("div[@class='img']/span/text()")
                if not links or not names:
                    continue  # skip malformed entries instead of raising IndexError
                img_link = links[0]
                # Some scraped links are site-relative; make them absolute.
                if img_link.startswith("/"):
                    img_link = "http://www.xiaohuar.com" + img_link
                with g_lock:
                    all_img_urls.append({names[0]: img_link})


class DownPic(threading.Thread):
    """Pops {name: url} entries from all_img_urls and saves the images to disk."""

    def run(self):
        import requests  # see note in Producer.run

        while True:
            with g_lock:
                if all_img_urls:
                    img = all_img_urls.pop()
                elif producers_done.is_set():
                    break  # producers finished and the queue is drained
                else:
                    img = None
            if img is None:
                time.sleep(0.1)  # avoid busy-waiting while producers still work
                continue
            for name, value in img.items():
                path = "xiaohua/%s.jpg" % name
                response = requests.get(value, headers=HEADERS)
                # `with` closes the file; no explicit close() needed.
                with open(path, "wb") as f:
                    f.write(response.content)


def main():
    """Crawl 14 listing pages with 10 producer and 10 downloader threads."""
    os.makedirs("xiaohua", exist_ok=True)  # download target directory

    target_url = "http://www.xiaohuar.com/list-1-%d.html"  # listing-URL template
    spider = Spider(target_url, HEADERS)
    spider.getUrls(0, 14)

    # start(), not run(): calling run() directly would execute everything
    # sequentially on the main thread instead of concurrently.
    producers = [Producer() for _ in range(10)]
    for t in producers:
        t.start()

    downloaders = [DownPic() for _ in range(10)]
    for t in downloaders:
        t.start()

    for t in producers:
        t.join()
    producers_done.set()  # lets downloaders exit once the queue is empty
    for t in downloaders:
        t.join()


if __name__ == "__main__":
    main()