【爬虫】python+urllib+beautifulsoup爬取花瓣网美女图片
爬取花瓣网的图片
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape pin images from huaban.com's beauty board and save them locally.

Flow: fetch the board listing page, collect every ``/pins/<id>/`` link,
open each pin page, find its ``div.image-holder`` tags, and download the
``<img src>`` they contain into SAVE_DIR.
"""
import contextlib
import os
import re
import urllib.request

from bs4 import BeautifulSoup
import requests

URL = 'http://huaban.com/favorite/beauty/'
HUABAN = 'http://huaban.com'
# Browser-like headers; huaban rejects requests with no User-Agent.
HEADERS = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/58.0.3029.110 Safari/537.36"),
    "Accept": ("text/html,application/xhtml+xml,application/xml;"
               "q=0.9,image/webp,*/*;q=0.8"),
}
SAVE_DIR = r'C:\Users\wuzhi_000\Desktop\Python\py_scrapy\image'


def requestMain():
    """Fetch the board listing page.

    Returns the open ``http.client.HTTPResponse``; the caller is
    responsible for reading and closing it.
    """
    request = urllib.request.Request(URL, headers=HEADERS)
    html_doc = urllib.request.urlopen(request)
    print(html_doc.getcode())
    return html_doc


def getPins():
    """Crawl every pin linked from the listing page and save its images.

    Images are written to ``SAVE_DIR`` as ``<pin#>_<img#>.jpg`` so that a
    pin with several image-holder divs no longer overwrites its own files
    (the original saved every image of a pin under the same name).
    Returns None.
    """
    os.makedirs(SAVE_DIR, exist_ok=True)  # don't assume the folder exists

    listing = requestMain()
    try:
        html_doc = listing.read()
    finally:
        listing.close()  # original leaked this response

    soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf-8')
    pins = soup.find_all('a', href=re.compile(r"/pins/\d+/"))

    for i, pin in enumerate(pins, start=1):
        pin_url = HUABAN + pin['href']
        req = urllib.request.Request(pin_url, headers=HEADERS)
        # closing() guarantees the per-pin response is released even on error
        with contextlib.closing(urllib.request.urlopen(req)) as resp:
            pin_soup = BeautifulSoup(resp, 'html.parser',
                                     from_encoding='utf-8')
        div_tags = pin_soup.find_all('div', class_="image-holder")
        print(i)

        for j, tag in enumerate(div_tags, start=1):
            img = tag.find('img')
            if img is None:
                continue
            src = img.get('src')
            if not src:
                continue
            # Only prepend the scheme for protocol-relative URLs; the
            # original blindly prefixed 'http:' and broke absolute URLs.
            link = 'http:' + src if src.startswith('//') else src
            print(link)

            a = requests.get(link, headers=HEADERS, timeout=30)
            if a.status_code != 200:
                continue  # skip broken image links instead of saving junk
            path = os.path.join(SAVE_DIR, '%d_%d.jpg' % (i, j))
            with open(path, 'wb') as pic:
                pic.write(a.content)


if __name__ == '__main__':
    getPins()