第六篇 - bs4爬取校花网
环境:python3 pycharm
模块:requests bs4 urllib.request(urlretrieve) os time
第一步:获取网页源代码
import os
import time
from urllib.request import urlretrieve

import requests
from bs4 import BeautifulSoup


def get_html(url):
    """Fetch *url* and return the response body decoded as GBK text.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, or ``None`` when the request fails (the error is
        printed instead of being raised, so callers must check for ``None``).
    """
    try:
        # Without a timeout a stalled server would block the script forever.
        response = requests.get(url, timeout=10)
        # Force GBK decoding instead of relying on the guessed encoding.
        response.encoding = 'gbk'
        return response.text
    except requests.RequestException as e:
        print(e)
        return None


if __name__ == '__main__':
    url = 'http://www.521609.com/meinvxiaohua/'
    get_html(url)
第二步:下载美女图片
def down_show(html, page):
    """Download every image of one listing page into the ``show`` directory.

    Images are saved as ``./show/第<page>页-<n>.jpg`` where ``n`` is the
    1-based position of the ``<img>`` tag inside the listing container.
    Failures are printed and skipped so one bad image does not abort the
    rest of the page.

    Args:
        html: HTML text of the listing page. A falsy value (for example the
            ``None`` produced by a failed fetch) is reported and skipped.
        page: Page label used in the saved file names.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    if not html:
        print('page %s: no HTML to parse, skipped' % page)
        return

    try:
        soup = BeautifulSoup(html, 'lxml')
        container = soup.find("div", class_="index_img list_center")
    except Exception as e:  # best-effort script: report parser problems, do not crash
        print(e)
        return

    if container is None:
        print('page %s: image container not found, skipped' % page)
        return

    images = container.find_all('img')
    if images:
        # Created once per page; exist_ok avoids the check-then-create race.
        os.makedirs('show', exist_ok=True)

    for num, img in enumerate(images, start=1):
        src = img.get('src')
        if not src:
            # An <img> without a src attribute has nothing to download.
            continue
        # urljoin handles root-relative, relative and absolute src values.
        url_pic = urljoin('http://www.521609.com', src)
        try:
            urlretrieve(url_pic, './show/' + '第%s页-%s.jpg' % (page, num))
        except (OSError, ValueError) as e:
            # URLError/HTTPError are OSError subclasses; keep going on failure.
            print(e)
第三步:可选择下载多少页,代码所示为下载5页
def get_pages(page):
    """Download the images of the first *page* listing pages.

    Args:
        page: Number of listing pages to process, starting at
            ``list121.html`` (which is labelled page 1 in the file names).
    """
    first_index = 121  # list121.html is the first listing page
    for i in range(first_index, first_index + page):
        page_no = i - first_index + 1
        url = 'http://www.521609.com/meinvxiaohua/list%d.html' % i
        html = get_html(url)
        if html is None:
            # get_html already printed the error; nothing to parse here.
            print('page %d: fetch failed, skipped' % page_no)
        else:
            down_show(html, page_no)
        # Pause between pages so the site is not hammered.
        time.sleep(1)
    print("图片下载完毕")


if __name__ == '__main__':
    get_pages(5)
也可以采用多线程
import os
import threading
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

headers = {
    'Referer': 'http://www.521609.com/meinvxiaohua/',
    'User-Agent': '',
}

# list121.html is the first listing page and is labelled page 1.
FIRST_LIST_INDEX = 121
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10


def get_html(url):
    """Fetch *url* with the shared headers and return it as GB2312 text.

    Returns:
        The page text, or ``None`` when the request fails (the error is
        printed rather than raised).
    """
    try:
        response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.encoding = "gb2312"
        return response.text  # decoded text (str)
    except requests.RequestException as e:
        print(e)
        return None


def mk_dir():
    """Create the ``./show/`` output directory if it does not exist yet."""
    os.makedirs('./show/', exist_ok=True)


def down_image(html, page):
    """Save every image of one listing page as ``./show/第<page>页-<n>.jpg``.

    Failures are printed and skipped so that one bad image does not stop
    the remaining downloads of the page.

    Args:
        html: HTML text of the listing page; a falsy value is skipped.
        page: Page label used in the file names.
    """
    if not html:
        print('page %s: no HTML to parse, skipped' % page)
        return

    try:
        soup = BeautifulSoup(html, 'lxml')  # lxml can parse both HTML and XML
        container = soup.find('div', class_='index_img list_center')
    except Exception as e:  # best-effort script: report parser problems, do not crash
        print(e)
        return

    if container is None:
        print('page %s: image container not found, skipped' % page)
        return

    mk_dir()  # safe to repeat (and to call from several threads): exist_ok=True

    for num, img in enumerate(container.find_all('img'), start=1):
        src = img.get('src')  # usually only the path part of the address
        if not src:
            continue
        url = urljoin('http://www.521609.com', src)
        try:
            resp = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
            # Do not store an HTTP error page under a .jpg name.
            resp.raise_for_status()
            with open('./show/第%s页-%s.jpg' % (page, num), 'wb') as file:
                file.write(resp.content)  # raw bytes
        except (requests.RequestException, OSError) as e:
            print(e)
        time.sleep(1)


def get_pages(page):
    """Download the first *page* listing pages one after another.

    Args:
        page: Number of listing pages to process.
    """
    mk_dir()
    for i in range(FIRST_LIST_INDEX, FIRST_LIST_INDEX + page):
        page_no = i - FIRST_LIST_INDEX + 1
        url = "http://www.521609.com/meinvxiaohua/list%s.html" % i
        html = get_html(url)
        # Pass the page index, not the total count: with the total every
        # page wrote the same file names and overwrote the previous page.
        down_image(html, page_no)
        time.sleep(1)
        print('美女图片前%s页下载完毕' % str(page_no))


def get_pages_threaded(page):
    """Threaded variant of :func:`get_pages`: one thread per listing page.

    The listing pages are fetched sequentially in the calling thread; only
    the image downloads of each page run in their own thread.

    Args:
        page: Number of listing pages to process.
    """
    mk_dir()
    workers = []
    for i in range(FIRST_LIST_INDEX, FIRST_LIST_INDEX + page):
        url = "http://www.521609.com/meinvxiaohua/list%s.html" % i
        html = get_html(url)
        page_no = i - FIRST_LIST_INDEX + 1
        workers.append(threading.Thread(target=down_image, args=(html, page_no)))
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


def main():
    """Download three pages sequentially and print the elapsed seconds."""
    start_time = time.time()
    get_pages(3)
    stop_time = time.time()
    load_time = stop_time - start_time
    print(load_time)  # sample output noted by the author: 48.115086793899536


if __name__ == '__main__':
    main()