python学习笔记---BeautifulSoup模块爬图
使用BeautifulSoup模块爬图,学习HTML文本解析与标签定位。
网上教程多是爬mzitu,此网站反爬限制多了。随意找了个网址,解析速度有些慢。
脚本流程:首页获取总页数-->拼接每页URL-->获取每页中所有主题URL-->遍历图片源URL下载,保存
# python3
# -*- coding: utf-8 -*-
# author: Jack
# date: 2020/3/28

import os
import sys
import time  # only needed when the optional throttling sleeps below are enabled
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

DIR_PATH = os.path.dirname(os.path.abspath(__file__))
sys.path.append(DIR_PATH)

# Seconds to wait for a server response before a request is abandoned;
# without it a stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 30

HEADER = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:74.0) Gecko/20100101 Firefox/74.0',
}


def create_dir(file_path):
    '''
    Create the image directory if it is missing, then switch into it.

    :param file_path: images_directory
    :return: None
    '''
    if not os.path.exists(file_path):
        # exist_ok covers the directory appearing between the check and the call.
        os.makedirs(file_path, exist_ok=True)
        print('Creatr directory:', file_path)
    # NOTE: this also changes the process working directory to file_path.
    os.chdir(file_path)


def save_data(src, dir_name, file_name):
    '''
    Download one image to DIR_PATH/images/<dir_name>/<file_name>.

    Nothing is downloaded when the target file already exists.

    :param src: images url
    :param dir_name: directory name
    :param file_name: image name
    :return: None
    :raises requests.RequestException: on timeout, connection failure or an
        HTTP error status (so error pages are never stored as images)
    '''
    file_path = os.path.join(DIR_PATH, 'images', str(dir_name))  # directory path
    image_path = os.path.join(file_path, file_name)  # image path
    create_dir(file_path)

    if os.path.isfile(image_path):
        print('File already exists! \nPass')
        return

    # Fetch and validate the response before opening the file, so a failed
    # request never leaves an empty or bogus file behind.
    req = requests.get(src, headers=HEADER, timeout=REQUEST_TIMEOUT)
    req.raise_for_status()
    with open(image_path, 'wb') as f_save:
        f_save.write(req.content)
    print('Download successful:', file_name)


def request_to_url(url, header):
    '''
    Fetch a page and return its body as text.

    :param url: page_url
    :param header: request headers
    :return: response text
    :raises requests.RequestException: on timeout, connection failure or an
        HTTP error status
    '''
    res = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    return res.text


def soup(url, header):
    '''
    Fetch a page and parse it with the built-in 'html.parser'.

    :param url: page_url
    :param header: request headers
    :return: BeautifulSoup document for the page
    '''
    return BeautifulSoup(request_to_url(url, header), 'html.parser')


def action(url):
    '''
    Crawl every listing page, open each theme and download its images.

    Images are grouped 100 per folder; folders are named 200, 300, 400, ...

    :param url: URL of the site's home page
    :return: None
    '''
    download_count = 0
    try:
        # urljoin replaces the last path segment of a base that lacks a
        # trailing slash, so normalise the base first.
        base_url = url if url.endswith('/') else url + '/'

        page_tag = soup(url, HEADER).find('div', class_='pg').find_all('a')
        # The second-to-last pager link carries the highest page number.
        max_page = int(page_tag[-2].text.split(' ')[-1])

        for page_no in range(1, max_page + 1):  # find page
            page_url = urljoin(base_url, 'forum.php?order=&fid=0&page=%d' % page_no)
            # time.sleep(1)  # enable to throttle requests
            page_all_theme_list = soup(page_url, HEADER).find('div', class_='kind_show')
            theme_list = page_all_theme_list.find_all('div', class_='photo_thumb kind_left')

            for theme_tag in theme_list:  # find theme
                theme = theme_tag.find('div', class_='title').find('a')
                # Resolve relative links against the page they were found on.
                img_url = urljoin(page_url, theme.get('href'))
                print("Ready download: %s" % theme.string, img_url)
                img_page_tag = soup(img_url, HEADER).find('td', class_='t_f').find_all('img')

                for img_tag in img_page_tag:  # find image
                    try:
                        img_src = img_tag.get('src')
                        if not img_src:
                            print('Img_tag & Save_data Error:', 'missing src attribute')
                            continue
                        img_src = urljoin(img_url, img_src)
                        # Derive the folder from the counter instead of
                        # incrementing it in place: a failed download at a
                        # multiple of 100 must not skip ahead to a new folder.
                        dir_name = 100 * (download_count // 100 + 2)
                        save_data(img_src, dir_name, img_src.split('/')[-1])
                        download_count += 1
                        print('Download successful: %d' % download_count)
                    except Exception as e:
                        # Best effort: one bad image must not stop the crawl.
                        print('Img_tag & Save_data Error:', e)
                        continue

    except Exception as e:
        # Any failure outside a single image download aborts the crawl.
        print('The trunk Error:', e)


if __name__ == '__main__':
    print('Run.....')
    URL = 'http://www.lesb.cc/'
    action(URL)
    print('Perform !')