Python 简单的网页图片爬虫
直接上代码:
"""Simple web-page image crawler.

Requires the third-party packages ``requests`` and ``BeautifulSoup``:

    pip install requests
    pip install bs4     # library for extracting data from HTML or XML files
    pip install lxml    # optional; this script parses with the built-in 'html.parser'
"""
from urllib.parse import urljoin

import requests  # HTTP client used to fetch the pages and the images
from bs4 import BeautifulSoup

BASE_URL = "https://pic.netbian.com/"
# Seconds to wait for a response; without a timeout requests.get() can block forever.
REQUEST_TIMEOUT = 10


def get_htmls(pages=None):
    """Fetch the listing pages to crawl and return their HTML.

    Args:
        pages: Iterable of page numbers to fetch. Defaults to pages 2, 3 and 4.

    Returns:
        A list holding the HTML text of every page that answered with a
        successful HTTP status, in the order requested.
    """
    # A None sentinel replaces the former mutable list default.
    if pages is None:
        pages = range(2, 5)

    pages_list = []
    for page in pages:
        url = f"{BASE_URL}4kfengjing/index_{page}.html"  # listing page URL
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if not response.ok:
            # An error page has no gallery markup; skip it instead of parsing it.
            print(f"skip page {url}: HTTP {response.status_code}")
            continue
        response.encoding = 'gbk'  # force GBK decoding of the body
        pages_list.append(response.text)
    return pages_list


def _find_image_tags(html):
    """Return the <img> tags nested in div#main, div.slist, ul.clearfix ([] if absent)."""
    soup = BeautifulSoup(html, 'html.parser')  # parse the HTML (or XML) text
    # Handy calls while exploring a page's structure:
    #   print(soup.prettify())           # dump the document with standard indentation
    #   print(soup.title.string)         # text content of the <title> node
    #   print(soup.link.attrs)           # `link` is a tag name, like <title> or <head>
    #   print(soup.link.attrs['href'])   # a single attribute of that node
    main = soup.find('div', id='main')
    if main is None:
        return []
    slist = main.find('div', class_='slist')
    if slist is None:
        return []
    pic_li = slist.find('ul', class_='clearfix')
    if pic_li is None:
        return []
    return pic_li.find_all('img')


def _safe_name(alt):
    """Turn an image's alt text into a file-name fragment without spaces or path separators."""
    return alt.replace(" ", '_').replace("/", '_').replace("\\", '_')


def get_picturs(htmls, save_prefix='./partice05'):
    """Find every gallery image in the given pages and download it.

    Args:
        htmls: Iterable of HTML strings, as returned by ``get_htmls``.
        save_prefix: String prepended to each file name. NOTE: it is a plain
            prefix, not a directory, so the default writes files such as
            ``./partice05<alt>.jpg`` into the current directory. Pass a value
            ending in a path separator (of an existing directory) to save
            inside a folder instead.

    Each image is written to ``<save_prefix><alt text>.jpg``; images that share
    the same alt text overwrite one another.
    """
    for html in htmls:
        for img in _find_image_tags(html):
            alt = img.get('alt')
            src = img.get('src')
            if alt is None or not src:
                # Nothing to name the file after, or nothing to download.
                continue

            pic_name = save_prefix + _safe_name(alt) + '.jpg'
            # urljoin copes with root-relative, relative and absolute src values
            # without producing a doubled slash.
            image_url = urljoin(BASE_URL, src)
            response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
            if not response.ok:
                # Do not save an HTML error page under a .jpg name.
                print(f"skip image {image_url}: HTTP {response.status_code}")
                continue

            with open(pic_name, 'wb') as f:
                f.write(response.content)
            print("picturs dowmload in:{}".format(pic_name))


if __name__ == "__main__":
    # Fetch the HTML of listing page 2 only, then download its images.
    htmls = get_htmls(pages=range(2, 3))
    # print(htmls)  # uncomment to inspect the raw HTML
    get_picturs(htmls)
posted on 2023-09-07 09:45 seven1314pp 阅读(26) 评论(0) 编辑 收藏 举报