- 使用正则表达式匹配
# coding:utf-8
import re
import urllib
import urllib.request  # Python 3 home of urlopen/urlretrieve (Py2 had them on urllib)
def get_content(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Ported from Python 2: ``urllib.urlopen`` no longer exists, so this
    uses ``urllib.request.urlopen``.  The bytes are decoded so callers
    can run str regexes (or BeautifulSoup) over the result.

    :param url: URL to fetch (http/https/file schemes all work).
    :return: page body as ``str``.
    """
    import urllib.request  # local import keeps this edit self-contained

    html = urllib.request.urlopen(url)
    try:
        # errors="replace": Tieba pages are not guaranteed to be clean UTF-8
        content = html.read().decode("utf-8", errors="replace")
    finally:
        html.close()  # always release the connection, even if read() raises
    return content
def get_images(info):
    """Download every Tieba post image found in the HTML text *info*.

    Matches ``<img class="BDE_Image" src="http:*****.jpg">`` tags and saves
    each image into the working directory as ``0.jpg``, ``1.jpg``, ...
    Prints each URL as it goes and, finally, the number of images found.

    :param info: page HTML as ``str``.
    :return: ``None`` (the original script printed this return value).
    """
    import urllib.request

    # BUG FIX: the original pattern was ' class="BDE_Image" src="(...)" '
    # with a trailing space, which never matches when src is the last
    # attribute before '>' (the common case in Tieba markup).
    pat = re.compile(r'class="BDE_Image" src="(.+?\.jpg)"')
    images_code = pat.findall(info)
    # enumerate replaces the hand-rolled i = 0 / i = i + 1 counter;
    # filenames still start at 0, exactly as before.
    for i, image_url in enumerate(images_code):
        print(image_url)
        urllib.request.urlretrieve(image_url, '%s.jpg' % i)
    print(len(images_code))
if __name__ == "__main__":
    # Guarded so importing this module no longer hits the network.
    # get_images() returns None; the original script printed that value
    # after the download count, so the print is kept for identical output.
    info = get_content("http://tieba.baidu.com/p/2299704181")
    print(get_images(info))
- 使用第三方库BeautifulSoup匹配
# 安装 sudo pip install beautifulsoup4
# coding:utf-8
import urllib
import urllib.request  # Python 3 home of urlopen/urlretrieve (Py2 had them on urllib)

from bs4 import BeautifulSoup
def get_content(url):
    """Return the body of *url* decoded as UTF-8 text.

    Ported from Python 2 (``urllib.urlopen`` -> ``urllib.request.urlopen``);
    the ``with`` block guarantees the connection is closed even if read fails.

    :param url: URL to fetch.
    :return: page body as ``str``.
    """
    import urllib.request  # local import keeps this edit self-contained

    with urllib.request.urlopen(url) as resp:
        # errors="replace": tolerate the occasional non-UTF-8 byte
        return resp.read().decode("utf-8", errors="replace")
def get_images(info):
    """
    使用BeautifulSoup在网页源码中匹配图片地址

    Finds every ``<img class="BDE_Image">`` tag in the HTML text *info*,
    downloads each ``src`` to the working directory as ``1.jpg``, ``2.jpg``,
    ... (this version counts from 1, unlike the regex version), and prints
    how many images were downloaded.

    :param info: page HTML as ``str``.
    :return: ``None`` (the original script printed this return value).
    """
    import urllib.request

    # Pass an explicit parser: bs4 warns — and may pick different parsers
    # on different machines — when none is given.
    soup = BeautifulSoup(info, "html.parser")
    all_img = soup.find_all('img', class_="BDE_Image")
    i = 1
    for img in all_img:
        print(img['src'])
        urllib.request.urlretrieve(img['src'], '%s.jpg' % i)
        i = i + 1
    print("一共下载了 ", len(all_img), "张图片")
if __name__ == "__main__":
    # Guarded so importing this module no longer hits the network.
    # get_images() returns None; printing it reproduces the original
    # script's trailing "None" line.
    info = get_content("http://tieba.baidu.com/p/3368845086")
    print(get_images(info))
若非特别声明,文章均为Evilxr的个人笔记,转载请注明出处。
【推荐】还在用 ECharts 开发大屏?试试这款永久免费的开源 BI 工具!
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步