# 使用 BeautifulSoup 解析 HTML
#coding=utf-8
import httplib
import socket
import urllib2
from contextlib import closing

from bs4 import BeautifulSoup
UserAgent
=
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
def downloadPage(url, timeout=30):
    """Fetch ``url`` with a browser-like User-Agent and return the raw body.

    Args:
        url: Address of the page to download.
        timeout: Timeout in seconds handed to the opener. Defaults to 30,
            the value that used to be hard-coded.

    Returns:
        Whatever ``resp.read()`` yields for the response, or an empty
        string when the request fails with one of the handled errors.

    Only ``urllib2.URLError`` (``urllib2.HTTPError`` is a subclass of it),
    ``socket.error`` and ``httplib.BadStatusLine`` are handled: the error
    is printed and ``''`` is returned. Any other exception propagates to
    the caller.
    """
    headers = {'User-Agent': UserAgent}
    req = urllib2.Request(url=url, headers=headers)
    opener = urllib2.build_opener()
    try:
        # closing() makes sure the response is closed even when read()
        # raises, so the underlying connection is not leaked.
        with closing(opener.open(req, timeout=timeout)) as resp:
            return resp.read()
    except (urllib2.URLError, socket.error, httplib.BadStatusLine) as ex:
        # Best-effort download: report the failure and signal it to the
        # caller with an empty result instead of raising.
        print(ex)
        return ''
if __name__ == '__main__':
    # Download the listing page and parse it with the lxml parser.
    html = downloadPage("这填douban的地址")
    soup = BeautifulSoup(html, 'lxml')

    # Each <li> under ol.grid_view is treated as one movie entry.
    for entry in soup.select('ol.grid_view li'):
        # Link to the movie detail page
        detail_link = entry.select('div.item > div.pic a')[0]
        print(detail_link.attrs['href'])

        # Poster image link
        poster = entry.select('div.item > div.pic a img')[0]
        print(poster.attrs['src'])

        # Title
        title = entry.select('div.item > div.info > div.hd > a > span.title')[0]
        print(title.get_text())

        # Rating
        rating = entry.select('div.item > div.info > div.bd > div.star > span.rating_num')[0]
        print(rating.get_text())

        # Visual separator between entries
        print('-------------------------------------------------------------------------')