import requests
from bs4 import BeautifulSoup
def getHTMLText(url):
    """Fetch *url* and return the first 1000 characters of its decoded body.

    Sends a GET request with a browser-like ``user-agent`` header and a
    30-second timeout. On success it prints the headers of the request that
    was sent, followed by a separator line, before returning.

    Args:
        url: Address of the page to download.

    Returns:
        The first 1000 characters of the response text, or the string
        '产生异常' when the request fails or the server answers with an
        error status.
    """
    kv = {'user-agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=30, headers=kv)
        # Raises requests.HTTPError when the response carries an error status.
        r.raise_for_status()
        # Use the encoding detected from the body instead of the one
        # declared in the response headers.
        r.encoding = r.apparent_encoding
        print(r.request.headers)
        print('---------------')
        return r.text[:1000]
    except requests.RequestException:
        # Only network/HTTP failures are treated as "fetch failed"; anything
        # else (KeyboardInterrupt, programming errors) is left to propagate
        # instead of being hidden behind the sentinel string.
        return '产生异常'
if __name__ == '__main__':
    # Demo: download a page, parse it, and inspect the first <a> tag.
    url = 'http://www.baidu.com'
    demo = getHTMLText(url)
    soup = BeautifulSoup(demo, 'html.parser')
    print(soup.prettify())
    print(soup.title)

    # soup.a is None when the parsed text holds no <a> tag (for example when
    # the fetch failed, or the truncated text ends before the first link), so
    # guard the attribute access instead of crashing with AttributeError.
    first_link = soup.a
    if first_link is None:
        print('No <a> tag found in the fetched HTML')
    else:
        print(first_link.name)
        print(first_link.parent.name)
        print(first_link.attrs)  # attributes of the tag