Python学习16
常用外置模块
1、requests
Python第三方库requests比python的内置库urllib处理URL资源更方便
1、使用requests
GET访问一个页面
- 当获取的首页乱码时,可以用encoding/content设置解码方式
import requests

r = requests.get('https://www.baidu.com/')
# Way 1: decode via the encoding attribute — r.text uses r.encoding.
r.encoding='utf-8' #set the codec that r.text will use
print(r.encoding) #show the codec currently in effect
print(r.status_code) #HTTP status code, used to judge whether the request succeeded
print(r.text) #body decoded to str with r.encoding
print(r.url) #the URL actually requested
# Way 2: decode via content — r.content is raw bytes, decode() yields str.
page = r.content.decode() #fix: the original discarded this value and re-printed r.text
print(page)
- 可以用来判断请求是否成功
assert response.status_code == 200  # replace 200 with the expected status code; raises AssertionError on mismatch
- 查看请求的响应头以及相应的url
import requests

# Inspect the response headers, the URL that was sent, and the final URL.
resp = requests.get('https://www.sina.com')
print(resp.headers)      # headers returned by the server
print(resp.request.url)  # URL of the request actually sent
print(resp.url)          # final URL (after any redirects)
- 可以构造正确的headers头部,来请求网页得到完整的页面内容
import requests

# Sending a realistic User-Agent makes the server return the full page
# instead of a stripped-down one.
ua_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
resp = requests.get('https://www.baidu.com', headers=ua_headers)
print(resp.headers)           # response headers
print(resp.content.decode())  # full page body, decoded from bytes
- 在requests中的response.request.url的返回结果中存在url编码,需要url解码
import requests

# `params` is URL-encoded into the query string; r.request.url therefore
# shows the percent-encoded form of the query.
ua_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
query = {'wd':'耐克'}
base_url = 'https://www.baidu.com/s?'
r = requests.get(base_url, headers=ua_headers, params=query)
print(r.status_code)  # HTTP status code
print(r.request.url)  # sent URL — note the percent-encoded query
print(r.content)      # raw bytes body
print(r.text)         # body decoded to str
- 爬取耐克百度贴吧的网页信息,并保存到本地
import requests
class TiebaSpider:
    """Crawl result pages of a Baidu Tieba forum and save each page's HTML locally.

    NOTE(review): the original paste had lost all indentation; the class
    structure is restored here from the code's syntax.
    """

    def __init__(self, tiebaname):
        # Forum name; also used as the prefix of the saved file names.
        self.tiebaname = tiebaname
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
        # pn is the result offset: page i starts at pn = i * 50.
        self.url_temp = 'https://tieba.baidu.com/f?kw=' + tiebaname + '&ie=utf-8&pn={}'

    def get_url_list(self):
        """Return the URLs of the first 1000 result pages (pn = 0, 50, ...)."""
        return [self.url_temp.format(i * 50) for i in range(1000)]

    def parse_url(self, url):
        """Fetch *url* with the spider's headers and return the decoded HTML text."""
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def html_save(self, html_str, pagename):
        """Write one page's HTML to '<forum>第<page>页.html' in the working directory."""
        file_path = '{}第{}页.html'.format(self.tiebaname, pagename)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html_str)

    def run(self):
        """Download every page and save it; page numbers start at 1.

        Uses enumerate instead of the original per-iteration O(n)
        list.index() lookup; the unused `save` variable is dropped.
        """
        for pagename, url in enumerate(self.get_url_list(), start=1):
            html_str = self.parse_url(url)
            self.html_save(html_str, pagename)


if __name__ == '__main__':
    tieba_spider = TiebaSpider('耐克')
    tieba_spider.run()