# Simple crawler (requests examples)
import requests
# The requests module sends HTTP requests and receives the responses;
# each response also keeps a reference to the request that produced it.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
}
# Query-string parameters for the search request.
params = {"wd": "python"}
r = requests.get(url="http://www.baidu.com/s", params=params, headers=headers)
# r.content is the raw body as bytes; decode it explicitly as UTF-8.
print(r.content.decode("utf-8"))
# Show the URL of the request that was sent.
print(r.request.url)
import requests
# "发送请求"
r = requests.get(url="http://www.baidu.com")
# r.text may contain garbled characters: requests guesses the encoding of
# the response body and decodes it with that guess.
print(r.text)
# NOTE(review): the original author observed that little content comes back
# here, presumably because no User-Agent is set and the site treats the
# client as a crawler -- confirm against the live site.
#
# To inspect the encoding that requests guessed:
# print(r.encoding)
#
# r.content holds the raw bytes, so r.content.decode() can be used directly.
# Alternatively, override the encoding used by r.text:
# r.encoding = "utf8"
# print(r.text)
#
# To inspect the headers that were sent with the request:
# print(r.request.headers)
#
# Print the response headers.
print(r.headers)
import requests
class TiebaSpider:
    """Fetch the first pages of a Baidu Tieba forum and save each as an HTML file.

    Args:
        tieba_name: Forum name; used both in the request URL and as the
            prefix of the output file names.
        page_count: Number of pages to fetch. Defaults to 5.
    """

    # Step of the ``pn`` query parameter between two consecutive pages.
    PAGE_SIZE = 50
    # Seconds to wait for the server before giving up; without a timeout
    # requests.get() can block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self, tieba_name, page_count=5):
        self.tieba_name = tieba_name
        self.page_count = page_count
        # URL template; ``{}`` is filled with the ``pn`` offset of a page.
        self.temp_url = "https://tieba.baidu.com/f?kw=" + tieba_name + "&pn={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        }

    def get_url_list(self):
        """Return the page URLs, in order, one per page to fetch."""
        return [
            self.temp_url.format(page_index * self.PAGE_SIZE)
            for page_index in range(self.page_count)
        ]

    def parse_url(self, url):
        """Send a GET request to *url* and return the body decoded as UTF-8.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when the server does not answer within REQUEST_TIMEOUT.
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return response.content.decode()

    def save_html(self, html, page_num):
        """Write *html* to ``<tieba_name>_<page_num>.html`` in the working directory."""
        file_path = self.tieba_name + "_" + str(page_num) + ".html"
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html)

    def run(self):
        """Fetch every page and save it, numbering the files from 1."""
        # enumerate() gives the page number directly; looking it up with
        # list.index() costs a linear search per page and returns the wrong
        # number if two URLs are equal.
        for page_num, url in enumerate(self.get_url_list(), start=1):
            html_str = self.parse_url(url)
            self.save_html(html_str, page_num)
if __name__ == "__main__":
    # Script entry point: build a spider for the "蒋欣" forum and run it.
    TiebaSpider("蒋欣").run()