Crawling with the urllib library and request headers
# Request headers

# 1. Import the module
from urllib import request

# 2. Steps
# (1) Define the target url
base_url = "http://www.langlang2017.com/index.html"

# Request headers
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    # "Accept-Encoding": "gzip, deflate",  # do not add this header; even if you add it, comment it out
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
}

# Build a Request object that carries the headers
req = request.Request(base_url, headers=headers)

# Notes on request.Request:
# a. url: the target URL
# b. data (default None): data submitted along with the request (e.g. data to POST);
#    when supplied, the HTTP method changes from GET to POST.
# c. headers (default empty): a dict of key-value pairs sent as HTTP request headers.
#    c.1 User-Agent: identifies the browser.
#        History: Netscape vs. IE; Netscape lost the browser war, and its engineers went on to Mozilla (open source).

# Add more header fields
req.add_header("Connection", "keep-alive")

# Read a header field back
print(req.get_header("Connection"))

# Send the request and save the page
response = request.urlopen(req)

html = response.read()
html = html.decode('utf-8')
print(html)

with open("langlang2017_index_headers.html", "w", encoding="utf-8") as f:
    f.write(html)
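The notes above mention that passing a data argument switches the request from GET to POST. Below is a minimal sketch of that, reusing the example URL; the form field name is hypothetical and only for illustration.

from urllib import request, parse

post_url = "http://www.langlang2017.com/index.html"   # same example URL as above
form = {"keyword": "test"}                             # hypothetical form field, for illustration only
data = parse.urlencode(form).encode("utf-8")           # data must be bytes, not str

post_req = request.Request(post_url, data=data, headers={"User-Agent": "Mozilla/5.0"})
print(post_req.get_method())                           # prints "POST" because data is not None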
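The Accept-Encoding line is kept commented out because urllib does not decompress responses for you; if you do send it, you have to gunzip the body yourself. A minimal sketch of that, again against the example URL:

import gzip
from urllib import request

gz_req = request.Request(
    "http://www.langlang2017.com/index.html",
    headers={
        "Accept-Encoding": "gzip, deflate",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
    },
)
resp = request.urlopen(gz_req)
body = resp.read()

# Only decompress if the server actually answered with gzip
if resp.headers.get("Content-Encoding") == "gzip":
    body = gzip.decompress(body)

print(body.decode("utf-8"))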