Determining whether a request looks like normal browser access
'''
Homework 2: Inspect the User-Agent header to decide whether a request
looks like normal browser access.
'''
from urllib import request

base_url = "http://www.langlang2017.com"

headers = {
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
}

req = request.Request(base_url, headers=headers)
# urllib stores header names via str.capitalize(), so the stored key is
# "User-agent", not "User-Agent".
user_agent = req.get_header("User-agent")
print(req.headers, user_agent)

# if user_agent:
#     print("Browser access")
# else:
#     print("Not a browser request!")

# response = request.urlopen(req)
# html = response.read()
# html = html.decode("utf-8")
# print(html)

'''
Homework 1: Pretend to be a browser -- a different browser on each request.
Read the user-agent strings from user_agent_list.txt, wrap each one into a
Request object with headers, and fetch the site's pages.
'''
# 1. Read the file contents.
# with open("user_agent_list.txt", "r", encoding="utf-8") as f:
#     text = f.read()   # the with-block closes the file; no f.close() needed
#     print(text)

# 2. Use the file contents.
# import random
# user_agent_list = text.split("\n")
# print(len(user_agent_list))
# headers = {
#     "User-Agent": random.choice(user_agent_list)
# }
# req = request.Request(base_url, headers=headers)
# response = request.urlopen(req)
# html = response.read()
# html = html.decode("utf-8")
# print(html, req.get_header("User-agent"))

# Summary: reading every user-agent at once wastes memory, and most of them
# are never used. Solution: iterate over the file line by line instead.
with open("user_agent_list.txt", "r", encoding="utf-8") as f:
    for line in f:
        headers = {
            "User-Agent": line.strip()
        }
        req = request.Request(base_url, headers=headers)
        response = request.urlopen(req)
        html = response.read().decode("utf-8")
        print(html)
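The loop above is enough for the homework, but one blank line in user_agent_list.txt or a single network hiccup will abort the whole crawl. Below is a minimal hardened sketch under the same assumptions as the homework (same base_url and user_agent_list.txt); the helper name fetch_with_user_agent is hypothetical, not part of the original exercise. It adds a timeout, error handling, and a short delay between requests.

import time
from urllib import request, error

def fetch_with_user_agent(url, user_agent, timeout=10):
    # Build a request carrying the given User-Agent and fetch the page.
    req = request.Request(url, headers={"User-Agent": user_agent})
    try:
        with request.urlopen(req, timeout=timeout) as response:
            # Assumes the page is UTF-8, as the homework code does.
            return response.read().decode("utf-8")
    except error.URLError as exc:   # HTTPError is a subclass of URLError
        print("request failed:", exc)
        return None

with open("user_agent_list.txt", "r", encoding="utf-8") as f:
    for line in f:
        ua = line.strip()
        if not ua:                  # skip blank lines in the list
            continue
        html = fetch_with_user_agent("http://www.langlang2017.com", ua)
        if html:
            print(html[:200])       # preview instead of dumping the full page
        time.sleep(1)               # be polite: pause between requests

Wrapping the fetch in a function keeps the retry-per-user-agent logic in one place, and returning None on failure lets the loop simply skip bad entries instead of crashing.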