# Check whether a request comes from a normal browser (by inspecting its User-Agent).

'''
作业2:
判断user-agent,判断是否是正常浏览器访问
'''
from urllib import request

# Target page used for the User-Agent experiment.
base_url = "http://www.langlang2017.com"

# Header names must be spelled the way HTTP defines them ("Connection",
# "User-Agent", with a hyphen). A misspelled name is stored as an unrelated
# custom header, so no real User-Agent would be set on the request.
headers = {
    "Connection": "keep-alive",
    "User-Agent": "mozilla/5.0 (Windows nt 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
}
req = request.Request(base_url, headers=headers)

# Request stores each header name via str.capitalize(), so "User-Agent" is
# kept internally as "User-agent"; get_header() must be queried with that form.
user_agent = req.get_header("User-agent")
print(req.headers, user_agent)

# if user_agent:
#     print("是浏览器访问")
# else:
#     print("不是浏览请求!")



# response = request.urlopen(req)
#
# html = response.read()
#
# html = html.decode("utf-8")
#
# #
# print(html)


'''
作业1:
假装浏览访问,假装不同的浏览器访问。
从user_agent_list.txt文件中,读取user_agent数据,用来封装成一个带headers的request对象,进行网站页面的爬取。
'''

#1.读取文件内容
# text = ""
# with open("user_agent_list.txt","r",encoding="utf-8") as f:
#     text = f.read()
#     print(text)
#     f.close()

#2.使用文件内容

# import random
# user_agent_list = text.split("\n")
# print(len(user_agent_list))
# headers = {
#     "user_agent":random.choice(user_agent_list)
# }
#
# # req = request.Request(base_url,headers=headers)
# # response = request.urlopen(req)
# # html = response.read()
# # html = html.decode("utf-8")
# # print(html,req.get_header("User_agent"))
#
# #小结:遇到的问题,一次读全部的user_agent,太占内存,而且大多数用不着。
# # 解决方法:使用迭代读取文件内容,确保节省内存。
#
# for line in open("user_agent_list.txt"):
#     headers = {
#         "user_agent":line.strip()
#     }
#
#     req = request.Request(base_url,headers=headers)
#     response = request.urlopen(req)
#     html = response.read()
#     html = html.decode("utf-8")
#     print(html)

 

# posted @ 2018-03-10 21:16  Bob__Zhang  阅读(583)  评论(0编辑  收藏  举报