爬虫之urllib下的request使用
一.url网址
1.含有汉字的url
from urllib import parse

# Percent-encode a Chinese character so it can be embedded in a URL.
word = '龙'
# fix: only `parse` is imported here, so call parse.quote, not urllib.parse.quote
word = parse.quote(word)
url = 'https://baike.baidu.com/search/word?word=%s' % word
2.可以将字典添加到url中
# fix: module is spelled `urllib`, not `ulrlib`
from urllib import parse, request

base_url = "http://www.baidu.com/s?"
content = input("请输入你要搜索的内容:")
qs = {
    "wd": content,
    "rsv_sp": 1,
}
# urlencode percent-encodes the values into a query string,
# e.g. wd=%E5%85%84%E5%BC%9F%E8%BF%9E
qs = parse.urlencode(qs)
base_url = base_url + qs
二.urllib下的request模块
1.直接访问的网址:
from urllib import request

# (1) target URL
base_url = "http://www.langlang2017.com/index.html"
# (2) send a GET request to the URL; returns a file-like response object
#     (the timeout argument is optional)
response = request.urlopen(base_url, timeout=0.5)
# (3) read the raw response body (bytes)
html = response.read()
# (4) decode the bytes to str
html = html.decode('utf-8')
2.需要添加消息头和数据的url
(1)消息头
headers = {
    # fix: the header name is "User-Agent"; "USer_Agent" (underscore) would be
    # sent as a nonstandard header the server does not recognize
    "User-Agent": "mozilla/5.0 (Windows nt 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
}
# Wrap the URL and headers in a Request object, then open it.
req = request.Request(url, headers=headers)
response = request.urlopen(req)
html = response.read()
html = html.decode("utf-8")
# Add further headers with add_header:
# req.add_header("Connection", "keep-alive")
# Read a header back with get_header (urllib stores keys capitalized,
# so look it up as "User-agent"):
# user_agent = req.get_header("User-agent")
(2)data数据
data = {
    "kw": content
}
base_url = "http://fanyi.baidu.com/sug"
# fix: a POST body must be urlencoded first and then encoded to bytes;
# bytes(dict, encoding=...) raises TypeError
body = parse.urlencode(data).encode("utf-8")
headers = {
    # fix: Content-Length is the byte length of the encoded body,
    # not len() of the dict (urllib also sets this automatically if omitted)
    "Content-Length": len(body),
}
# Build the Request (url, binary data, headers) and send it.
req = request.Request(url=base_url, data=body, headers=headers)
response = request.urlopen(req)
3.含有cookie,登录名和密码,ssl证书
# fix: merged the duplicate `from urllib import request` lines into one import
from urllib import parse, request
from http import cookiejar
import ssl

# Skip SSL certificate verification (one-line opt-out).
ssl._create_default_https_context = ssl._create_unverified_context

# Log in with a username/password and keep the session cookie.
cookie = cookiejar.CookieJar()                        # cookie store
cookie_handler = request.HTTPCookieProcessor(cookie)  # cookie handler
http_handler = request.HTTPHandler()                  # http handler
https_handler = request.HTTPSHandler()                # https handler
# Build an opener that routes requests through all three handlers.
opener = request.build_opener(http_handler, https_handler, cookie_handler)

# Log in
login_url = "http://www.renren.com/PLogin.do"
data = {
    "email": "18811176939",
    "password": "123457"
}
data = parse.urlencode(data)
req = request.Request(login_url, data=bytes(data, 'utf-8'))
response = opener.open(req)
html = response.read()
html = html.decode('utf-8')
4.md5加密
import hashlib

def getMD5(value):
    """Return the hexadecimal MD5 digest of *value*.

    value: str — hashed after encoding to UTF-8.
    Returns a 32-character lowercase hex string.
    """
    # fix: the function body was not indented in the original (syntax error)
    md5 = hashlib.md5()
    md5.update(bytes(value, encoding="utf-8"))
    sign = md5.hexdigest()
    return sign