爬虫之urllib下的request使用
一.url网址
1.含有汉字的url
from urllib import parse

# Percent-encode a Chinese character so it can be embedded in a URL.
word = '龙'
# fix: only `parse` is imported here, so call parse.quote, not urllib.parse.quote
word = parse.quote(word)
url = 'https://baike.baidu.com/search/word?word=%s' % word
2.可以将字典添加到url中
# fix: module is spelled `urllib`, not `ulrlib`
from urllib import parse, request

base_url = "http://www.baidu.com/s?"
content = input("请输入你要搜索的内容:")
qs = {
    "wd": content,
    "rsv_sp": 1,
}
# urlencode percent-encodes the values into a query string,
# e.g. wd=%E5%85%84%E5%BC%9F%E8%BF%9E
qs = parse.urlencode(qs)
base_url = base_url + qs
二.urllib下的request模块
1.直接访问的网址:
from urllib import request

# (1) target URL
base_url = "http://www.langlang2017.com/index.html"
# (2) send a GET request to the URL; returns a file-like response object
#     (the timeout argument is optional)
response = request.urlopen(base_url, timeout=0.5)
# (3) read the raw response body (bytes)
html = response.read()
# (4) decode the bytes to str
html = html.decode('utf-8')
2.需要添加消息头和数据的url
(1)消息头
headers = {
    # fix: the header name is "User-Agent"; "USer_Agent" (underscore) would be
    # sent as a nonstandard header the server does not recognize
    "User-Agent": "mozilla/5.0 (Windows nt 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
}
# Wrap the URL and headers in a Request object, then open it.
req = request.Request(url, headers=headers)
response = request.urlopen(req)
html = response.read()
html = html.decode("utf-8")
# Add further headers with add_header:
# req.add_header("Connection", "keep-alive")
# Read a header back with get_header (urllib stores keys capitalized,
# so look it up as "User-agent"):
# user_agent = req.get_header("User-agent")
(2)data数据
data = {
    "kw": content
}
base_url = "http://fanyi.baidu.com/sug"
# fix: a POST body must be urlencoded first and then encoded to bytes;
# bytes(dict, encoding=...) raises TypeError
body = parse.urlencode(data).encode("utf-8")
headers = {
    # fix: Content-Length is the byte length of the encoded body,
    # not len() of the dict (urllib also sets this automatically if omitted)
    "Content-Length": len(body),
}
# Build the Request (url, binary data, headers) and send it.
req = request.Request(url=base_url, data=body, headers=headers)
response = request.urlopen(req)
3.含有cookie,登录名和密码,ssl证书
# fix: merged the duplicate `from urllib import request` lines into one import
from urllib import parse, request
from http import cookiejar
import ssl

# Skip SSL certificate verification (one-line opt-out).
ssl._create_default_https_context = ssl._create_unverified_context

# Log in with a username/password and keep the session cookie.
cookie = cookiejar.CookieJar()                        # cookie store
cookie_handler = request.HTTPCookieProcessor(cookie)  # cookie handler
http_handler = request.HTTPHandler()                  # http handler
https_handler = request.HTTPSHandler()                # https handler
# Build an opener that routes requests through all three handlers.
opener = request.build_opener(http_handler, https_handler, cookie_handler)

# Log in
login_url = "http://www.renren.com/PLogin.do"
data = {
    "email": "18811176939",
    "password": "123457"
}
data = parse.urlencode(data)
req = request.Request(login_url, data=bytes(data, 'utf-8'))
response = opener.open(req)
html = response.read()
html = html.decode('utf-8')
4.md5加密
import hashlib

def getMD5(value):
    """Return the hexadecimal MD5 digest of *value*.

    value: str — hashed after encoding to UTF-8.
    Returns a 32-character lowercase hex string.
    """
    # fix: the function body was not indented in the original (syntax error)
    md5 = hashlib.md5()
    md5.update(bytes(value, encoding="utf-8"))
    sign = md5.hexdigest()
    return sign