python -- Web Scraping Basics
urllib2
Using urllib2 (in Python 3 it was merged into urllib.request, which the examples below use)
from urllib.request import *
# Two ways to set request headers
# 1. Pass a headers dict directly to Request
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"}
request = Request("URL here (must start with http or https)", headers=header)
# 2. Add the header after building the Request
request = Request(url)
request.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36")
request.get_header("User-agent")  # read it back; urllib stores header names with only the first letter capitalized
response = urlopen(request)
html = response.read()
print(html)
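Besides read(), the response object also exposes the status code, the final URL after redirects, and the response headers. A minimal sketch (the URL is just a placeholder):
from urllib.request import Request, urlopen
request = Request("http://www.baidu.com")
response = urlopen(request)
print(response.getcode())  # HTTP status code, e.g. 200
print(response.geturl())   # final URL after any redirects
print(response.info())     # response headers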
URL encoding
When a URL contains Chinese characters, the browser transcodes them automatically; in a scraper we have to encode them ourselves.
from urllib.parse import *
# Encode: urlencode() turns a dict into a percent-encoded query string
items = {"name": "张三"}
print(urlencode(items))  # prints: name=%E5%BC%A0%E4%B8%89
# Decode: unquote() reverses the percent-encoding
print(unquote("name=%E5%BC%A0%E4%B8%89"))  # prints: name=张三
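A common use in scraping is building a query URL from a Chinese keyword. A minimal sketch, assuming Baidu's search endpoint and its wd parameter:
from urllib.parse import urlencode, quote
# urlencode() handles whole query dicts; quote() percent-encodes a single value
url = "https://www.baidu.com/s?" + urlencode({"wd": "张三"})
print(url)            # https://www.baidu.com/s?wd=%E5%BC%A0%E4%B8%89
print(quote("张三"))  # %E5%BC%A0%E4%B8%89, useful for a single path segment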
Skipping HTTPS certificate verification
When urlopen reads an https site it verifies the server's SSL certificate automatically; for certificates not signed by a trusted CA (the 12306 site was a well-known example), this raises an error, so SSL verification has to be skipped.
from urllib.request import *
import ssl
context = ssl._create_unverified_context()
request = Request(url)
response = urlopen(request, context=context)
print(response.read())
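_create_unverified_context() is a private helper; the same effect is available through the public API by disabling the checks on a default context. A minimal sketch (note that check_hostname must be turned off before verify_mode):
import ssl
from urllib.request import urlopen
context = ssl.create_default_context()
context.check_hostname = False       # skip hostname matching
context.verify_mode = ssl.CERT_NONE  # skip certificate validation
response = urlopen("https://www.12306.cn", context=context)
print(response.getcode())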
Handler processors
The plain urlopen() call does not support proxies, cookies, and similar features, which is what Handler processors are for.
- Adding a proxy
from urllib.request import *
# proxy switch
proxySwitch = True
# the user:password@ part is optional
httpProxy = ProxyHandler({"http": "http://user:password@112.95.224.58:8118"})
# handler object with no proxy
nullProxy = ProxyHandler({})
if proxySwitch:
    opener = build_opener(httpProxy)
else:
    opener = build_opener(nullProxy)
# install a global opener: every later urlopen() call goes through it, Handlers included
install_opener(opener)
request = Request("http://www.baidu.com")
response = urlopen(request)  # without install_opener(), send with response = opener.open(request) instead
print(response.read())
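If the proxy requires authentication, urllib also has a dedicated handler for it instead of embedding the credentials in the proxy URL. A minimal sketch, with placeholder credentials and proxy address:
from urllib.request import (HTTPPasswordMgrWithDefaultRealm, ProxyBasicAuthHandler,
                            ProxyHandler, build_opener)
proxy = "112.95.224.58:8118"  # placeholder proxy address
password_mgr = HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, proxy, "user", "password")  # placeholder credentials
opener = build_opener(ProxyHandler({"http": proxy}),
                      ProxyBasicAuthHandler(password_mgr))
response = opener.open("http://www.baidu.com")
print(response.read())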
Simulating a browser login with CookieJar
from http.cookiejar import CookieJar
from urllib.parse import urlencode
from urllib.request import *
cookie = CookieJar()
handler = HTTPCookieProcessor(cookie)
opener = build_opener(handler)
# add a default header to every request sent through this opener
opener.addheaders = [("User-Agent", "xxx")]
url = "xxxxx"
# login username and password
data = {"username": "xxx", "password": "xxx"}
data = urlencode(data).encode("utf-8")  # POST data must be bytes in Python 3
request = Request(url, data=data)
response = opener.open(request)
print(response.read())
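Since the opener now holds the session cookies, it can be reused for any page that requires the login. To persist the session across runs, MozillaCookieJar can save cookies to disk and load them back. A minimal sketch with a placeholder filename:
from http.cookiejar import MozillaCookieJar
from urllib.request import HTTPCookieProcessor, build_opener
cookie = MozillaCookieJar("cookies.txt")  # placeholder filename
opener = build_opener(HTTPCookieProcessor(cookie))
# ... log in through opener as above, then write the cookies to disk:
cookie.save(ignore_discard=True, ignore_expires=True)
# in a later run, restore the session without logging in again:
cookie.load("cookies.txt", ignore_discard=True, ignore_expires=True)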