Python Crawler: the urllib Module
Python crawler exercises with the urllib module
1. Fetching the Baidu homepage
Flow: a. set the request URL b. set the request timeout c. get the response (and decode it)
'''
Fetch the Baidu homepage
'''
import urllib
from urllib import request  # urllib.request plays the role of Python 2's urllib2
'''
urlopen parameters:
url: request URL
data=None: GET request; when data is not None the request becomes a POST
timeout: request timeout in seconds
'''
# Fetch the response
try:
    # Set the timeout to 1 second; exceeding it raises an error
    response = urllib.request.urlopen('http://www.baidu.com', timeout=1)
    print(type(response))
    print(response.read().decode('utf-8'))
except Exception as e:
    print(e)
    # On a timeout this prints: <urlopen error timed out>
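The bare except Exception above treats every failure the same way. A minimal sketch, assuming the same URL, that tells a timeout apart from other connection problems:

import socket
import urllib
from urllib import request, error

try:
    response = urllib.request.urlopen('http://www.baidu.com', timeout=1)
    print(response.read().decode('utf-8'))
except urllib.error.URLError as e:
    # The <urlopen error timed out> case: e.reason is a socket.timeout
    if isinstance(e.reason, socket.timeout):
        print('request timed out')
    else:
        print('URL error:', e.reason)
except socket.timeout:
    # A timeout during read() can also surface directly as socket.timeout
    print('read timed out')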
2. Simulating the Baidu search API
Flow: a. URL-encode the keyword b. set the request URL c. build the request (URL plus headers) d. get the response (and decode it)
'''
Simulate the Baidu search API
'''
import urllib
from urllib import request, parse

def baiduAPI(kw):
    '''
    :param kw: search keyword
    :return: the response body as HTML
    '''
    # a. URL-encode the keyword
    # urlencode turns a dict into a "key1=v1&key2=v2" query string,
    # i.e. the parameters carried in the URL
    wd = urllib.parse.urlencode({"wd": kw}, encoding="utf-8", errors="ignore")
    # To decode again:
    # kw = urllib.parse.unquote(wd, encoding="utf-8", errors="ignore")
    # b. set the request URL ("wd=..." is already in the encoded string)
    url = "http://www.baidu.com/s?" + wd
    # Set the request headers
    header = {
        "User-Agent": '''Mozilla/5.0 (Windows NT 10.0; Win64; x64)
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'''
    }
    # c. build the request
    req = urllib.request.Request(url, headers=header)
    # d. get the response
    response = urllib.request.urlopen(req)
    # decode the response
    return response.read().decode("utf-8")

if __name__ == '__main__':
    kw = input("Enter a search keyword: ")
    response = baiduAPI(kw)
    print(response)
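When there is only one query parameter, urllib.parse.quote does the same escaping without building a dict; a small sketch of the alternative:

# quote() escapes just the value, so the key is written by hand
wd = urllib.parse.quote("爬虫教程")
url = "http://www.baidu.com/s?wd=" + wd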
Note: write the URL in full, including the scheme:
http://www.baidu.com/
https://www.baidu.com
Otherwise the request may fail with:
# incomplete request URL
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 400: Bad Request
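A simple pre-flight check catches this class of mistake before the request is sent; a sketch using urllib.parse.urlsplit (hasScheme is just an illustrative helper):

from urllib.parse import urlsplit

def hasScheme(url):
    # "www.baidu.com" has no scheme; "http://www.baidu.com" does
    return urlsplit(url).scheme in ("http", "https")

print(hasScheme("www.baidu.com"))         # False
print(hasScheme("http://www.baidu.com"))  # True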
3. Scraping Douban movies
Flow: a. build the URL dynamically b. build the request (URL plus headers) c. get the response (JSON data) d. parse the JSON e. write the data to a file
Open Douban movies at https://movie.douban.com/ and click "分类" (categories). Press F12 (or right-click and choose Inspect) to open DevTools, go to the Network tab, select XHR, and refresh the page. Inspect the responses to find the data URL, then scroll down, click "获取更多" (load more), and compare the URL of the new request against the first one.
import json
import urllib
from urllib import request

# Page one:
# https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start=0
# Page two:
# https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start=20

# a. build the URL dynamically b. build the request (URL plus headers)
# c. get the response (JSON data) d. parse the JSON e. write the data to a file
def getDouBanMovieInfo(pageRange):
    '''
    Fetch Douban movie info
    :param pageRange: number of pages to fetch
    :return: None; the parsed data is written to movie.txt
    '''
    # a. build the URL dynamically
    for i in range(pageRange):
        # 20 records per page
        url = "https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start=%d" % (i * 20)
        # Set the request headers to look like a browser
        header = {
            "User-Agent": '''Mozilla/5.0 (Windows NT 10.0; Win64; x64)
            AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'''
        }
        # b. build the request (URL plus headers)
        req = urllib.request.Request(url, headers=header)
        # c. get the response (JSON data)
        response = urllib.request.urlopen(req).read().decode("utf-8")
        # d. parse the JSON
        mdata = json.loads(response)
        data = mdata["data"]
        # print(data)
        for movie in data:  # don't reuse the outer loop variable i here
            # cast
            casts = movie["casts"]
            # directors
            directors = movie["directors"]
            # title
            title = movie["title"]
            # print(directors)
            # e. write the data to a file
            with open("movie.txt", "a+", encoding="utf-8", errors="ignore") as f:
                item = "Movie: " + title + " Directors: " + ",".join(directors) + " Cast: " + ",".join(casts)
                f.write(item + "\n")
                f.flush()

if __name__ == '__main__':
    pageRange = int(input("Enter the number of pages to fetch: "))
    getDouBanMovieInfo(pageRange)
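One failed page currently aborts the whole crawl. A hedged variant of the fetch step, reusing the try/except pattern from section 1, skips a bad page and moves on:

# Inside the for-loop, replace the bare urlopen call with:
try:
    response = urllib.request.urlopen(req, timeout=5).read().decode("utf-8")
except Exception as e:
    print("page %d failed: %s" % (i, e))
    continue  # skip this page and fetch the next one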
4. Alibaba job listings
Flow:
1. Get the total number of result pages for a keyword:
a. set the request URL (the seed) b. set the form data to submit c. get the response (JSON data) d. parse the JSON e. return the data
2. Get the job listings themselves:
a. set the request URL b. set the form data to submit c. build the request d. get the response (JSON data) e. parse the JSON f. return the data
import json
import urllib
from urllib import request, parse

def getTotalPage(kw):
    '''
    Get the total number of result pages for a keyword
    :param kw: search keyword
    :return: int, total number of pages
    '''
    # a. set the request URL (the seed)
    url = "https://job.alibaba.com/zhaopin/socialPositionList/doList.json"
    # Set the request headers
    header = {
        "User-Agent": '''Mozilla/5.0 (Windows NT 10.0; Win64; x64)
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'''
    }
    # b. set the form data to submit
    data = {
        "pageSize": "10",
        "t": "0.6069253832315697",
        "keyWord": kw,
        "location": "",
        "second": "",
        "first": "",
        "pageIndex": "1",
    }
    # Encode the form data
    data = urllib.parse.urlencode(data).encode("utf-8")
    # c. build the request; passing data makes it a POST
    req = urllib.request.Request(url, data=data, headers=header)
    # get the response
    response = urllib.request.urlopen(req).read().decode("utf-8")
    # d. parse the JSON
    data = json.loads(response)
    print(data)
    return data["returnValue"]["totalPage"]

def getPosition(kw, totalPage):
    '''
    Get the job listings for a keyword
    :param kw: search keyword
    :param totalPage: total number of pages
    :return:
    '''
    # pageIndex starts at 1
    for i in range(1, totalPage + 1):
        # a. set the request URL
        url = "https://job.alibaba.com/zhaopin/socialPositionList/doList.json"
        # Set the request headers
        header = {
            "User-Agent": '''Mozilla/5.0 (Windows NT 10.0; Win64; x64)
            AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'''
        }
        # b. set the form data to submit
        data = {
            "pageSize": "10",
            "t": "0.6069253832315697",
            "keyWord": kw,
            "location": "",
            "second": "",
            "first": "",
            "pageIndex": i,
        }
        # Encode the form data
        data = urllib.parse.urlencode(data).encode("utf-8")
        # c. build the request; POST because data is passed
        req = urllib.request.Request(url, data=data, headers=header)
        # d. get the response
        response = urllib.request.urlopen(req).read().decode("utf-8")
        # e. parse the JSON
        data = json.loads(response)
        dataValue = data["returnValue"]["datas"]
        for job in dataValue:
            # position name
            name = job["name"]
            # # requirements
            # requirement = job["requirement"]
            # education
            degree = job["degree"]
            # # description
            # description = job["description"]
            # work experience
            workExperience = job["workExperience"]
            # print(name, requirement, degree, description, workExperience)
            with open("joblist.txt", "a+", encoding="utf-8") as f:
                item = "Position: " + name + " Education: " + degree + " Experience: " + workExperience + "\n"
                f.write(item)
                f.flush()

if __name__ == '__main__':
    # keywords to search for
    jobList = ["python", "java"]
    # get the total page count, then the listings
    for job in jobList:
        totalPage = getTotalPage(job)
        getPosition(job, totalPage)
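The hard-coded "t" value was copied from one captured request and looks like a client-side anti-cache random number. Assuming that is all it is (not confirmed by the site), a fresh value per request can be generated like this:

import random

# Assumption: "t" is only an anti-cache token. Set this on the dict before
# the urlencode call; if the API ever rejects generated values, fall back
# to the captured constant above.
data["t"] = str(random.random())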
5. 12306
Flow:
a. create an SSL context that skips certificate verification b. build the request (URL plus headers) c. get the response (and decode it)
import urllib
from urllib import request
import ssl

# a. skip certificate verification b. build the request (URL plus headers) c. get and decode the response
# Create an SSL context that skips certificate verification
context = ssl._create_unverified_context()
url = "http://www.12306.cn/mormhweb/"
# Set the request headers to look like a browser
header = {
    "User-Agent": '''Mozilla/5.0 (Windows NT 10.0; Win64; x64)
    AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'''
}
req = urllib.request.Request(url, headers=header)
response = urllib.request.urlopen(req, context=context).read().decode("utf-8")
print(response)
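ssl._create_unverified_context() is a private helper; the public API reaches the same no-verification context explicitly, as in this sketch:

import ssl

# Build a default context, then switch verification off
# (check_hostname must be disabled before verify_mode is relaxed)
context = ssl.create_default_context()
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE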
6. Getting the job count from 51job
Flow:
a. set up the regex b. percent-encode the job keyword (UTF-8) c. build the URL d. build the request (URL plus headers) e. get the response (and decode it)
# Get the number of job postings
'''
The target markup on the results page; the regex below matches the
Chinese text "共3500条职位" ("3500 positions in total"):
<div class="rt">
共3500条职位
</div>
'''
import urllib
from urllib import request, parse
import re

# a. set up the regex b. build the request (URL plus headers) c. get and decode the response
# Regex: captures the number out of "共3500条职位"
jobNumRe = r"共(\d+)条职位"

def getJobNum(job):
    '''
    Get the number of postings for a job keyword
    :param job: job keyword
    :return: list of matched count strings
    '''
    # Percent-encode the keyword so non-ASCII terms survive in the URL
    job = urllib.parse.quote(job)
    url = "https://search.51job.com/list/030200,000000,0000,00,9,99," + job + ",2,1.html"
    # Set the request headers to look like a browser
    header = {
        "User-Agent": '''Mozilla/5.0 (Windows NT 10.0; Win64; x64)
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'''
    }
    req = urllib.request.Request(url, headers=header)
    # 51job pages are GBK-encoded
    response = urllib.request.urlopen(req).read().decode("gbk")
    jobNum = re.findall(jobNumRe, response)
    return jobNum

if __name__ == '__main__':
    # "项目经理" (project manager) exercises the non-ASCII path
    jobList = ["python", "java", "项目经理"]
    for job in jobList:
        print(getJobNum(job))
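re.findall returns a list of captured strings, for example ['3500'], or an empty list when nothing matches (say, after a page-layout change). A small wrapper (jobCount is an illustrative name) turns that into a plain int:

def jobCount(job):
    # ['3500'] on a hit, [] on a miss
    matches = getJobNum(job)
    return int(matches[0]) if matches else 0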
7. Simulating Youdao Translate
import json
import urllib
from urllib import request, parse

def getTranslate(kw):
    '''
    Simulate Youdao Translate
    :param kw: text to translate
    :return: list of translated sentences
    '''
    # Set the request headers
    header = {
        "User-Agent": '''Mozilla/5.0 (Windows NT 10.0; Win64; x64)
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'''
    }
    # Set the form data (salt/sign are values captured from a browser request)
    data = {
        "i": kw,
        "from": "AUTO",
        "to": "AUTO",
        "smartresult": "dict",
        "client": "fanyideskweb",
        "salt": "1530692157778",
        "sign": "86981073c4755432afabd9680e3127ab",
        "doctype": "json",
        "version": "2.1",
        "keyfrom": "fanyi.web",
        "action": "FY_BY_REALTIME",
        "typoResult": "false",
    }
    data = urllib.parse.urlencode(data).encode("utf-8")
    # url = "http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule"
    # The translate_o endpoint above answers {"errorCode":50},
    # so use the plain translate endpoint instead
    url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
    # Build the request; passing data makes it a POST
    req = urllib.request.Request(url, data=data, headers=header)
    response = urllib.request.urlopen(req)
    # Parse the JSON body (json.loads, not eval, for untrusted input)
    retList = json.loads(response.read().decode("utf-8"))
    # print(retList["translateResult"])
    retL = []
    for ret in retList["translateResult"]:
        # print(ret[0]["tgt"])
        retL.append(ret[0]["tgt"])
    return retL

if __name__ == '__main__':
    kw = input("Enter the text to translate: ")
    response = getTranslate(kw)
    print(response)
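For reference, the response body is JSON shaped roughly as follows (the shape was observed at the time of writing and may change):

# {
#   "type": "ZH_CN2EN",
#   "errorCode": 0,
#   "translateResult": [[{"src": "你好", "tgt": "hello"}]]
# }
# retList["translateResult"] is a list of sentences; each sentence is a
# list whose first element holds the source ("src") and target ("tgt") text.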