Web Scraping, Part 2: Advanced urllib
Customizing the Request object (a workaround for User-Agent based anti-scraping)
import urllib.request

url = 'https://www.baidu.com'

# Anatomy of a URL:
# https://www.baidu.com/s?wd=周杰伦
# http/https   www.baidu.com   80/443   s      wd=周杰伦
# protocol     host            port     path   params    anchor
#
# Common default ports:
# http      80
# https     443
# mysql     3306
# oracle    1521
# redis     6379
# mongodb   27017

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

# urlopen() cannot accept a dict, so the headers cannot be passed to it directly.
# Instead, build a customized Request object.
request = urllib.request.Request(url=url, headers=headers)

response = urllib.request.urlopen(request)

content = response.read().decode('utf-8')

print(content)
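As a small supplement to the example above (not part of the original), the response object returned by urlopen also exposes a few inspection helpers that are handy while debugging:

import urllib.request

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
request = urllib.request.Request(url='https://www.baidu.com', headers=headers)
response = urllib.request.urlopen(request)

print(response.getcode())     # HTTP status code, e.g. 200
print(response.geturl())      # the URL that was actually fetched (after redirects)
print(response.getheaders())  # list of (name, value) tuples for the response headers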
Encoding and decoding: the quote method for urllib GET requests
urllib.parse.quote(): percent-encodes a string (URL encoding) so it can be placed inside a URL.
# https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6
# Goal: fetch the page source of https://www.baidu.com/s?wd=周杰伦

import urllib.request
import urllib.parse

url = 'https://www.baidu.com/s?wd='

# Customizing the Request object is the first line of defence against anti-scraping checks.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

# Percent-encode the characters 周杰伦 with urllib.parse
name = urllib.parse.quote('周杰伦')
url = url + name

# Customize the Request object
request = urllib.request.Request(url=url, headers=headers)

# Simulate a browser sending a request to the server
response = urllib.request.urlopen(request)

# Read the response body
content = response.read().decode('utf-8')

# Print the data
print(content)
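For reference, this is what quote produces for the search term; the output matches the %E5%91%A8%E6%9D%B0%E4%BC%A6 seen in the URL above:

import urllib.parse

print(urllib.parse.quote('周杰伦'))
# %E5%91%A8%E6%9D%B0%E4%BC%A6  (the UTF-8 bytes, percent-encoded)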
Encoding and decoding: the urlencode method for urllib GET requests (multiple parameters)
urllib.parse.urlencode(data): data is a dict of parameter names and values.
# Use case for urlencode: multiple query parameters
# https://www.baidu.com/s?wd=周杰伦&sex=男

# import urllib.parse
#
# data = {
#     'wd': '周杰伦',
#     'sex': '男',
#     'location': '中国台湾省'
# }
#
# a = urllib.parse.urlencode(data)
# print(a)

# Goal: fetch the page source of
# https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7

import urllib.request
import urllib.parse

base_url = 'https://www.baidu.com/s?'

data = {
    'wd': '周杰伦',
    'sex': '男',
    'location': '中国台湾省'
}

new_data = urllib.parse.urlencode(data)

# Full request path
url = base_url + new_data

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

# Customize the Request object
request = urllib.request.Request(url=url, headers=headers)

# Simulate a browser sending a request to the server
response = urllib.request.urlopen(request)

# Read the page source
content = response.read().decode('utf-8')

# Print the data
print(content)
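For reference, urlencode turns the whole dict into a ready-made query string (output shown as a comment; it matches the wd=...&sex=... URL above):

import urllib.parse

print(urllib.parse.urlencode({'wd': '周杰伦', 'sex': '男'}))
# wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7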
POST request: Baidu Translate
The URL here has no '?' query string. The POST body must be encoded with data = urllib.parse.urlencode(data).encode('utf-8'); the parameters are passed via the data argument of the Request object rather than being appended to the URL. Note that after urlencode you still have to call encode('utf-8') to turn the string into bytes.
# POST request

import urllib.request
import urllib.parse
import json

url = 'https://fanyi.baidu.com/sug'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

data = {
    'kw': 'spider'
}

# The parameters of a POST request must be encoded
data = urllib.parse.urlencode(data).encode('utf-8')

# POST parameters are not appended to the URL; they go into the data argument
# when customizing the Request object.
request = urllib.request.Request(url=url, data=data, headers=headers)

# Simulate a browser sending a request to the server
response = urllib.request.urlopen(request)

# Read the response body
content = response.read().decode('utf-8')

# String --> JSON object
obj = json.loads(content)
print(obj)

# Summary:
# - POST parameters must be encoded:            data = urllib.parse.urlencode(data)
# - After urlencode, encode() must be called:   data = urllib.parse.urlencode(data).encode('utf-8')
# - The parameters go into the data argument:   request = urllib.request.Request(url=url, data=data, headers=headers)
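If the parsed object is serialized back to JSON (for example, to save it), json.dumps escapes non-ASCII characters by default; ensure_ascii=False keeps the Chinese readable. A small hedged sketch, where the string below is only an illustration of the shape the sug endpoint tends to return and may differ from the real payload:

import json

# Illustrative response shape only; the real sug payload may differ
content = '{"errno": 0, "data": [{"k": "spider", "v": "n. 蜘蛛"}]}'
obj = json.loads(content)

# ensure_ascii=False keeps Chinese characters readable instead of \uXXXX escapes
print(json.dumps(obj, ensure_ascii=False, indent=2))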
POST request: Baidu Translate detailed translation (v2transapi)
Working around the anti-scraping check: of all the request headers, the Cookie is the one this endpoint actually requires; the rest can stay commented out.
import urllib.request
import urllib.parse

url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'

headers = {
    # 'Accept': '*/*',
    # 'Accept-Encoding': 'gzip, deflate, br',
    # 'Accept-Language': 'zh-CN,zh;q=0.9',
    # 'Connection': 'keep-alive',
    # 'Content-Length': '135',
    # 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'BIDUPSID=DAA8F9F0BD801A2929D96D69CF7EBF50; PSTM=1597202227; BAIDUID=DAA8F9F0BD801A29B2813502000BF8E9:SL=0:NR=10:FG=1; __yjs_duid=1_c19765bd685fa6fa12c2853fc392f8db1618999058029; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; BDUSS=R2bEZvTjFCNHQxdUV-cTZ-MzZrSGxhbUYwSkRkUWk2SkxxS3E2M2lqaFRLUlJoRVFBQUFBJCQAAAAAAAAAAAEAAAA3e~BTveK-9sHLZGF5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFOc7GBTnOxgaW; BDUSS_BFESS=R2bEZvTjFCNHQxdUV-cTZ-MzZrSGxhbUYwSkRkUWk2SkxxS3E2M2lqaFRLUlJoRVFBQUFBJCQAAAAAAAAAAAEAAAA3e~BTveK-9sHLZGF5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFOc7GBTnOxgaW; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID_BFESS=DAA8F9F0BD801A29B2813502000BF8E9:SL=0:NR=10:FG=1; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; PSINO=2; H_PS_PSSID=34435_31660_34405_34004_34073_34092_26350_34426_34323_22158_34390; delPer=1; BA_HECTOR=8185a12020018421b61gi6ka20q; BCLID=10943521300863382545; BDSFRCVID=boDOJexroG0YyvRHKn7hh7zlD_weG7bTDYLEOwXPsp3LGJLVJeC6EG0Pts1-dEu-EHtdogKK0mOTHv8F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tR3aQ5rtKRTffjrnhPF3-44vXP6-hnjy3bRkX4Q4Wpv_Mnndjn6SQh4Wbttf5q3RymJ42-39LPO2hpRjyxv4y4Ldj4oxJpOJ-bCL0p5aHl51fbbvbURvD-ug3-7qqU5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIE3-oJqC8hMIt43f; BCLID_BFESS=10943521300863382545; BDSFRCVID_BFESS=boDOJexroG0YyvRHKn7hh7zlD_weG7bTDYLEOwXPsp3LGJLVJeC6EG0Pts1-dEu-EHtdogKK0mOTHv8F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tR3aQ5rtKRTffjrnhPF3-44vXP6-hnjy3bRkX4Q4Wpv_Mnndjn6SQh4Wbttf5q3RymJ42-39LPO2hpRjyxv4y4Ldj4oxJpOJ-bCL0p5aHl51fbbvbURvD-ug3-7qqU5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIE3-oJqC8hMIt43f; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1629701482,1629702031,1629702343,1629704515; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1629704515; __yjs_st=2_MDBkZDdkNzg4YzYyZGU2NTM5NzBjZmQ0OTZiMWRmZGUxM2QwYzkwZTc2NTZmMmIxNDJkYzk4NzU1ZDUzN2U3Yjc4ZTJmYjE1YTUzMTljYWFkMWUwYmVmZGEzNmZjN2FlY2M3NDAzOThhZTY5NzI0MjVkMmQ0NWU3MWE1YTJmNGE5NDBhYjVlOWY3MTFiMWNjYTVhYWI0YThlMDVjODBkNWU2NjMwMzY2MjFhZDNkMzVhNGMzMGZkMWY2NjU5YzkxMDk3NTEzODJiZWUyMjEyYTk5YzY4ODUyYzNjZTJjMGM5MzhhMWE5YjU3NTM3NWZiOWQxNmU3MDVkODExYzFjN183XzliY2RhYjgz; ab_sr=1.0.1_ZTc2ZDFkMTU5ZTM0ZTM4MWVlNDU2MGEzYTM4MzZiY2I2MDIxNzY1Nzc1OWZjZGNiZWRhYjU5ZjYwZmNjMTE2ZjIzNmQxMTdiMzIzYTgzZjVjMTY0ZjM1YjMwZTdjMjhiNDRmN2QzMjMwNWRhZmUxYTJjZjZhNTViMGM2ODFlYjE5YTlmMWRjZDAwZGFmMDY4ZTFlNGJiZjU5YzE1MGIxN2FiYTU3NDgzZmI4MDdhMDM5NTQ0MjQxNDBiNzdhMDdl',
    # 'Host': 'fanyi.baidu.com',
    # 'Origin': 'https://fanyi.baidu.com',
    # 'Referer': 'https://fanyi.baidu.com/?aldtype=16047',
    # 'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
    # 'sec-ch-ua-mobile': '?0',
    # 'Sec-Fetch-Dest': 'empty',
    # 'Sec-Fetch-Mode': 'cors',
    # 'Sec-Fetch-Site': 'same-origin',
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    # 'X-Requested-With': 'XMLHttpRequest',
}

data = {
    'from': 'en',
    'to': 'zh',
    'query': 'love',
    'transtype': 'realtime',
    'simple_means_flag': '3',
    'sign': '198772.518981',
    'token': '5483bfa652979b41f9c90d91f3de875d',
    'domain': 'common',
}

# POST parameters must be urlencoded, and encode() must be called afterwards
data = urllib.parse.urlencode(data).encode('utf-8')

# Customize the Request object
request = urllib.request.Request(url=url, data=data, headers=headers)

# Simulate a browser sending a request to the server
response = urllib.request.urlopen(request)

# Read the response body
content = response.read().decode('utf-8')

import json

obj = json.loads(content)
print(obj)
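The Cookie value above is copied by hand from the browser's developer tools and will eventually expire. As an alternative sketch, not part of the original tutorial, urllib can carry cookies automatically across requests with http.cookiejar; note that this alone does not produce the sign and token values, which this endpoint also checks:

import http.cookiejar
import urllib.request

# A cookie jar remembers cookies that earlier responses set
jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))

# Requests made through this opener automatically send the stored cookies back
response = opener.open('https://fanyi.baidu.com/')
print([cookie.name for cookie in jar])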
AJAX GET request: Douban movies, page 1
# GET request
# Fetch the first page of Douban movie data and save it to a file

import urllib.request

url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

# (1) Customize the Request object
request = urllib.request.Request(url=url, headers=headers)

# (2) Read the response data
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')

# (3) Save the data locally
# On Windows, open() defaults to the system encoding (gbk); to save Chinese
# characters correctly, pass encoding='utf-8'.
# fp = open('douban.json', 'w', encoding='utf-8')
# fp.write(content)

with open('douban1.json', 'w', encoding='utf-8') as fp:
    fp.write(content)
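To verify what was saved, the file can be loaded back with the json module. The field names used below ('title', 'score') are assumptions about what the top_list endpoint returns; inspect douban1.json to confirm them:

import json

with open('douban1.json', 'r', encoding='utf-8') as fp:
    movies = json.load(fp)  # the endpoint returns a JSON array of movie objects

for movie in movies:
    # 'title' and 'score' are assumed field names
    print(movie.get('title'), movie.get('score'))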
AJAX GET request: the first 10 pages of Douban movies
Work out the URL pattern from the AJAX pagination requests.
# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20
# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=20&limit=20
# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=40&limit=20
# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=60&limit=20

# page   1   2   3   4
# start  0  20  40  60
# start = (page - 1) * 20

# Download the first 10 pages of Douban movie data:
# (1) Customize the Request object
# (2) Read the response data
# (3) Save the data

import urllib.parse
import urllib.request


def create_request(page):
    base_url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&'

    data = {
        'start': (page - 1) * 20,
        'limit': 20
    }

    data = urllib.parse.urlencode(data)

    url = base_url + data

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
    }

    request = urllib.request.Request(url=url, headers=headers)
    return request


def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content


def down_load(page, content):
    with open('douban_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)


# Program entry point
if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))

    for page in range(start_page, end_page + 1):
        # Each page gets its own customized Request object
        request = create_request(page)
        # Read the response data
        content = get_content(request)
        # Save it
        down_load(page, content)
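A possible follow-up step (not in the original): once the loop has finished, the per-page files written by down_load can be merged into one list, assuming each saved page is a JSON array, which is what this endpoint returns:

import json


def merge_pages(start_page, end_page, out_file='douban_all.json'):
    all_movies = []
    for page in range(start_page, end_page + 1):
        # Each file holds the JSON array saved by down_load() for that page
        with open('douban_' + str(page) + '.json', 'r', encoding='utf-8') as fp:
            all_movies.extend(json.load(fp))
    with open(out_file, 'w', encoding='utf-8') as fp:
        json.dump(all_movies, fp, ensure_ascii=False)
    return all_movies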
AJAX POST request: the KFC official site (store list)
# Page 1
# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname
# POST
# cname: 北京
# pid:
# pageIndex: 1
# pageSize: 10

# Page 2
# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname
# POST
# cname: 北京
# pid:
# pageIndex: 2
# pageSize: 10

import urllib.request
import urllib.parse


def create_request(page):
    base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'

    data = {
        'cname': '北京',
        'pid': '',
        'pageIndex': page,
        'pageSize': '10'
    }

    # POST parameters must be urlencoded and then encoded to bytes
    data = urllib.parse.urlencode(data).encode('utf-8')

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
    }

    request = urllib.request.Request(url=base_url, headers=headers, data=data)
    return request


def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content


def down_load(page, content):
    with open('kfc_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)


if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))

    for page in range(start_page, end_page + 1):
        # Customize the Request object
        request = create_request(page)
        # Read the response body
        content = get_content(request)
        # Save it
        down_load(page, content)
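When looping over many pages like this (and in the Douban example above), it is considerate to pause between requests so the server is not hammered. A hedged variant of the main loop, reusing the functions defined in the script above:

import time

if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))

    for page in range(start_page, end_page + 1):
        request = create_request(page)   # defined in the script above
        content = get_content(request)
        down_load(page, content)
        time.sleep(1)                    # wait one second between pages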