2019.7.23 学习笔记
设置随机请求头(User-Agent)时,可以在 www.useragentstring.com 上查到各种浏览器的 User-Agent 字符串。
middlewares.py设置如下:
import random


class UseragentRandomchangeDownloaderMiddleware(object):
    """Downloader middleware that puts a random User-Agent on every request.

    For each outgoing request one entry of ``USER_AGENTS`` is chosen at
    random and written into the request's ``User-Agent`` header.
    """

    # Pool of browser User-Agent strings the middleware chooses from.
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36',
    ]

    def process_request(self, request, spider):
        """Overwrite the request's ``User-Agent`` header with a random one.

        Args:
            request: The outgoing request; its ``headers`` mapping is
                modified in place.
            spider: The spider that issued the request (not used here).

        Returns:
            None. In Scrapy's downloader-middleware contract this means
            the request carries on through the remaining middlewares.
        """
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
httpbin.py(爬虫文件)设置如下:
import json

import scrapy


class HttpbinSpider(scrapy.Spider):
    """Spider that repeatedly fetches httpbin.org/user-agent and prints the result.

    Each response is parsed as JSON and its ``user-agent`` field is
    printed, which makes it easy to see which User-Agent header each
    request was actually sent with.
    """

    name = 'httpbin'
    allowed_domains = ['httpbin.org']
    start_urls = ['http://httpbin.org/user-agent']

    def parse(self, response):
        """Print the ``user-agent`` value from the response, then re-request the URL.

        Args:
            response: The downloaded response; its text must be a JSON
                object containing a ``user-agent`` key.

        Yields:
            A new request for the first start URL.
        """
        user_agent = json.loads(response.text)['user-agent']
        print('*' * 20)
        print(user_agent)
        print('*' * 20)
        # The same URL is requested again on every callback, so dont_filter=True
        # is needed to keep Scrapy's duplicate filter from dropping it.
        # NOTE: there is no stop condition here; the crawl keeps re-queuing
        # itself until it is stopped externally.
        yield scrapy.Request(self.start_urls[0], dont_filter=True)
设置随机开放 IP 代理的方法与上面设置随机请求头类似:在下载中间件的 process_request 中随机选择一个代理,并赋值给 request.meta['proxy']。
设置独享 IP 代理(需要账号和密码认证),middlewares.py 设置如下:
import base64
import random

from scrapy import signals


class IPProxyRandomchangeDownloaderMiddleware(object):
    """Downloader middleware that routes requests through an authenticated proxy.

    The proxy address and the ``user:password`` credentials are hard-coded
    placeholders inside ``process_request`` and must be replaced with real
    values before use.
    """

    def process_request(self, request, spider):
        """Attach the proxy address and a Basic ``Proxy-Authorization`` header.

        Args:
            request: The outgoing request; its ``meta`` and ``headers``
                mappings are modified in place.
            spider: The spider that issued the request (not used here).

        Returns:
            None.
        """
        # Placeholders: proxy "ip:port" and credentials "account:password".
        # NOTE(review): Scrapy's docs show proxy values as full URLs such as
        # "http://host:port" -- confirm the real value includes the scheme.
        proxy = 'ip地址:端口号'
        user_password = "账号:密码"
        request.meta['proxy'] = proxy
        # base64.b64encode works on bytes, so the str is encoded first and
        # the encoded result is decoded back to str for concatenation.
        b64_user_password = base64.b64encode(user_password.encode('utf-8'))
        request.headers['Proxy-Authorization'] = 'Basic ' + b64_user_password.decode('utf-8')
json.load 与 json.loads 的区别:json.loads 把 JSON 格式的字符串解析为 Python 对象,json.load 则从文件对象中读取并解析 JSON。示例如下:
import json

jsonStr = '{"name":"aspiring", "age": 17, "hobby": ["money","power", "read"],"parames":{"a":1,"b":2}}'

# json.loads: convert a JSON-formatted string into Python objects.
jsonData = json.loads(jsonStr)
print(jsonData)
print(type(jsonData))
print(jsonData['hobby'])

# json.load: read and parse JSON from an open file object.
# The path below is a placeholder; point it at a real .json file.
path1 = r'E:\***\ddd.json'
with open(path1, 'rb') as f:
    data = json.load(f)
print(data)
# Prints dict when the file's top-level JSON value is an object.
print(type(data))