2019 7.23学习笔记

设置随机请求头需要用到的网站 www.useragentstring.com

middlewares.py设置如下:

class UseragentRandomchangeDownloaderMiddleware(object):
    """Downloader middleware that puts a randomly chosen User-Agent on each request.

    The candidate strings live in ``USER_AGENTS``; one of them is drawn
    with ``random.choice`` every time ``process_request`` runs.
    """

    # Pool of browser User-Agent strings to pick from.
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36',
    ]

    def process_request(self, request, spider):
        """Overwrite the request's ``User-Agent`` header with a random pool entry.

        Args:
            request: Object exposing a mutable ``headers`` mapping.
            spider: Unused here; kept for the middleware hook signature.

        Returns:
            None (the request is modified in place).
        """
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)

httpbin.py设置如下:

import scrapy
import json

class HttpbinSpider(scrapy.Spider):
    """Spider that fetches httpbin's user-agent endpoint and prints the reported value.

    Each parsed response schedules another request to the same URL, so
    the spider keeps printing the User-Agent until it is stopped.
    """

    name = 'httpbin'
    allowed_domains = ['httpbin.org']
    start_urls = ['http://httpbin.org/user-agent']

    def parse(self, response):
        """Print the ``user-agent`` field of the JSON response, then re-request.

        Args:
            response: Response whose ``text`` is a JSON document containing
                a ``user-agent`` key.

        Yields:
            A new ``scrapy.Request`` for the first start URL.
        """
        payload = json.loads(response.text)
        reported_agent = payload['user-agent']
        banner = '*' * 20
        print(banner)
        print(reported_agent)
        print(banner)
        # dont_filter=True: the URL was already requested, so the request is
        # explicitly exempted from duplicate filtering.
        yield scrapy.Request(self.start_urls[0], dont_filter=True)

设置随机开放ip代理的方法与上面类似

设置独享ip代理

from scrapy import signals
import random
import base64

class IPProxyRandomchangeDownloaderMiddleware(object):
    """Downloader middleware that routes requests through one authenticated proxy.

    The proxy address and the ``user:password`` pair are hard-coded
    placeholders that must be replaced with real values.
    """

    def process_request(self, request, spider):
        """Set the proxy and its Basic ``Proxy-Authorization`` header on the request.

        Args:
            request: Object exposing mutable ``meta`` and ``headers`` mappings.
            spider: Unused here; kept for the middleware hook signature.

        Returns:
            None (the request is modified in place).
        """
        credentials = "账号:密码"
        request.meta['proxy'] = 'ip地址:端口号'
        # b64encode works on bytes and returns bytes, so encode first and
        # decode the result back to str before building the header value.
        token = base64.b64encode(credentials.encode('utf-8')).decode('utf-8')
        request.headers['Proxy-Authorization'] = 'Basic {}'.format(token)

json.load 与 json.loads 的区别: json.loads 用于解析 JSON 字符串, json.load 用于从文件对象中读取并解析 JSON。示例如下:

import json
 
# JSON text held in an ordinary Python string.
jsonStr = '{"name":"aspiring", "age": 17, "hobby": ["money","power", "read"],"parames":{"a":1,"b":2}}'

# json.loads: parse a JSON-formatted string into Python objects.
jsonData = json.loads(jsonStr)
print(jsonData)
print(type(jsonData))
hobby_list = jsonData['hobby']
print(hobby_list)

# json.load: parse JSON read from an open file object.
path1 = r'E:\***\ddd.json'

with open(path1, mode='rb') as f:
    data = json.load(f)
    print(data)
    # Show the Python type that the file's top-level JSON value became.
    print(type(data))

 

posted on 2019-07-23 06:59  Joker乔柯  阅读(170)  评论(0)  编辑  收藏  举报

导航