Scraping basics: mojibake fixes, Jupyter, urllib, requests, lxml, multiprocessing concurrency, session, BeautifulSoup

Fixing garbled (mojibake) output

requests.get().text decodes the response body using the encoding declared in the response headers; if that yields mojibake, grab the raw bytes via .content and decode them yourself.

res = res.encode('iso-8859-1').decode('gbk')  # when unsure which codec to use, try this; the charset is usually declared in the HTML header

r = requests.get("http://www.baidu.com")

# Option 1: decode the raw bytes yourself
html = str(r.content, 'utf-8')  # or: r.content.decode("utf-8", "ignore")

# Option 2: set the encoding before reading .text
r.encoding = 'utf-8'
html = r.text

Method 2 (use this if the above still fails):

# -*- coding: utf-8 -*-

import requests

req = requests.get("http://news.sina.com.cn/")

encode_content = req.text
if req.encoding == 'ISO-8859-1':
    # requests fell back to its default encoding; look for the real charset in the page itself
    encodings = requests.utils.get_encodings_from_content(req.text)
    if encodings:
        encoding = encodings[0]
    else:
        encoding = req.apparent_encoding

    encode_content = req.content.decode(encoding, 'replace')  # 'replace' substitutes ? for bytes the codec cannot decode


print(encode_content)

with open('test.html', 'w', encoding='utf-8') as f:
    f.write(encode_content)
Source: chaowanghn (CSDN), https://blog.csdn.net/chaowanghn/article/details/54889835

 

Jupyter shortcuts

  • Insert cell: a (above), b (below)
  • Delete cell: x
  • Run cell: Shift+Enter
  • Tab: code completion
  • Switch cell type: y (Markdown -> code), m (code -> Markdown)
  • Shift+Tab: show the docstring / help

Types of crawlers:

  • General-purpose: fetches entire pages indiscriminately, the way search engines do
  • Focused: extracts only the specific parts of a page you care about
  • Incremental: re-crawls a source but only processes content that has changed (see the sketch below)
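
A minimal sketch of the incremental idea, with placeholder URLs of my own: fingerprint each page body and skip anything whose fingerprint was already seen on an earlier pass.

import hashlib
import requests

seen = set()  # in a real crawler, persist this between runs (file / redis / db)

def crawl_if_changed(url):
    body = requests.get(url).content
    digest = hashlib.md5(body).hexdigest()  # fingerprint of the page content
    if digest in seen:
        return None  # unchanged since the last pass: skip re-processing
    seen.add(digest)
    return body  # new or updated content, hand it to the parser

for url in ['http://example.com/page1', 'http://example.com/page2']:
    crawl_if_changed(url)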

 

Saving a file without needing with open: urllib

import urllib.request
urllib.request.urlretrieve(url, 'a.jpg')  # downloads url straight to a.jpg

 

requests.get and requests.post

# GET with query parameters
import requests

for i in range(5):
    param = {
        'type': 'tv',
        'tag': '热门',
        'sort': 'recommend',
        'page_limit': 20,
        'page_start': i,
    }
    cont = requests.get(url, params=param).json()  # url defined elsewhere
    print(cont)


# POST with form data
url = 'https://fanyi.baidu.com/sug'
wd = input('enter a word:')
data = {
    'kw': wd
}
response = requests.post(url=url, data=data)
print(response.json())

 

The headers dict

headers = {
    'Connection': 'close',  # drop the connection as soon as the request completes (releases pool resources promptly)
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
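
These headers are then attached to each request; a minimal sketch (the url is a placeholder, and the UA string is abbreviated):

import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # abbreviated; use the full UA string from above
resp = requests.get('http://example.com', headers=headers)  # the server sees the spoofed User-Agent
print(resp.status_code)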

 

Reading the result of a requests call

resp = requests.get(url, params=param)
a = resp.text     # str, the decoded text
b = resp.content  # bytes (b'...')
c = resp.json()   # parsed JSON (dict / list)

 

etree

Sometimes you cannot find where an href lives in the DOM. Search the returned source for .mp4 (or whatever extension you expect), or for the href of the link you actually clicked; the value may also be built inside the page's JS.

from lxml import etree
import requests


text = requests.get(url, headers=headers).text  # fetch the page source (url / headers defined elsewhere)
tree = etree.HTML(text)  # instantiate the etree document
lis = tree.xpath('/html/body/div[5]/div[5]/div[1]/ul/li')  # XPath parse

for el in lis:
    # When parsing relative to an element, prefix the path with '.', otherwise it matches
    # from the document root again. xpath() returns a list; without text() it is a list
    # of elements, and you can index with [0] to get a single item.
    a = el.xpath('./div[2]/h2/a/text()')
    a = el.xpath('./div[3]//text()')  # // collects text from all descendants
    print(a)

for el in lis:
    a = el.xpath('./a/img/@src')[0]  # xpath() returns a list, so take [0]
    print(a)


res = requests.get(url, headers=headers)
res = res.text
res = res.encode('iso-8859-1').decode('gbk')  # when unsure which codec to use, try this; the charset is usually declared in the HTML header


# In one page's etree, collect nodes matching either of two XPath expressions (union with |)
li_list = tree.xpath('//div[@class="bottom"]/ul/li | //div[@class="bottom"]/ul/div[2]/li')

# Download a file
data = requests.get(url=download_url, headers=headers).content
fileName = name + '.rar'
with open(fileName, 'wb') as fp:
    fp.write(data)

 

Concurrency

from multiprocessing.dummy import Pool  # a thread pool, despite the module name

pool = Pool(5)  # 5 worker threads
pool.map(getvideo, lst)  # apply getvideo to every item of lst concurrently
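
A minimal end-to-end sketch; getvideo and lst are not defined above, so the URLs and the save-by-basename logic here are illustrative assumptions:

import requests
from multiprocessing.dummy import Pool  # thread-based Pool with the multiprocessing API

# hypothetical list of direct video URLs
lst = [
    'http://example.com/videos/a.mp4',
    'http://example.com/videos/b.mp4',
]

def getvideo(url):
    # download one video and save it under its basename
    data = requests.get(url).content
    name = url.rsplit('/', 1)[-1]
    with open(name, 'wb') as fp:
        fp.write(data)

pool = Pool(5)
pool.map(getvideo, lst)  # blocks until every download finishes
pool.close()
pool.join()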

 

Captcha recognition

Recognize captchas with the YunDaMa (云打码) cloud service:

import json, time, requests

######################################################################

class YDMHttp:

    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        self.username = username  
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def request(self, fields, files={}):
        # POST the fields (and any files) to the API and parse the JSON reply
        response = self.post_url(self.apiurl, fields, files)
        response = json.loads(response)
        return response
    
    def balance(self):
        data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['balance']
        else:
            return -9001
    
    def login(self):
        data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['uid']
        else:
            return -9001

    def upload(self, filename, codetype, timeout):
        data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
        file = {'file': filename}
        response = self.request(data, file)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['cid']
        else:
            return -9001

    def result(self, cid):
        data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)}
        response = self.request(data)
        return response and response['text'] or ''

    def decode(self, filename, codetype, timeout):
        # upload the image, then poll once per second until a result arrives or we time out
        cid = self.upload(filename, codetype, timeout)
        if (cid > 0):
            for i in range(0, timeout):
                result = self.result(cid)
                if (result != ''):
                    return cid, result
                else:
                    time.sleep(1)
            return -3003, ''
        else:
            return cid, ''

    def report(self, cid):
        data = {'method': 'report', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
        response = self.request(data)
        if (response):
            return response['ret']
        else:
            return -9001

    def post_url(self, url, fields, files={}):
        # open each file path and POST everything as multipart form data
        for key in files:
            files[key] = open(files[key], 'rb')
        res = requests.post(url, files=files, data=fields)
        return res.text

######################################################################

# username
username    = 'username'

# password
password    = 'password'

# app ID, a required parameter for the developer revenue share; get it from "My Software" in the developer console
appid       = 1

# app key, also required; found in the same place
appkey      = '22cc5376925e9387a23cf797cb9ba745'

# captcha image file
filename    = 'getimage.jpg'

# captcha type, e.g. 1004 = 4 alphanumeric characters; fill it in accurately or recognition suffers. All types: http://www.yundama.com/price.html
codetype    = 1004

# timeout, in seconds
timeout     = 60

# sanity check
if (username == 'username'):
    print('Set the parameters above before testing')
else:
    # initialize the client
    yundama = YDMHttp(username, password, appid, appkey)

    # log in to YunDaMa
    uid = yundama.login()
    print('uid: %s' % uid)

    # check the account balance
    balance = yundama.balance()
    print('balance: %s' % balance)

    # start recognition: image path, captcha type ID, timeout (seconds)
    cid, result = yundama.decode(filename, codetype, timeout)
    print('cid: %s, result: %s' % (cid, result))

######################################################################

 

session

session = requests.Session()
session.get(url, params=params)
session.post(login_url, data=data, headers=headers)

Compared with sending plain requests calls, a Session uses a bit more memory, but it stores the cookies it receives and sends them back on subsequent requests.
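
A minimal login-then-fetch sketch; the endpoint and form fields are hypothetical, the point being that the cookie set by the login response is sent automatically afterwards:

import requests

session = requests.Session()

# hypothetical login endpoint and form fields
session.post('http://example.com/login', data={'user': 'me', 'pwd': 'secret'})

# the session cookie from the login is attached automatically here
resp = session.get('http://example.com/profile')
print(resp.text)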

 

BeautifulSoup

soup = BeautifulSoup(content, 'lxml')

a_list = soup.select('.book-mulu > ul > li > a')  # CSS selector; returns a list of Tag objects

text = soup.find('div', class_='chapter_content').text  # .text collects text from the tag and all of its descendants
Usage:
    - Import: from bs4 import BeautifulSoup
    - Turn an HTML document into a BeautifulSoup object, then locate nodes through the object's methods and attributes
        (1) From a local file:
             - soup = BeautifulSoup(open('local file'), 'lxml')
        (2) From network content:
             - soup = BeautifulSoup(str_or_bytes, 'lxml')
        (3) Printing the soup object shows the document's HTML

Methods:
    (1) Look up by tag name
        - soup.a   returns only the first matching tag
    (2) Get attributes
        - soup.a.attrs  all of the tag's attributes and values, as a dict
        - soup.a.attrs['href']   the href attribute
        - soup.a['href']   shorthand for the same thing
    (3) Get text
        - soup.a.string
        - soup.a.text
        - soup.a.get_text()
       Note: if the tag contains nested tags, .string returns None, while the other two still return the text
    (4) find: the first tag that matches
        - soup.find('a')  first match
        - soup.find('a', title="xxx").text
        - soup.find('a', alt="xxx").contents
        - soup.find('a', class_="xxx")
        - soup.find('a', id="xxx")
    (5) find_all: every tag that matches
        - soup.find_all('a')
        - soup.find_all(['a','b'])  all a and b tags
        - soup.find_all('a', limit=2)  only the first two matches
    (6) Select with a CSS selector
        - select: soup.select('#feng')
        - common selectors: tag (a), class (.), id (#), hierarchy
            - hierarchy selectors:
                div .dudu #lala .meme .xixi   descendants at any depth
                div > p > a > .lala           direct children only
        Note: select always returns a list; index into it to pick out a specific Tag (see the demo below)
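
A self-contained demo of the calls above, run against a made-up inline HTML snippet:

from bs4 import BeautifulSoup

html = '''
<div class="book-mulu">
  <ul>
    <li><a href="/ch1" id="feng" title="one">Chapter 1</a></li>
    <li><a href="/ch2" title="two">Chapter 2</a></li>
  </ul>
</div>
'''

soup = BeautifulSoup(html, 'lxml')

print(soup.a['href'])                           # /ch1  (soup.a is the first <a> only)
print(soup.find('a', title='two').text)         # Chapter 2
print(len(soup.find_all('a')))                  # 2
print(soup.select('#feng')[0].get_text())       # Chapter 1 (select returns a list)
print(soup.select('.book-mulu > ul > li > a'))  # direct-child hierarchy selector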

 
