可微TopK算子
形式及推导
形式:前向计算如下所示,
\[\text{TopK}(\vec{x}, k) = \sigma(\vec{x}+\Delta(\vec{x}, k))
\]
注意标量偏移\(\Delta(\vec{x}, k)\)需满足约束条件\(\sum_{i} \sigma(x_i + \Delta(\vec{x}, k)) = k\)(即输出各分量之和恰为\(k\)),其中\(\sigma(x) = \frac{1}{1+\exp\{-x\}}\)。
梯度推导:
令\(f(\vec{x}, k) = \sigma(\vec{x}+\Delta(\vec{x}, k))\)
则
\[\frac{
\text{d} f(\vec{x}, k)_{i}
}{
\text{d} x_{j}
} = \frac{
\text{d} \sigma(x_{i}+\Delta(\vec{x}, k))
}{
\text{d} x_{j}
}
= \sigma'(x_i + \Delta(\vec{x})) \Big(
\mathbb{I}_{i=j} + \frac{\text{d}\Delta(\vec{x})}{\text{d}x_j}
\Big)
\]
难点在于如何计算\(\frac{\text{d}\Delta(\vec{x})}{\text{d}x_j}\)。
我们利用约束条件\(\sum_{i} \sigma(x_i + \Delta(\vec{x})) = k\),对其两边关于\(x_j\)求导来计算上述导数:
\[\frac{\text{d}k}{\text{d}x_j} = 0
= \sum_{i}\sigma'(x_i+\Delta(\vec{x}))\Big(
\mathbb{I}_{i=j} + \frac{\text{d}{\Delta(\vec{x})}}{\text{d}x_{j}}
\Big)
= \sigma'(x_{\textcolor{red}{j}}+\Delta(\vec{x}))
+ \frac{\text{d}{\Delta(\vec{x})}}{\text{d}x_{j}} \sum_{i}\sigma'(x_i+\Delta(\vec{x}))
\]
因此,我们可以得到:
\[\frac{\text{d}{\Delta(\vec{x})}}{\text{d}x_{j}} = \frac{
- \sigma'(x_{\textcolor{red}{j}} +\Delta(\vec{x}))
}{
\sum_{i}\sigma'(x_i+\Delta(\vec{x}))
}
\]
向量版本:如果令\(\vec{v} = \sigma'(\vec{x}+\Delta(\vec{x}))\),则雅可比矩阵为
\[J_{\text{TopK}}(\vec{x}) = \text{diag}(\vec{v}) - \frac{\vec{v}\vec{v}^{\top}}{\Vert\vec{v}\Vert_1}
\]
其他细节:如何求出满足\(\sum_{i} \sigma(x_i+\Delta(\vec{x}))=k\)的\(\Delta(\vec{x})\)?由于等式左端关于\(\Delta\)严格单调递增,解唯一,可以通过二分法快速找到。
实现
# %% differentiable top-k function
import torch
from torch.func import vmap, grad
from torch.autograd import Function
import torch.nn as nn
sigmoid = torch.sigmoid


def sigmoid_grad(x):
    """Return the elementwise derivative of the logistic sigmoid at ``x``.

    Uses the closed form ``sigmoid(x) * (1 - sigmoid(x))``. Unlike a nested
    ``vmap(vmap(grad(...)))`` construction, this accepts tensors of any
    shape (not only 2-D) and needs no per-element autograd transform.

    Args:
        x: Tensor of arbitrary shape.

    Returns:
        Tensor with the same shape as ``x`` holding ``sigmoid'(x)``.
    """
    s = sigmoid(x)
    return s * (1 - s)
class TopK(Function):
    """Custom autograd op for the relaxed top-k ``sigmoid(xs + t)``.

    ``t`` is the per-row shift returned by ``_find_ts``; the backward pass
    applies the closed-form Jacobian ``diag(v) - v v^T / sum(v)`` with
    ``v = sigmoid'(xs + t)`` instead of differentiating through the search.
    """

    @staticmethod
    def forward(ctx, xs, k):
        """Compute the relaxed top-k output for every row of ``xs``.

        Args:
            ctx: Autograd context used to stash tensors for ``backward``.
            xs: Input scores; ``_find_ts`` treats them as
                (batch_size, input_dim).
            k: Target value for each row's output sum (passed to
                ``_find_ts``).

        Returns:
            The second result of ``_find_ts`` (``sigmoid(xs + shift)``).
        """
        shifts, probs = _find_ts(xs, k)
        # Inputs and per-row shifts are all backward needs to rebuild v.
        ctx.save_for_backward(xs, shifts)
        return probs

    @staticmethod
    def backward(ctx, grad_output):
        """Return the vector-Jacobian product ``grad_output^T @ J``.

        Args:
            ctx: Autograd context populated by ``forward``.
            grad_output: Gradient of the loss w.r.t. the forward output.

        Returns:
            Tuple ``(grad_xs, None)``; ``k`` receives no gradient.
        """
        xs, shifts = ctx.saved_tensors
        # v = sigmoid'(x + t), one value per input element.
        slope = sigmoid_grad(xs + shifts)
        row_total = slope.sum(dim=1, keepdim=True)
        # diag(v) part of the Jacobian applied to grad_output.
        weighted = grad_output * slope
        # Rank-one part: (grad_output . v) * v / sum(v), subtracted below.
        coupling = weighted.sum(dim=1, keepdim=True) * slope / row_total
        return weighted - coupling, None
@torch.no_grad()
def _find_ts(xs, k):
# (batch_size, input_dim)
_, n = xs.shape
assert 0 < k < n
# Lo should be small enough that all sigmoids are in the 0 area.
# Similarly Hi is large enough that all are in their 1 area.
# (batch_size, 1)
lo = -xs.max(dim=1, keepdims=True).values - 10
hi = -xs.min(dim=1, keepdims=True).values + 10
for iteration in range(64):
mid = (hi + lo) / 2
subject = sigmoid(xs + mid).sum(dim=1)
mask = subject < k
lo[mask] = mid[mask]
hi[~mask] = mid[~mask]
ts = (lo + hi) / 2
return ts, sigmoid(xs + ts)
def test_check():
    """Print a sample forward pass, then gradcheck the op for k = 1..9."""
    from torch.autograd import gradcheck

    topk = TopK.apply

    # Forward sanity check on a small float32 batch; row sums are printed
    # so they can be compared against k = 2 by eye.
    xs = torch.randn(2, 10)
    ps = topk(xs, 2)
    print(f"{xs=}")
    print(f"{ps=}")
    print(f"{ps.sum(dim=1)=}")

    # gradcheck compares the custom backward with finite differences;
    # the probe is created in double precision.
    probe = torch.randn(20, 10, dtype=torch.double, requires_grad=True)
    for k in range(1, 10):
        passed = gradcheck(topk, (probe, k), eps=1e-6, atol=1e-4)
        print(k, passed)
def sgd_update():
    """Demo: gradient descent through the relaxed top-k operator.

    Builds a 0/1 target marking the k largest entries of each row of a random
    ``xs``, then runs 1000 plain gradient steps on the MSE between
    ``TopK(tau * xs, k)`` and that target, printing the output every 100
    iterations.
    """
    soft_topk = TopK.apply
    batch_size, k = 2, 2
    temperature = 10
    step_size = 1
    criterion = nn.MSELoss()

    xs = torch.randn(batch_size, 10, dtype=torch.double, requires_grad=True)

    # Hard top-k indicator: 1.0 at the k largest positions of every row.
    target = torch.zeros_like(xs)
    row_ids = torch.arange(batch_size)
    top_cols = torch.argsort(xs, descending=True)[:, :k]
    target[row_ids, top_cols.T] = 1.0
    print(f"{xs=}")
    print(f"{target=}")

    def fn(x):
        # Scale the scores by the temperature before the relaxed top-k.
        return soft_topk(x * temperature, k)

    for iteration in range(1, 1001):
        # Re-wrap the current values as a fresh leaf so .grad is populated.
        weights = nn.Parameter(data=xs, requires_grad=True)
        probs = fn(weights)
        loss = criterion(probs.view(-1), target.view(-1))
        loss.backward()
        xs = weights - step_size * weights.grad
        if iteration % 100 == 0:
            print(f"{iteration=} {fn(xs)=}")
if __name__ == "__main__":
    # Run the demo only when executed as a script (or interactive cell),
    # so importing this module does not trigger the optimisation loop.
    sgd_update()
相关资料
Differentiable top-k function - Stack Exchange
Softmax后传:寻找Top-K的光滑近似 - 科学空间