逻辑回归梯度下降推导
\[\begin{aligned}
P(y=1|x;\theta) & = h_\theta(x) = \frac{\exp(\theta^Tx)}{1+\exp(\theta^Tx)} \\
P(y=0|x;\theta) & = 1 - h_\theta(x) = \frac{1}{1+\exp(\theta^Tx)}
\end{aligned}
\]
\[\begin{aligned}
L(\theta) & = L(\theta;X,\vec{y}) = p(\vec{y}|X;\theta) \\
& =\prod_{i=1}^mp(y^{(i)}|x^{(i)};\theta) \\
& =\prod_{i=1}^m(h_\theta(x^{(i)}))^{y^{(i)}}(1-h_\theta(x^{(i)}))^{1-y^{(i)}}
\end{aligned}
\]
\[\begin{aligned}
l(\theta) & = \log L(\theta) \\
& = \sum_{i=1}^m \left[ y^{(i)}\log h(x^{(i)}) + (1-y^{(i)})\log (1-h(x^{(i)})) \right] \\
& = \sum_{i=1}^m \left[ y^{(i)}\log \frac{h(x^{(i)})}{1-h(x^{(i)})} + \log (1-h(x^{(i)})) \right] \\
& = \sum_{i=1}^m \left[ y^{(i)}\theta^Tx^{(i)} + \log (1-h(x^{(i)})) \right]
\end{aligned}
\]
\[\frac{\partial}{\partial\theta_j}l(\theta) = \frac{\partial}{\partial\theta_j}y\theta^Tx + \frac{\partial}{\partial\theta_j}\log(1-h(x))
\]
\[\frac{\partial}{\partial\theta_j}y\theta^Tx = yx_j
\]
\[\begin{aligned}
\frac{\partial}{\partial\theta_j}\log(1-h(x)) & = \frac{1}{1-h(x)}*\frac{\partial}{\partial\theta_j}(1-h(x)) \\
& = \frac{1}{1-h(x)}*(-1)\frac{\partial}{\partial\theta_j}h(x) \\
& = \frac{1}{1-h(x)}*(-1)\frac{\partial}{\partial\theta_j} \frac{g(\theta^Tx)}{1+g(\theta^Tx)} \qquad (g(z)=\exp(z)) \\
& = \frac{1}{1-h(x)}*(-1)\frac{g(\theta^Tx)}{(1+g(\theta^Tx))^2}x_j
\end{aligned}
\]
\[\begin{aligned}
\frac{\partial}{\partial\theta_j}\log(1-h(x)) & = \frac{\partial \log(1-h(x))}{\partial (1-h(x))} * \frac{\partial (1-h(x))}{\partial h(x)} * \frac{\partial h(x)}{\partial g(\theta^Tx)} * \frac{\partial g(\theta^Tx)}{\partial \theta^Tx} * \frac{\partial \theta^Tx}{\partial \theta_j} \\
& = \frac{1}{1-h(x)} * (-1) * \frac{1}{(1+g(\theta^Tx))^2} * g(\theta^Tx) * x_j \\
& = \frac{1}{1-h(x)} * (-1) * \frac{1}{1+g(\theta^Tx)} * \frac{g(\theta^Tx)}{1+g(\theta^Tx)} * x_j \\
& = \frac{1}{1-h(x)} * (-1) * (1-h(x)) * h(x) * x_j \\
& = - h(x)x_j
\end{aligned}
\]
\[\frac{\partial}{\partial\theta_j}l(\theta) = yx_j - h_\theta(x)x_j = (y-h_\theta(x))x_j
\]