\(\large \bf{Theorem }\ 2.7:\)
\(\text{Let }f:\mathbb{R}^d\rightarrow\mathbb{R}\text{ be convex and differentiable with a global minimum }x^*;\text{ suppose }f\text{ is smooth with parameter }L.\text{ Choosing stepsize }\gamma = \frac{1}{L},\text{ gradient descent yields:}\)
\[\begin{align}
f(x_T)-f(x^*)\leq \frac{L}{2T}||x_0-x^*||^2
\end{align}
\]
\(\large\bf Proof:\)
\(f\text{ is differentiable and smooth, according to Lemma 2.6, we can get:}\)
\[\begin{align}
f(x_{t+1})-f(x_t)\leq -\frac{1}{2L}||g_t||^2
\end{align}
\]
\(\text{Therefore:}\)
\[\begin{align}
\frac{1}{2L}||g_t||^2\leq f(x_t)-f(x_{t+1})
\end{align}
\]
\(\text{Now we sum up:}\)
\[\begin{align}
\frac{1}{2L}\sum_{t=0}^{T-1}||g_t||^2&\leq \sum_{t=0}^{T-1}[f(x_t)-f(x_{t+1})]\\
&=f(x_0)-f(x_T)
\end{align}
\]
\(\gamma = 1/L,\text{ therefore from previous analysis:}\)
\[\begin{align}
\sum_{t=0}^{T-1}[f(x_t)-f(x^*)]\leq \frac{\gamma}{2}\sum_{t=0}^{T-1}||g_t||^2+\frac{1}{2\gamma}||x_0-x^*||^2
\end{align}
\]
\(\text{Combine (5) and (6):}\)
\[\begin{align}
\sum_{t=0}^{T-1}[f(x_t)-f(x^*)]&\leq \frac{\gamma}{2}\sum_{t=0}^{T-1}||g_t||^2+\frac{1}{2\gamma}||x_0-x^*||^2 \\
&\leq f(x_0)-f(x_T)+\frac{1}{2\gamma}||x_0-x^*||^2
\end{align}
\]
\(\text{Hence:}\)
\[\begin{align}
\sum_{t=1}^{T}[f(x_t)-f(x^*)]&\leq \frac{1}{2\gamma}||x_0-x^*||^2\\
&=\frac{L}{2}||x_0-x^*||^2
\end{align}
\]
\(\text{As the result:}\)
\[\begin{align}
T\cdot (f(x_T)-f(x^*))&\leq \sum_{t=1}^T[f(x_t)-f(x^*)]\quad (\text{since }f(x_T)\leq f(x_t)\text{ by the decrease in each step})\\
&\leq\frac{L}{2}||x_0-x^*||^2
\end{align}
\]
\[\begin{align}
\Rightarrow f(x_T)-f(x^*)\leq \frac{L}{2T}||x_0-x^*||^2
\end{align}
\]
1. Smooth and strongly convex functions: \(O(\log(1/\epsilon))\) steps
\(\text{First-order method: only use the gradient information to minimize }f.\)
\(\large\bf Definition\ 2.9:\)
\(\text{Strongly convex function with parameter }\mu>0\text{:}\)
\[\begin{align}
f(y)\geq f(x)+\nabla f(x)^T(y-x)+\frac{\mu}{2}||x-y||^2\quad \forall x,y\in\mathbb{R}^d
\end{align}
\]
\(\large \bf Lemma\ 2.10:\)
\(\text{if }f \text{ is strongly convex with parameter }\mu>0,\text{ then }f\text{ is }\bf{strictly\ convex\ and\ has\ a\ unique\ global\ minimum.}\)
\(\text{Assume that }f\text{ is strongly convex with parameter }\mu,\text{ from the vanilla analysis:}\)
\[\begin{align}
g_t^T(x_t-x^*)&=\nabla f(x_t)^T(x_t-x^*)\\
&\geq f(x_t)-f(x^*)+\frac{\mu}{2}||x_t-x^*||^2
\end{align}
\]
\(\text{Hence:}\)
\[\begin{align}
f(x_t)-f(x^*)&\leq \frac{1}{2\gamma}[\gamma^2||g_t||^2+||x_t-x^*||^2-||x_{t+1}-x^*||^2]-\frac{\mu}{2}||x_t-x^*||^2
\end{align}
\]
\(\text{Rewrite it as:}\)
\[\begin{align}
||x_{t+1}-x^*||^2\leq 2\gamma [f(x^*)-f(x_t)]+\gamma^2||g_t||^2+(1-\mu\gamma)||x_t-x^*||^2
\end{align}
\]
\(\large\bf{Theorem\ 2.12:}\)
\(\text{Let }f:\mathbb{R}^d\rightarrow\mathbb{R}\text{ be convex and differentiable. Suppose }f\text{ is smooth with }L,\text{ and strongly convex with }\mu.\text{ Choosing stepsize:}\)
\[\begin{align}
\gamma = 1/L
\end{align}
\]
\(\text{Gradient descent with arbitrary }x_0\text{ satisfies the following two properties:}\)
\((i)\)
\[\begin{align}
||x_{t+1}-x^*||^2\leq (1-\frac{\mu}{L})||x_t-x^*||^2
\end{align}
\]
\(\large\bf Proof:\)
\(\text{By smoothness (Lemma 2.6) and }f(x^*)\leq f(x_{t+1}),\text{ we know:}\)
\[\begin{align}
f(x^*)-f(x_t)\leq f(x_{t+1})-f(x_t)\leq -\frac{1}{2L}||g_t||^2
\end{align}
\]
\(\text{Combining this with (18), we get:}\)
\[\begin{align}
||x_{t+1}-x^*||^2&\leq -\gamma^2||g_t||^2+\gamma^2||g_t||^2+(1-\mu\gamma)||x_t-x^*||^2\\
&\leq (1-\frac{\mu}{L})||x_t-x^*||^2
\end{align}
\]
\((ii)\)
\[\begin{align}
f(x_T)-f(x^*)\leq \frac{L}{2}(1-\frac{\mu}{L})^T||x_0-x^*||^2
\end{align}
\]
\(\large\bf Proof:\)
\(\text{From smoothness applied at }x^*,\text{ using }\nabla f(x^*)=0:\)
\[\begin{align}
f(x_t)\leq f(x^*)+\frac{L}{2}||x_t-x^*||^2
\end{align}
\]
\[\begin{align}
\Rightarrow f(x_T)-f(x^*)&\leq \frac{L}{2}||x_T-x^*||^2\\
&\leq ... \leq \frac{L}{2}(1-\frac{\mu}{L})^T||x_0-x^*||^2
\end{align}
\]