Source code for torchmin.trustregion.krylov

"""
TODO: this module is not yet complete. It is not ready for use.
"""
import numpy as np
from scipy.linalg import eigh_tridiagonal, get_lapack_funcs
import torch

from .base import _minimize_trust_region, BaseQuadraticSubproblem


[docs]def _minimize_trust_krylov(fun, x0, **trust_region_options):
    """Minimization of scalar function of one or more variables using
    the GLTR Krylov subspace trust-region algorithm.

    .. warning::
        This minimizer is in early stages and has not been rigorously
        tested. It may change in the near future.

    Parameters
    ----------
    fun : callable
        Scalar objective function to minimize.
    x0 : Tensor
        Initialization point.
    initial_tr_radius : float
        Initial trust-region radius.
    max_tr_radius : float
        Maximum value of the trust-region radius. No steps that are longer
        than this value will be proposed.
    eta : float
        Trust region related acceptance stringency for proposed steps.
    gtol : float
        Gradient norm must be less than ``gtol`` before successful
        termination.

    Returns
    -------
    result : OptimizeResult
        Result of the optimization routine.

    Notes
    -----
    This trust-region solver is based on the GLTR algorithm as
    described in [1]_ and [2]_.

    References
    ----------
    .. [1] F. Lenders, C. Kirches, and A. Potschka, "trlib: A vector-free
           implementation of the GLTR method for...",
           arXiv:1611.04718.
    .. [2] N. Gould, S. Lucidi, M. Roma, P. Toint: “Solving the Trust-Region
           Subproblem using the Lanczos Method”,
           SIAM J. Optim., 9(2), 504–525, 1999.
    .. [3] J. Nocedal and  S. Wright, "Numerical optimization",
           Springer Science & Business Media. pp. 83-91, 2006.

    """
    return _minimize_trust_region(fun, x0,
                                  subproblem=KrylovSubproblem,
                                  **trust_region_options)


class KrylovSubproblem(BaseQuadraticSubproblem):
    """The GLTR trust region sub-problem defined on an expanding
    Krylov subspace.

    Based on the implementation of GLTR described in [1]_.

    References
    ----------
    .. [1] F. Lenders, C. Kirches, and A. Potschka, "trlib: A vector-free
           implementation of the GLTR method for...",
           arXiv:1611.04718.
    .. [2] N. Gould, S. Lucidi, M. Roma, P. Toint: “Solving the Trust-Region
           Subproblem using the Lanczos Method”,
           SIAM J. Optim., 9(2), 504–525, 1999.
    .. [3] J. Nocedal and  S. Wright, "Numerical optimization",
           Springer Science & Business Media. pp. 83-91, 2006.
    """
    hess_prod = True
    max_lanczos = None
    max_ms_iters = 500  # max iterations of the Moré-Sorensen loop

    def __init__(self, x, fun, k_easy=0.1, k_hard=0.2, tol=1e-5, ortho=True,
                 debug=False):
        super().__init__(x, fun)
        self.eps = torch.finfo(x.dtype).eps
        self.k_easy = k_easy
        self.k_hard = k_hard
        self.tol = tol
        self.ortho = ortho
        self._debug = debug

    def tridiag_subproblem(self, Ta, Tb, tr_radius):
        """Solve the GLTR tridiagonal subproblem.

        Based on Algorithm 5.2 of [2]_. We factorize as follows:

        .. math::
            T + lambd * I = LDL^T

        Where `D` is diagonal and `L` unit (lower) bi-diagonal.
        """
        device, dtype = Ta.device, Ta.dtype

        # convert to numpy
        Ta = Ta.cpu().numpy()
        Tb = Tb.cpu().numpy()
        tr_radius = float(tr_radius)

        # right hand side
        rhs = np.zeros_like(Ta)
        rhs[0] = - float(self.jac_mag)

        # get LAPACK routines for factorizing and solving sym-PD tridiagonal
        ptsv, pttrs = get_lapack_funcs(('ptsv', 'pttrs'), (Ta, Tb, rhs))

        eig0 = None
        lambd_lb = 0.
        lambd = 0.
        for _ in range(self.max_ms_iters):
            lambd = max(lambd, lambd_lb)

            # factor T + lambd * I = LDL^T and solve LDL^T p = rhs
            d, e, p, info = ptsv(Ta + lambd, Tb, rhs)
            assert info >= 0  # sanity check
            if info > 0:
                assert eig0 is None  # sanity check; should only happen once
                # estimate smallest eigenvalue and continue
                eig0 = eigh_tridiagonal(
                    Ta, Tb, eigvals_only=True, select='i',
                    select_range=(0,0), lapack_driver='stebz').item()
                lambd_lb = max(1e-3 - eig0, 0)
                continue

            p_norm = np.linalg.norm(p)
            if p_norm < tr_radius:
                # TODO: add extra checks
                status = 0
                break
            elif abs(p_norm - tr_radius) / tr_radius <= self.k_easy:
                status = 1
                break

            # solve LDL^T q = p and compute <q, p>
            v, info = pttrs(d, e, p)
            q_norm2 = v.dot(p)

            # update lambd
            lambd += (p_norm**2 / q_norm2) * (p_norm - tr_radius) / tr_radius
        else:
            status = -1

        p = torch.tensor(p, device=device, dtype=dtype)

        return p, status, lambd

    def solve(self, tr_radius):
        g = self.jac
        gamma_0 = self.jac_mag
        n, = g.shape
        m = n if self.max_lanczos is None else min(n, self.max_lanczos)
        dtype = g.dtype
        device = g.device
        h_best = None
        error_best = float('inf')

        # Lanczos Q matrix buffer
        Q = torch.zeros(m, n, dtype=dtype, device=device)
        Q[0] = g / gamma_0

        # Lanczos T matrix buffers
        # a and b are the diagonal and off-diagonal entries of T, respectively
        a = torch.zeros(m, dtype=dtype, device=device)
        b = torch.zeros(m, dtype=dtype, device=device)

        # first lanczos iteration
        r = self.hessp(Q[0])
        torch.dot(Q[0], r, out=a[0])
        r.sub_(Q[0] * a[0])
        torch.linalg.norm(r, out=b[0])
        if b[0] < self.eps:
            raise RuntimeError('initial beta is zero.')

        # remaining iterations
        for i in range(1, m):
            torch.div(r, b[i-1], out=Q[i])
            r = self.hessp(Q[i])
            r.sub_(Q[i-1] * b[i-1])
            torch.dot(Q[i], r, out=a[i])
            r.sub_(Q[i] * a[i])
            if self.ortho:
                # Re-orthogonalize with Gram-Schmidt
                r.addmv_(Q[:i+1].T, Q[:i+1].mv(r), alpha=-1)
            torch.linalg.norm(r, out=b[i])
            if b[i] < self.eps:
                # This should never occur when self.ortho=True
                raise RuntimeError('reducible T matrix encountered.')

            # GLTR sub-problem
            h, status, lambd = self.tridiag_subproblem(a[:i+1], b[:i], tr_radius)

            if status >= 0:
                # convergence check; see Algorithm 1 of [1]_ and
                # Algorithm 5.1 of [2]_. Equivalent to the following:
                #     p = Q[:i+1].T.mv(h)
                #     error = torch.linalg.norm(self.hessp(p) + lambd * p + g)
                error = b[i] * h[-1].abs()
                if self._debug:
                    print('iter %3d - status: %d - lambd: %0.4e - error: %0.4e'
                          % (i+1, status, lambd, error))
                if error < error_best:
                    # we've found a new best
                    hits_boundary = status != 0
                    h_best = h
                    error_best = error
                    if error_best <= self.tol:
                        break

            elif self._debug:
                print('iter %3d - status: %d - lambd: %0.4e' %
                      (i+1, status, lambd))

        if h_best is None:
            # TODO: what should we do here?
            raise RuntimeError('gltr solution not found')

        # project h back to R^n
        p_best = Q[:i+1].T.mv(h_best)

        return p_best, hits_boundary