Source code for brainstorm.initializers

#!/usr/bin/env python
# coding=utf-8
from __future__ import division, print_function, unicode_literals

import numpy as np
import six

from brainstorm.describable import Describable
from brainstorm.randomness import Seedable
from brainstorm.utils import InitializationError

# wrap the names in str() because, due to unicode_literals, plain list entries
# would be unicode, which does not work in __all__ on Python 2
__all__ = [str(a) for a in [
    'ArrayInitializer', 'DenseSqrtFanIn', 'DenseSqrtFanInOut', 'EchoState',
    'Gaussian', 'Identity', 'LstmOptInit', 'Orthogonal', 'RandomWalk',
    'SparseInputs', 'SparseOutputs', 'Uniform']]


# ########################### Support Classes #################################

class Initializer(Seedable, Describable):
    """
    Base class for all initializers. It inherits from Seedable, so every
    sub-class has access to self.rnd, and it provides basic methods for
    converting from and to a description.
    """

    def __call__(self, shape):
        raise NotImplementedError()

    def _assert_atleast2d(self, shape):
        if len(shape) < 2:
            raise InitializationError(
                "{} only works on shapes with at least 2 dimensions, but "
                "shape was {}".format(self.__class__.__name__, shape))
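
# A hedged sketch of a custom Initializer subclass (hypothetical, not part of
# the library): implement __call__ to return an array of the requested shape,
# using self.rnd (provided by Seedable) for any randomness:
#
#     class ConstantExample(Initializer):
#         def __init__(self, value=0.0):
#             super(ConstantExample, self).__init__()
#             self.value = value
#
#         def __call__(self, shape):
#             return np.full(shape, self.value)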


# ########################### Initializers ####################################

class ArrayInitializer(Initializer):
    """
    Initializes the parameters as the values of the input array.
    """

    def __init__(self, array):
        super(ArrayInitializer, self).__init__()
        self.array = np.array(array)

    def __call__(self, shape):
        if not self.array.shape == shape:
            raise InitializationError('Shape mismatch {} != {}'
                                      .format(self.array.shape, shape))
        return self.array

    def __describe__(self):
        return self.array.tolist()
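
# Minimal usage sketch (hedged): ArrayInitializer hands back the stored array
# verbatim, so its shape must match the requested parameter shape exactly:
#
#     init = ArrayInitializer(np.arange(6).reshape(2, 3))
#     init((2, 3))      # returns the stored 2x3 array
#     init((3, 2))      # raises InitializationError (shape mismatch)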

class DenseSqrtFanIn(Initializer):
    """
    Initializes the parameters randomly according to a uniform distribution
    over the interval [-scale/sqrt(n), scale/sqrt(n)] where n is the number
    of inputs to each unit.

    Uses scale=sqrt(6) by default, which is appropriate for rectified linear
    ('rel') units. When the number of inputs and outputs is the same, this is
    equivalent to using ``DenseSqrtFanInOut``.

    Scaling:

    * rel: sqrt(6)
    * tanh: sqrt(3)
    * sigmoid: 4 * sqrt(3)
    * linear: 1

    Args:
        scale (Optional(float or str)):
            The activation function dependent scaling factor. Can be either
            a float or one of ['rel', 'tanh', 'sigmoid', 'linear'].
            Defaults to 'rel'.
    """
    __default_values__ = {'scale': 'rel'}

    def __init__(self, scale='rel'):
        super(DenseSqrtFanIn, self).__init__()
        self.scale = scale

    def __call__(self, shape):
        self._assert_atleast2d(shape)
        num_in = np.prod(shape[1:])
        if isinstance(self.scale, six.string_types):
            scale = {
                'rel': np.sqrt(6),
                'tanh': np.sqrt(3),
                'sigmoid': 4 * np.sqrt(3),
                'linear': 1
            }[self.scale]
        else:
            scale = self.scale
        return scale * (2 * self.rnd.rand(*shape) - 1) / np.sqrt(num_in)

class DenseSqrtFanInOut(Initializer):
    """
    Initializes the parameters randomly according to a uniform distribution
    over the interval [-scale/sqrt(n1+n2), scale/sqrt(n1+n2)] where n1 is the
    number of inputs to each unit and n2 is the number of units in the
    current layer.

    Uses scale=sqrt(12) by default, which is appropriate for rectified linear
    ('rel') units.

    Scaling:

    * rel: sqrt(12)
    * tanh: sqrt(6)
    * sigmoid: 4 * sqrt(6)
    * linear: 1

    Args:
        scale (Optional(float or str)):
            The activation function dependent scaling factor. Can be either
            a float or one of ['rel', 'tanh', 'sigmoid', 'linear'].
            Defaults to 'rel'.

    Reference:
        Glorot, Xavier, and Yoshua Bengio.
        "Understanding the difficulty of training deep feedforward neural
        networks." International Conference on Artificial Intelligence and
        Statistics. 2010.
    """
    __default_values__ = {'scale': 'rel'}

    def __init__(self, scale='rel'):
        super(DenseSqrtFanInOut, self).__init__()
        self.scale = scale

    def __call__(self, shape):
        self._assert_atleast2d(shape)
        n1, n2 = shape[0], np.prod(shape[1:])
        if isinstance(self.scale, six.string_types):
            scale = {
                'rel': np.sqrt(12),
                'tanh': np.sqrt(6),
                'sigmoid': 4 * np.sqrt(6),
                'linear': 1
            }[self.scale]
        else:
            scale = self.scale
        return scale * (2 * self.rnd.rand(*shape) - 1) / np.sqrt(n1 + n2)
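
# Hedged numeric sketch (assuming Seedable gives self.rnd a default seed when
# none is passed): for a 2D weight shape the entries fall uniformly in
# [-scale/sqrt(shape[0] + shape[1]), +scale/sqrt(shape[0] + shape[1])], e.g.
#
#     W = DenseSqrtFanInOut('rel')((300, 200))
#     bound = np.sqrt(12) / np.sqrt(300 + 200)    # ~0.155
#     abs(W).max() <= bound                       # True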

class EchoState(Initializer):
    """
    Classic echo state initialization. Creates a matrix with a fixed spectral
    radius (default=1.0). The spectral radius should be < 1 to satisfy the
    echo state property. Only works for square matrices.

    Example:
        >>> net.initialize(default=Gaussian(),
        ...                Recurrent={'R': EchoState(0.77)})
    """
    __default_values__ = {'spectral_radius': 1.0}

    def __init__(self, spectral_radius=1.0):
        super(EchoState, self).__init__()
        self.spectral_radius = spectral_radius

    def __call__(self, shape):
        self._assert_atleast2d(shape)
        if shape[0] != shape[1]:
            raise InitializationError("Matrix should be square but was: {}"
                                      "".format(shape))
        parameters = self.rnd.uniform(-0.5, 0.5, size=shape)
        # normalizing and setting spectral radius (correct, slow):
        rho_parameters = max(abs(np.linalg.eig(parameters)[0]))
        return parameters * (self.spectral_radius / rho_parameters)
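
# Hedged sketch (assuming Seedable seeds self.rnd when no seed is given):
# after initialization the largest absolute eigenvalue of the matrix equals
# the requested spectral radius:
#
#     R = EchoState(spectral_radius=0.9)((100, 100))
#     max(abs(np.linalg.eigvals(R)))    # ~0.9 up to floating point rounding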

class Gaussian(Initializer):
    """
    Initializes the parameters randomly according to a normal distribution
    with the given mean and standard deviation.
    """
    __default_values__ = {'mean': 0.0}

    def __init__(self, std=0.1, mean=0.0):
        super(Gaussian, self).__init__()
        self.std = std
        self.mean = mean

    def __call__(self, shape):
        return self.rnd.randn(*shape) * self.std + self.mean
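
# Minimal usage sketch (hedged; `net` would be a brainstorm Network built
# elsewhere): drawing a parameter block from N(mean=0, std=0.01):
#
#     W = Gaussian(std=0.01)((4, 5))            # ndarray of shape (4, 5)
#     net.initialize(default=Gaussian(0.01))    # as a network-wide default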

class Identity(Initializer):
    """
    Initializes a matrix to the (scaled) identity matrix plus some noise.
    """

    def __init__(self, scale=1.0, std=0.01, enforce_square=True):
        super(Identity, self).__init__()
        self.scale = scale
        self.std = std
        self.enforce_square = enforce_square

    def __call__(self, shape):
        if len(shape) != 2:
            raise InitializationError("Works only with 2D matrices but shape "
                                      "was: {}".format(shape))
        if self.enforce_square and shape[0] != shape[1]:
            raise InitializationError("Matrix needs to be square, but was {}"
                                      "".format(shape))
        weights = np.eye(shape[0], shape[1], dtype=float) * self.scale
        weights += self.rnd.randn(*shape) * self.std
        return weights
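
# Hedged sketch: a scaled identity matrix perturbed by Gaussian noise, e.g.
# as an IRNN-style recurrent weight initialization:
#
#     R = Identity(scale=1.0, std=0.01)((3, 3))     # ~np.eye(3) + N(0, 0.01)
#     Identity(enforce_square=False)((3, 5))        # rectangular eye allowed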

class LstmOptInit(Initializer):
    """
    Used to initialize an LstmOpt layer. This is useful because in an LstmOpt
    layer all the parameters are concatenated for efficiency.

    The parameters (input_block, input_gate, forget_gate, and output_gate)
    can be scalars or Initializers themselves.
    """

    def __init__(self, input_block=0.0, input_gate=0.0, forget_gate=0.0,
                 output_gate=0.0):
        super(LstmOptInit, self).__init__()
        self.block_input = input_block
        self.input_gate = input_gate
        self.forget_gate = forget_gate
        self.output_gate = output_gate

    def __call__(self, shape):
        if shape[0] % 4 != 0:
            raise InitializationError("First dim of LstmOpt shape needs to "
                                      "be divisible by 4. But shape was {}"
                                      .format(shape))
        weights = np.zeros(shape)
        n = shape[0] // 4
        sub_shape = (n,) + shape[1:]
        weights[:n] = evaluate_initializer(
            self.block_input, sub_shape, seed=self.rnd.generate_seed())
        weights[n:2 * n] = evaluate_initializer(
            self.input_gate, sub_shape, seed=self.rnd.generate_seed())
        weights[2 * n:3 * n] = evaluate_initializer(
            self.forget_gate, sub_shape, seed=self.rnd.generate_seed())
        weights[3 * n:] = evaluate_initializer(
            self.output_gate, sub_shape, seed=self.rnd.generate_seed())
        return weights
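
# Hedged sketch of the layout this initializer assumes: the first dimension
# is split into four equal blocks, filled in the order
# [block input, input gate, forget gate, output gate]. Each block value may
# be a scalar or another Initializer; a common recipe is a positive forget
# gate with random block input:
#
#     init = LstmOptInit(input_block=Gaussian(0.1), forget_gate=1.0)
#     W = init((4 * 50, 20))    # blocks of 50 rows each; other gates at 0.0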

class Orthogonal(Initializer):
    """
    Orthogonal initialization.

    Reference:
        Saxe, Andrew M., James L. McClelland, and Surya Ganguli.
        "Exact solutions to the nonlinear dynamics of learning in deep linear
        neural networks." arXiv preprint arXiv:1312.6120 (2013).
    """

    def __init__(self, scale=1.0):
        super(Orthogonal, self).__init__()
        self.scale = scale

    def __call__(self, shape):
        if len(shape) != 2:
            raise InitializationError("Works only with 2D matrices but shape "
                                      "was: {}".format(shape))
        a = self.rnd.randn(*shape)
        u, _, v = np.linalg.svd(a, full_matrices=False)
        q = u if u.shape == shape else v
        return (self.scale * q).reshape(shape)
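
# Hedged sketch: for a square shape the result is orthogonal (up to the scale
# factor), which can be checked directly:
#
#     Q = Orthogonal(scale=1.0)((64, 64))
#     np.allclose(Q.dot(Q.T), np.eye(64))       # True up to float tolerance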

class RandomWalk(Initializer):
    """
    Initializes a (square) weight matrix with the random walk scheme proposed
    by:

    Sussillo, David, and L. F. Abbott.
    "Random Walk Initialization for Training Very Deep Feedforward Networks."
    arXiv:1412.6558 [cs, stat], December 19, 2014.
    http://arxiv.org/abs/1412.6558.
    """
    __default_values__ = {'scale': None}

    def __init__(self, act_func='linear', scale=None):
        super(RandomWalk, self).__init__()
        self.act_func = act_func
        self.scale = scale

    def __call__(self, shape):
        if len(shape) != 2:
            raise InitializationError("Works only with 2D matrices but shape "
                                      "was: {}".format(shape))
        if shape[0] != shape[1]:
            raise InitializationError("Matrix needs to be square, but was {}"
                                      "".format(shape))
        N = shape[1]
        if self.scale is None:
            scale = {
                'linear': np.exp(1 / (2 * N)),
                'rel': np.sqrt(2) * np.exp(1.2 / (max(N, 6) - 2.4))
            }[self.act_func]
        else:
            scale = self.scale
        return scale * self.rnd.randn(*shape) / N

class SparseInputs(Initializer):
    """
    Makes sure every unit only gets activation from a certain number of input
    units, while the rest of the parameters are 0.
    The connections are initialized by evaluating the passed sub_initializer.

    Example:
        >>> net.initialize(FullyConnected=SparseInputs(Gaussian(),
        ...                                            connections=10))
    """

    def __init__(self, sub_initializer, connections=15):
        super(SparseInputs, self).__init__()
        self.sub_initializer = sub_initializer
        self.connections = connections

    def __call__(self, shape):
        self._assert_atleast2d(shape)
        if shape[0] < self.connections:
            raise InitializationError("Input dimension too small: {} < {}"
                                      "".format(shape[0], self.connections))
        sub_result = evaluate_initializer(self.sub_initializer, shape)
        connection_mask = np.zeros(shape)
        connection_mask[:self.connections, :] = 1.
        for i in range(shape[1]):
            self.rnd.shuffle(connection_mask[:, i])
        return sub_result * connection_mask
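
# Hedged sketch (treating shape[0] as the input dimension, as the check above
# suggests): each unit keeps exactly `connections` nonzero incoming weights
# drawn from the sub-initializer, and the rest are masked to zero:
#
#     W = SparseInputs(Gaussian(0.1), connections=10)((100, 20))
#     (W != 0).sum(axis=0)      # 10 per column (barring exact-zero draws)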

class SparseOutputs(Initializer):
    """
    Makes sure every unit propagates its activation only to a certain number
    of output units, while the rest of the parameters are 0.
    The connections are initialized by evaluating the passed sub_initializer.

    Example:
        >>> net.initialize(FullyConnected=SparseOutputs(Gaussian(),
        ...                                             connections=10))
    """

    def __init__(self, sub_initializer, connections=15):
        super(SparseOutputs, self).__init__()
        self.sub_initializer = sub_initializer
        self.connections = connections

    def __call__(self, shape):
        self._assert_atleast2d(shape)
        if shape[1] < self.connections:
            raise InitializationError("Output dimension too small: {} < {}"
                                      "".format(shape[1], self.connections))
        sub_result = evaluate_initializer(self.sub_initializer, shape)
        connection_mask = np.zeros(shape)
        connection_mask[:, :self.connections] = 1.
        for i in range(shape[0]):
            self.rnd.shuffle(connection_mask[i, :])
        return sub_result * connection_mask

class Uniform(Initializer):
    """
    Initializes the parameters randomly according to a uniform distribution
    over the interval [low, high].
    """
    __default_values__ = {'low': None}

    def __init__(self, low=0.1, high=None):
        super(Uniform, self).__init__()
        self.low = low
        self.high = high
        self.__init_from_description__(None)

    def __init_from_description__(self, description):
        if self.high is None:
            self.low, self.high = sorted([-self.low, self.low])
        assert self.low < self.high, \
            "low has to be smaller than high but {} >= {}".format(self.low,
                                                                  self.high)

    def __call__(self, shape):
        v = ((self.high - self.low) * self.rnd.rand(*shape)) + self.low
        return v
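
# Minimal usage sketch (hedged): when `high` is omitted, `low` is mirrored
# around zero, so Uniform(0.1) draws from [-0.1, 0.1]:
#
#     a = Uniform(0.1)((3, 3))                  # entries in [-0.1, 0.1]
#     b = Uniform(low=0.0, high=0.5)((3, 3))    # entries in [0.0, 0.5]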

# ########################### helper methods ##################################

def evaluate_initializer(initializer, shape, fallback=None, seed=None):
    if isinstance(initializer, Initializer):
        if seed is not None:
            initializer.rnd.set_seed(seed)
        try:
            result = initializer(shape)
        except InitializationError:
            if fallback is not None:
                return evaluate_initializer(fallback, shape, seed=seed)
            raise
    else:
        if not isinstance(initializer, (int, float)):
            raise TypeError('type {} not supported as initializer'
                            .format(type(initializer)))
        result = np.empty(shape, dtype=np.float64)
        result[:] = initializer
    return result
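
# Hedged sketch of how this helper composes with the classes above: scalars
# are broadcast to the full shape, Initializer instances are called
# (optionally reseeded), and the fallback is only used when the primary
# initializer raises an InitializationError (e.g. EchoState on a non-square
# shape):
#
#     evaluate_initializer(0.5, (2, 3))                    # 2x3 array of 0.5
#     evaluate_initializer(Gaussian(0.1), (2, 3), seed=42)
#     evaluate_initializer(EchoState(), (2, 3), fallback=Gaussian(0.1))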