Source code for brainstorm.data_iterators

#!/usr/bin/env python
# coding=utf-8
from __future__ import division, print_function, unicode_literals

import math

import numpy as np
import six
from brainstorm.handlers._cpuop import _crop_images
from brainstorm.randomness import Seedable
from brainstorm.utils import IteratorValidationError


class DataIterator(Seedable):
    """
    Base class for Data Iterators.

    Attributes:
        data_shapes (dict[str, tuple[int]]):
            Maps the name of each data item that this iterator provides to
            its shape.
        length (int | None):
            Number of iterations that this iterator will run.
    """

    def __init__(self, data_shapes, length):
        """
        Args:
            data_shapes (dict[str, tuple[int]]):
                Maps the name of each data item that this iterator provides
                to its shape.
            length (int | None):
                Number of iterations that this iterator will run.
        """
        super(DataIterator, self).__init__()
        self.data_shapes = data_shapes
        self.length = length

    def __call__(self, handler):
        pass


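# Illustrative sketch (not part of the original module): a custom iterator
# would typically wrap another iterator, forward its data_shapes and length
# to DataIterator.__init__, and yield (possibly transformed) data
# dictionaries from __call__. The name Scale and its behaviour are
# hypothetical.
#
#     class Scale(DataIterator):
#         def __init__(self, iter, factor):
#             super(Scale, self).__init__(iter.data_shapes, iter.length)
#             self.iter = iter
#             self.factor = factor
#
#         def __call__(self, handler=None):
#             for data in self.iter(handler):
#                 yield {k: v * self.factor for k, v in data.items()}

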
class AddGaussianNoise(DataIterator):
    """
    Adds Gaussian noise to data generated by another iterator, which must
    provide named data items (such as Online, Minibatches, Undivided). Only
    Numpy data is supported.

    Supports usage of different means and standard deviations for different
    named data items.
    """

    def __init__(self, iter, std_dict, mean_dict=None):
        """
        Args:
            iter (DataIterator):
                Any DataIterator which iterates over data that noise should
                be added to.
            std_dict (dict[str, float]):
                Specifies the standard deviation of the noise that should be
                added for some of the named data items.
            mean_dict (Optional[dict[str, float]]):
                Specifies the mean of the Gaussian noise that should be added
                for some of the named data items. Defaults to None, meaning
                all means are treated as 0.
        """
        DataIterator.__init__(self, iter.data_shapes, iter.length)
        mean_keys = set(mean_dict.keys()) if mean_dict is not None else set()
        std_keys = set(std_dict.keys())
        if mean_dict is not None and mean_keys != std_keys:
            raise IteratorValidationError(
                "means and standard deviations must be provided for the same "
                "data names. But {} != {}".format(mean_keys, std_keys))
        for key in std_keys:
            if key not in iter.data_shapes:
                raise IteratorValidationError(
                    "key {} is not present in iterator. Available keys: {}"
                    .format(key, iter.data_shapes.keys()))
        self.mean_dict = {} if mean_dict is None else mean_dict
        self.std_dict = std_dict
        self.iter = iter

    def __call__(self, handler=None):
        for data in self.iter(handler):
            for key, std in self.std_dict.items():
                mean = self.mean_dict.get(key, 0.0)
                data[key] = data[key] + std * self.rnd.standard_normal(
                    data[key].shape) + mean
            yield data


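# Illustrative usage (an assumption, not part of the original module): adding
# zero-mean noise with standard deviation 0.1 to the 'default' data item of
# an Undivided iterator (defined further below). The array x is hypothetical.
#
#     x = np.random.rand(7, 5, 4)                        # (T, B, F) data
#     noisy = AddGaussianNoise(Undivided(default=x),
#                              std_dict={'default': 0.1})
#     for batch in noisy():
#         print(batch['default'].shape)                  # (7, 5, 4)

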
class AddSaltNPepper(DataIterator):
    """
    Adds salt & pepper noise to data generated by another iterator, which
    must provide named data items (such as Online, Minibatches, Undivided).
    Only Numpy data is supported.

    Supports usage of different amounts and ratios of salt vs. pepper for
    different named data items.
    """

    def __init__(self, iter, prob_dict, ratio_dict=None):
        """
        Args:
            iter (DataIterator):
                Any DataIterator which iterates over data that noise should
                be added to.
            prob_dict (dict[str, float]):
                Specifies the probability that an input is affected for some
                of the named data items. Omitted data items are treated as
                having a probability of 0.
            ratio_dict (Optional[dict[str, float]]):
                Specifies the ratio of salt among all corrupted inputs.
                Defaults to None, meaning the ratio is treated as 0.5.
        """
        DataIterator.__init__(self, iter.data_shapes, iter.length)
        ratio_keys = set() if ratio_dict is None else set(ratio_dict.keys())
        prob_keys = set(prob_dict.keys())
        if ratio_dict is not None and ratio_keys != prob_keys:
            raise IteratorValidationError(
                "probabilities and ratios must be provided for the same data "
                "names. But {} != {}".format(prob_keys, ratio_keys))
        for key in prob_keys:
            if key not in iter.data_shapes:
                raise IteratorValidationError(
                    "key {} is not present in iterator. Available keys: {}"
                    .format(key, iter.data_shapes.keys()))
        self.ratio_dict = {} if ratio_dict is None else ratio_dict
        self.prob_dict = prob_dict
        self.iter = iter

    def __call__(self, handler=None):
        for data in self.iter(handler):
            for key, pr in self.prob_dict.items():
                ratio = self.ratio_dict.get(key, 0.5)
                d = data[key].copy()
                r = self.rnd.rand(*d.shape)
                d[r >= 1.0 - pr * ratio] = 1.0        # salt
                d[r <= pr * (1.0 - ratio)] = 0.0      # pepper
                data[key] = d
            yield data


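# Note on the thresholds in AddSaltNPepper.__call__ (added remark): for a
# uniform draw r in [0, 1), the salt condition r >= 1 - pr * ratio holds with
# probability pr * ratio and the pepper condition r <= pr * (1 - ratio) holds
# with probability pr * (1 - ratio). Overall a fraction pr of the inputs is
# corrupted, and a fraction ratio of the corrupted inputs becomes salt.

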
class Flip(DataIterator):
    """
    Randomly flip images horizontally. Images are generated by another
    iterator, which must provide named data items (such as Online,
    Minibatches, Undivided). Only 5D Numpy data in TNHWC format is supported.

    Defaults to flipping the 'default' named data item with a probability of
    0.5. Note that the width dimension is flipped, which corresponds to
    flipping images horizontally.
    """

    def __init__(self, iter, prob_dict=None):
        """
        Args:
            iter (DataIterator):
                Any DataIterator which iterates over data to be flipped.
            prob_dict (dict[str, float]):
                Specifies the probability of flipping for some named data
                items.
        """
        Seedable.__init__(self)
        super(Flip, self).__init__(iter.data_shapes, iter.length)
        prob_dict = {'default': 0.5} if prob_dict is None else prob_dict
        for key in prob_dict.keys():
            if key not in iter.data_shapes:
                raise IteratorValidationError(
                    "key {} is not present in iterator. Available keys: {}"
                    .format(key, iter.data_shapes.keys()))
            if prob_dict[key] > 1.0 or prob_dict[key] < 0.0:
                raise IteratorValidationError("Invalid probability")
            if len(iter.data_shapes[key]) != 5:
                raise IteratorValidationError("Only 5D data is supported")
        self.prob_dict = prob_dict
        self.iter = iter

    def __call__(self, handler=None):
        for data in self.iter(handler):
            for name in self.prob_dict.keys():
                assert isinstance(data[name], np.ndarray)
                for i in range(data[name].shape[1]):
                    if self.rnd.random_sample() < self.prob_dict[name]:
                        data[name][:, i, ...] = data[name][:, i, :, ::-1, :]
            yield data


class OneHot(DataIterator):
    """
    Convert data to one hot vectors, according to provided vocabulary sizes.
    If the vocabulary size is not provided for some data item, it is yielded
    as is.

    Currently this iterator only supports 3D data where the last (right-most)
    dimension is sized 1.
    """

    def __init__(self, iter, vocab_size_dict):
        """
        Args:
            iter (DataIterator):
                DataIterator which iterates over the indices to be converted
                to one hot.
            vocab_size_dict (dict[str, int]):
                Specifies the size of one hot vectors (the vocabulary size)
                for some named data items.
        """
        DataIterator.__init__(self, iter.data_shapes, iter.length)
        for key in vocab_size_dict.keys():
            if key not in iter.data_shapes:
                raise IteratorValidationError(
                    "key {} is not present in iterator. Available keys: {}"
                    .format(key, iter.data_shapes.keys()))
            if not isinstance(vocab_size_dict[key], int):
                raise IteratorValidationError("Vocabulary size must be int")
            shape = iter.data_shapes[key]
            if not (shape[-1] == 1 and len(shape) == 3):
                raise IteratorValidationError("Only 3D data is supported")
        self.vocab_size_dict = vocab_size_dict
        self.iter = iter

    def __call__(self, handler=None):
        for data in self.iter(handler):
            for name in self.vocab_size_dict.keys():
                vocab_size = self.vocab_size_dict[name]
                new_data = np.eye(vocab_size, dtype=np.bool)[data[name]]
                new_data = new_data.reshape((new_data.shape[0],
                                             new_data.shape[1],
                                             new_data.shape[3]))
                data[name] = new_data
            yield data


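# Example of the conversion used in OneHot.__call__ (illustrative, with
# made-up values): indexing an identity matrix with an integer array of shape
# (T, B, 1) yields an array of shape (T, B, 1, vocab_size), which is then
# reshaped to (T, B, vocab_size).
#
#     idx = np.array([[[2]], [[0]]])                 # shape (2, 1, 1)
#     one_hot = np.eye(4, dtype=bool)[idx]           # shape (2, 1, 1, 4)
#     one_hot = one_hot.reshape((2, 1, 4))
#     # one_hot[0, 0] -> [False, False, True, False]

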
class MultiHot(DataIterator):
    """
    Convert data to multi hot vectors, according to provided vocabulary
    sizes. If the vocabulary size is not provided for some data item, it is
    yielded as is.

    Currently this iterator only supports 3D data.
    """

    def __init__(self, iter, vocab_size_dict):
        """
        Args:
            iter (DataIterator):
                DataIterator which iterates over the indices to be converted
                to multi hot.
            vocab_size_dict (dict[str, int]):
                Specifies the size of multi hot vectors (the vocabulary size)
                for some named data items.
        """
        DataIterator.__init__(self, iter.data_shapes, iter.length)
        for key in vocab_size_dict.keys():
            if key not in iter.data_shapes:
                raise IteratorValidationError(
                    "key {} is not present in iterator. Available keys: {}"
                    .format(key, iter.data_shapes.keys()))
            if not isinstance(vocab_size_dict[key], int):
                raise IteratorValidationError("Vocabulary size must be int")
            shape = iter.data_shapes[key]
            if not len(shape) == 3:
                raise IteratorValidationError("Only 3D data is supported")
        self.vocab_size_dict = vocab_size_dict
        self.iter = iter

    def __call__(self, handler):
        for data in self.iter(handler):
            for name in self.vocab_size_dict.keys():
                vocab_size = self.vocab_size_dict[name]
                new_data = np.eye(vocab_size,
                                  dtype=np.bool)[data[name]].max(2)
                data[name] = new_data
            yield data


class Pad(DataIterator):
    """
    Pads images equally on all sides. Images are generated by another
    iterator, which must provide named data items (such as Online,
    Minibatches, Undivided). Only 5D Numpy data in TNHWC format is supported.

    5D data corresponds to sequences of multi-channel images, which is the
    typical use case. Zero-padding is used unless specified otherwise.
    """

    def __init__(self, iter, size_dict, value_dict=None):
        """
        Args:
            iter (DataIterator):
                A DataIterator which iterates over the images to be padded.
            size_dict (dict[str, int]):
                Specifies the padding sizes for some named data items.
            value_dict (dict[str, int]):
                Specifies the pad values for some named data items.
        """
        super(Pad, self).__init__(iter.data_shapes, iter.length)
        if value_dict is not None:
            if set(size_dict.keys()) != set(value_dict.keys()):
                raise IteratorValidationError(
                    "padding sizes and values must be provided for the same "
                    "data names")
        for key in size_dict.keys():
            if key not in iter.data_shapes:
                raise IteratorValidationError(
                    "key {} is not present in iterator. Available keys: {}"
                    .format(key, iter.data_shapes.keys()))
            if len(iter.data_shapes[key]) != 5:
                raise IteratorValidationError("Only 5D data is supported")
        self.value_dict = {} if value_dict is None else value_dict
        self.size_dict = size_dict
        self.iter = iter

    def __call__(self, handler=None):
        for data in self.iter(handler):
            for name in self.size_dict.keys():
                assert isinstance(data[name], np.ndarray)
                t, b, h, w, c = data[name].shape
                size = self.size_dict[name]
                val = self.value_dict.get(name, 0.0)
                new_data = val * np.ones((t, b, h + 2 * size,
                                          w + 2 * size, c))
                new_data[:, :, size: -size, size: -size, :] = data[name]
                data[name] = new_data
            yield data


class RandomCrop(DataIterator):
    """
    Randomly crops image data. Images are generated by another iterator,
    which must provide named data items (such as Online, Minibatches,
    Undivided). Only 5D Numpy data in TNHWC format is supported.

    5D data corresponds to sequences of multi-channel images, which is the
    typical use case.
    """

    def __init__(self, iter, shape_dict):
        """
        Args:
            iter (DataIterator):
                A DataIterator which iterates over data to be cropped.
            shape_dict (dict[str, (int, int)]):
                Specifies the crop shapes for some named data items.
        """
        super(RandomCrop, self).__init__(iter.data_shapes, iter.length)
        for key, val in shape_dict.items():
            if key not in iter.data_shapes:
                raise IteratorValidationError(
                    "key {} is not present in iterator. Available keys: {}"
                    .format(key, iter.data_shapes.keys()))
            if not (isinstance(val, tuple) and len(val) == 2):
                raise IteratorValidationError("Shape must be a size 2 tuple")
            data_shape = iter.data_shapes[key]
            if len(data_shape) != 5:
                raise IteratorValidationError("Only 5D data is supported")
            if val[0] > data_shape[2] or val[0] < 0:
                raise IteratorValidationError("Invalid crop height")
            if val[1] > data_shape[3] or val[1] < 0:
                raise IteratorValidationError("Invalid crop width")
        self.shape_dict = shape_dict
        self.iter = iter

    def __call__(self, handler=None):
        for data in self.iter(handler):
            for name in self.shape_dict.keys():
                assert isinstance(data[name], np.ndarray)
                t, n, h, w, c = data[name].shape
                crop_h, crop_w = self.shape_dict[name]
                max_r = h - crop_h
                max_c = w - crop_w
                row_indices = self.rnd.random_integers(0, max_r, n)
                col_indices = self.rnd.random_integers(0, max_c, n)
                cropped = np.zeros((t, n, crop_h, crop_w, c))
                _crop_images(data[name], crop_h, crop_w, row_indices,
                             col_indices, cropped)
                data[name] = cropped
            yield data


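# Illustrative usage (an assumption, not part of the original module): Pad and
# RandomCrop can be chained for simple augmentation, padding images by a few
# pixels and then taking a random crop of the original size. The array
# `images` is a hypothetical (T, B, H, W, C) array; Undivided is defined
# further below.
#
#     images = np.random.rand(1, 10, 32, 32, 3)
#     padded = Pad(Undivided(default=images), size_dict={'default': 2})
#     augmented = RandomCrop(padded, shape_dict={'default': (32, 32)})
#     for batch in augmented():
#         print(batch['default'].shape)              # (1, 10, 32, 32, 3)

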
class Undivided(DataIterator):
    """
    Processes the entire data in one block (only one iteration).
    """

    def __init__(self, **named_data):
        """
        Args:
            **named_data (dict[str, np.ndarray]):
                Named arrays with 3+ dimensions, i.e. ('T', 'B', ...).
        """
        _assert_correct_data_format(named_data)
        data_shapes = {n: v.shape for n, v in named_data.items()}
        super(Undivided, self).__init__(data_shapes, 1)
        self.data = named_data
        self.total_size = int(sum(d.size for d in self.data.values()))

    def __call__(self, handler=None):
        yield self.data


class Minibatches(DataIterator):
    """
    Minibatch iterator for inputs and targets.

    If either a 'mask' is given or some other means of determining sequence
    lengths is specified by `cut_according_to`, this iterator also cuts the
    sequences in each minibatch to their maximum length (which can be less
    than the maximum length over the whole dataset).

    Note:
        When shuffling is enabled, this iterator only randomizes the order of
        minibatches, but doesn't re-shuffle instances across batches.
    """

    def __init__(self, batch_size=1, shuffle=True, cut_according_to='mask',
                 **named_data):
        """
        Args:
            batch_size (int):
                The number of data instances per batch. Defaults to 1.
                Brainstorm assumes that the second dimension (from the left)
                of the data indexes independent data items.
            shuffle (Optional[bool]):
                Flag indicating whether the order of batches should be
                randomized at the beginning of every pass through the data.
            cut_according_to (Optional[str or list or array]):
                Specifies how to determine the lengths of the sequences, for
                shortening them to the longest sequence of the current
                mini-batch. Defaults to 'mask', in which case the lengths are
                determined from the 'mask' named data entry. Can be any other
                data name, or a list where the i-th entry is an integer
                specifying the length of the i-th sequence.
            **named_data (dict[str, np.ndarray]):
                Named arrays with 3+ dimensions, i.e. ('T', 'B', ...).
        """
        nr_sequences, time_steps = _assert_correct_data_format(named_data)
        data_shapes = {n: v.shape for n, v in named_data.items()}
        nr_batches = int(math.ceil(nr_sequences / batch_size))
        super(Minibatches, self).__init__(data_shapes, nr_batches)
        self.data = named_data
        self.shuffle = shuffle
        self.batch_size = batch_size
        if isinstance(cut_according_to, (six.string_types, type(None))):
            if cut_according_to in named_data:
                self.seq_lens = _calculate_lengths_from_mask(
                    named_data[cut_according_to])
            else:
                self.seq_lens = time_steps * np.ones(nr_sequences,
                                                     dtype=np.int)
        else:
            self.seq_lens = np.array(cut_according_to)
        assert self.seq_lens.shape == (nr_sequences, )
        self.sample_size = int(
            sum(d.shape[0] * np.prod(d.shape[2:]) * batch_size
                for d in self.data.values()))

    def __call__(self, handler=None):
        indices = np.arange(self.length)
        if self.shuffle:
            self.rnd.shuffle(indices)
        for idx in indices:
            batch_slice = slice(idx * self.batch_size,
                                (idx + 1) * self.batch_size)
            time_slice = slice(None, np.max(self.seq_lens[batch_slice]))
            data = {k: v[time_slice, batch_slice]
                    for k, v in self.data.items()}
            yield data


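# Illustrative usage (an assumption, not part of the original module): with a
# mask marking valid time steps, each mini-batch is cut to the longest
# sequence it contains. The arrays below are hypothetical.
#
#     x = np.zeros((10, 4, 3))                       # (T, B, F) inputs
#     mask = np.zeros((10, 4, 1))
#     mask[:5, :2, 0] = 1.0                          # sequences 0-1: length 5
#     mask[:8, 2:, 0] = 1.0                          # sequences 2-3: length 8
#     batches = Minibatches(batch_size=2, shuffle=False, default=x, mask=mask)
#     for batch in batches():
#         print(batch['default'].shape)              # (5, 2, 3), then (8, 2, 3)

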
def _assert_correct_data_format(named_data):
    nr_sequences = {}
    nr_timesteps = {}
    for name, data in named_data.items():
        if not hasattr(data, 'shape'):
            raise IteratorValidationError(
                "{} has a wrong type. (no shape attribute)".format(name)
            )
        if len(data.shape) < 3:
            raise IteratorValidationError(
                'All inputs have to have at least 3 dimensions, where the '
                'first two are time_size and batch_size.')
        nr_sequences[name] = data.shape[1]
        nr_timesteps[name] = data.shape[0]

    if min(nr_sequences.values()) != max(nr_sequences.values()):
        raise IteratorValidationError(
            'The number of sequences of all inputs must be equal, but got {}'
            .format(nr_sequences))

    if min(nr_timesteps.values()) != max(nr_timesteps.values()):
        raise IteratorValidationError(
            'The number of time steps of all inputs must be equal, '
            'but got {}'.format(nr_timesteps))

    return int(min(nr_sequences.values())), min(nr_timesteps.values())


def _calculate_lengths_from_mask(mask):
    assert mask.shape[2:] == (1,)
    b = mask[:, :, 0] != 0
    lengths = mask.shape[0] - b[::-1].argmax(axis=0)
    lengths[b.max(axis=0) == 0] = 0
    return lengths
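

# Small worked example for _calculate_lengths_from_mask (illustrative): the
# mask is nonzero for valid time steps, so reversing along time and taking
# argmax locates the last nonzero step of each sequence; all-zero columns get
# length 0.
#
#     mask = np.array([[[1.], [1.], [0.]],
#                      [[1.], [0.], [0.]],
#                      [[0.], [0.], [0.]]])          # shape (T=3, B=3, 1)
#     _calculate_lengths_from_mask(mask)             # -> array([2, 1, 0])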