import numpy as np, pandas as pd, warnings
from sklearn.model_selection import train_test_split
from scipy.sparse import isspmatrix, isspmatrix_csr, vstack, csr_matrix
import ctypes, multiprocessing
from . import _wrapper_double, _wrapper_float
### Tasks will be outputed from python as strings rather than enums
task_dct = {
101 : 'calc_grad',
102 : 'calc_grad_same_batch',
103 : 'calc_grad_big_batch',
104 : 'calc_hess_vec',
105 : 'calc_fun_val_batch'
}
info_dct = {
201 : 'func_increased',
202 : 'curvature_too_small',
203 : 'search_direction_was_nan',
200 : 'no_problems_encountered'
}
#### Pre-defined step size sequences
def _step_size_sqrt(initial_step_size, iteration_num):
return initial_step_size / np.sqrt(iteration_num + 1)
def _step_size_const(initial_step_size, iteration_num):
return initial_step_size
### Class with common methods
class _StochQN:
def __init__(self):
pass
def _check_fit_inputs(self, X, y, sample_weight, additional_kwargs={}, check_sp=False):
assert X.shape[0] > 0
assert X.shape[0] == y.shape[0]
if sample_weight is not None:
assert sample_weight.shape[0] == X.shape[0]
if additional_kwargs is None:
additional_kwargs = dict()
assert isinstance(additional_kwargs, dict)
if check_sp:
X, y = self._check_sp_type(X), self._check_sp_type(y)
sample_weight = self._check_sp_type(sample_weight) if sample_weight is not None else None
return X, y, sample_weight, additional_kwargs
def _check_sp_type(self, X):
if isspmatrix(X):
if not isspmatrix_csr(X):
warnings.warn("'.fit' method only supports sparse CSR matrices. Sparse inputs will be cast to CSR.")
X = csr_matrix(X)
return X
def _get_long_batch(self, X, y, w, batch):
### there will be a mismatch when running partial_fit before fit, or when running fit repeatedly
### if it's with partial fit, it's possible to reuse the cases stored in the object for long batch
diff = (batch + 1) % self.optimizer.bfgs_upd_freq
if (batch + 1) >= (self.optimizer.bfgs_upd_freq - diff):
st_ix = (batch + 1 - (self.optimizer.bfgs_upd_freq - diff)) * self.batch_size
end_ix = min(X.shape[0], (batch + 1) * self.batch_size)
X_long = X[st_ix : end_ix]
y_long = y[st_ix : end_ix]
w_long = w[st_ix : end_ix] if w is not None else None
else:
### In theory, one could take a chunk from the beginning and a chunk from the end,
### but it's faster to just take a larger continuous sample.
### Note that there is a check to ensure that the number of batches is >= upd_freq
X_long = X[:min(X.shape[0], (self.optimizer.bfgs_upd_freq - diff) * self.batch_size)]
y_long = y[:min(X.shape[0], (self.optimizer.bfgs_upd_freq - diff) * self.batch_size)]
w_long = w[:min(X.shape[0], (self.optimizer.bfgs_upd_freq - diff) * self.batch_size)] if w is not None else None
if diff > 0:
self.stored_samples_X.append(X_long)
self.stored_samples_y.append(y_long)
self.stored_samples_w.append(w_long)
X_long, y_long, w_long = self._get_stored_batch()
return X_long, y_long, w_long
def _stack_batch(self, lst_batch):
nsparse = np.sum([isspmatrix(X) for X in lst_batch])
if nsparse > 0:
if nsparse < len(lst_batch):
warnings.warn("When passing mixed batches of sparse and non-sparse data, these are forced to dense.")
X_long = np.r_[tuple(lst_batch)]
else:
X_long = vstack(lst_batch)
else:
X_long = np.r_[tuple(lst_batch)]
return X_long
def _get_stored_batch(self):
if len(self.stored_samples_X) == 0:
raise ValueError("Unexpected error, please open an issue in GitHub explaining what you were doing.")
X_long, y_long = self._stack_batch(self.stored_samples_X), self._stack_batch(self.stored_samples_y)
sum_w_missing = np.sum([w is None for w in self.stored_samples_w])
if sum_w_missing == len(self.stored_samples_w):
w_long = None
else:
if sum_w_missing != 0:
warnings.warn("Passed batches with and without sample weights, missing weights will be set to 1.")
self.stored_samples_w = [self.stored_samples_w[w] if self.stored_samples_w[w] is not None else np.ones((self.stored_samples_X[w].shape[0], 1)) for w in range(len(self.stored_samples_w))]
w_long = self._stack_batch(self.stored_samples_w)
self._reset_saved_batch()
return X_long, y_long, w_long
def _reset_saved_batch(self):
self.stored_samples_X = list()
self.stored_samples_y = list()
self.stored_samples_w = list()
def _add_common_attributes(self, x0, batches_per_epoch, step_size, grad_fun, obj_fun, pred_fun, decr_step_size,
callback_epoch, callback_iter, valset_frac, tol, nepochs, kwargs_cb, random_state, shuffle_data,
verbose, use_grad_diff, use_float):
assert batches_per_epoch > 0
assert isinstance(batches_per_epoch, int)
assert step_size > 0
if decr_step_size == "auto":
decr_step_size = _step_size_sqrt
elif decr_step_size is None:
decr_step_size = _step_size_const
else:
if not callable(decr_step_size):
raise ValueError("'decr_step_size' must be a function taking as input the initial step size and the iteration number, starting at zero.")
decr_step_size = decr_step_size
msg_cb = "Callback must be a function taking as argument the values of 'x' and additional keyword arguments, or 'None'"
if callback_epoch is not None:
if not callable(callback_epoch):
raise ValueError(msg_cb)
if callback_iter is not None:
if not callable(callback_iter):
raise ValueError(msg_cb)
msg_fun = lambda fun_name, allow_None: "'" + fun_name + \
"' must be a function that takes as argument the variables values, X, y, sample_weight, and additional keyword arguments" +\
(", or 'None" if allow_None else "") + "."
if not callable(grad_fun):
raise ValueError(msg_fun("grad_fun", False))
if pred_fun is not None:
if not callable(pred_fun):
raise ValueError(msg_fun("pred_fun", True))
if valset_frac is not None:
assert valset_frac > 0
assert valset_frac < 1
assert tol > 0
if not callable(obj_fun):
raise ValueError(msg_fun("obj_fun", False))
assert nepochs > 0
assert isinstance(nepochs, int)
if kwargs_cb is not None:
assert isinstance(kwargs_cb, dict)
else:
kwargs_cb = dict()
if random_state is None:
random_state = 1
self.c_real_t = ctypes.c_float if use_float else ctypes.c_double
self.x = x0
self.n = self.x.shape[0]
self.step_size = step_size
self.obj_fun = obj_fun
self.pred_fun = pred_fun
self.grad_fun = grad_fun
self.callback_epoch = callback_epoch
self.callback_iter = callback_iter
self.tol = tol
self.nepochs = nepochs
self.batches_per_epoch = batches_per_epoch
self.decr_step_size = decr_step_size
self.kwargs_cb = kwargs_cb
self.valset_frac = valset_frac
self.random_state = random_state
self.use_float = bool(use_float)
self.verbose = bool(verbose)
self.shuffle_data = bool(shuffle_data)
self.use_grad_diff = bool(use_grad_diff)
self.epoch = 0
self.req = self.optimizer.run_optimizer(self.x, self.step_size)
if self.x.dtype != self.c_real_t:
raise ValueError("'x0' has wrong dtype.")
if len(self.x.shape) > 1:
raise ValueError("'x0' must be a 1-dimensional array.")
if self.optimizer_name != "oLBFGS":
### in case partial fit is used
self._reset_saved_batch()
else:
del self.use_grad_diff
def fit(self, X, y, sample_weight=None, additional_kwargs={}, valset=None):
"""
Fit model to sample data
Parameters
----------
X : array(n_samples, m)
Sample data to which to fit the model.
y : array(n_samples, )
Labels or target values for the sample data.
sample_weight : None or array(n_samples, )
Observations weights for the sample data.
additional_kwargs : dict
Additional keyword arguments to pass to the objective, gradient, and Hessian-vector functions.
valset : tuple(3)
User-provided validation set containing (X_val, y_val, sample_weight_val).
At the end of each epoch, will calculate objective function on this set, and if
the decrease from the objective function in the previous epoch is below tolerance,
will terminate procedure earlier.
If 'valset_frac' was provided and a validation set is passed, 'valset_frac' will be ignored.
Must provide objective function in order to use a validation set.
Returns
-------
self : obj
This object.
"""
X, y, sample_weight, additional_kwargs = self._check_fit_inputs(X, y, sample_weight, additional_kwargs, check_sp=True)
if valset is not None:
if self.obj_fun is None:
raise ValueError("Must provide objective function when using a validation set for monitoring.")
assert isinstance(valset, tuple)
assert len(valset) == 3
X_val, y_val, w_val = valset
X_val, y_val, w_val, additional_kwargs = self._check_fit_inputs(X_val, y_val, w_val, additional_kwargs, check_sp=False)
if self.valset_frac is not None:
warnings.warn("'valset_frac' is ignored when passign a validation set to '.fit'.")
elif self.valset_frac is not None:
if sample_weight is None:
X, X_val, y, y_val = train_test_split(X, y, test_size=self.valset_frac, random_state=self.random_state)
w_val = None
else:
X, X_val, y, y_val, sample_weight, w_val = train_test_split(X, y, sample_weight, test_size=self.valset_frac, random_state=self.random_state)
else:
X_val, y_val, w_val = None, None, None
obj_last_epoch = np.inf
print_term_msg = True if self.verbose else False
self.batch_size = int(np.ceil(X.shape[0] / self.batches_per_epoch))
for self.epoch in range(self.nepochs):
if self.shuffle_data:
np.random.seed(self.random_state + self.epoch)
rand_ord = np.argsort( np.random.random(size=X.shape[0]) )
X = X[rand_ord]
y = y[rand_ord]
sample_weight = sample_weight[rand_ord] if sample_weight is not None else None
for batch in range(self.batches_per_epoch):
st_batch_ix = batch * self.batch_size
end_batch_ix = min(X.shape[0], (batch + 1) * self.batch_size)
X_batch = X[st_batch_ix : end_batch_ix]
y_batch = y[st_batch_ix : end_batch_ix]
w_batch = sample_weight[st_batch_ix : end_batch_ix] if sample_weight is not None else None
self._fit_batch(X_batch, y_batch, w_batch, additional_kwargs, is_user_batch=False,
X_full=X, y_full=y, w_full=sample_weight, X_val=X_val, y_val=y_val, w_val=w_val, batch=batch)
if self.callback_epoch is not None:
self.callback_epoch(self.x, **self.kwargs_cb)
if X_val is not None and self.obj_fun is not None:
obj_this_epoch = self.obj_fun(self.x, X_val, y_val, sample_weight=w_val, **additional_kwargs)
if self.verbose:
print((self.optimizer_name + " - epoch: %2d, f(x): %12.4f") % (self.epoch + 1, obj_this_epoch) )
if (obj_last_epoch - obj_this_epoch) < self.tol and obj_this_epoch <= obj_last_epoch:
if self.verbose:
print(self.optimizer_name + " - Optimization procedure terminated (decrease below tolerance).")
print_term_msg = False
break
else:
obj_last_epoch = obj_this_epoch
if print_term_msg:
print(self.optimizer_name + " - Optimization procedure terminated (reached number of epochs).")
return self
def partial_fit(self, X, y, sample_weight=None, additional_kwargs={}):
"""
Update model with user-provided batches of data
Note
----
In SQN and adaQN, the data passed to all calls in partial fit will be stored in a limited-memory
container which will be used to calculate Hessian-vector products or large-batch gradients.
The size of this container is determined by the inputs 'batch_size' and 'bfgs_upd_freq' passed
in the constructor call.
Note
----
The step size in partial fit is determined by the number of optimizer iterations rather than the number
of epochs, thus for a given amount of data, the default step size will be much smaller than when calling 'fit'.
Recommended to provide a custom step size function ('decr_step_size' in the initialization), as otherwise the
step size sequence will be too small.
Parameters
----------
X : array(n_samples, m)
Sample data to with which to update the model.
y : array(n_samples, )
Labels or target values for the sample data.
sample_weight : None or array(n_samples, )
Observations weights for the sample data.
additional_kwargs : dict
Additional keyword arguments to pass to the objective, gradient, and Hessian-vector functions.
Returns
-------
self : obj
This object.
"""
X, y, sample_weight, additional_kwargs = self._check_fit_inputs(X, y, sample_weight, additional_kwargs, check_sp=False)
save_batch = False
if self.optimizer_name == "SQN":
save_batch = True
elif self.optimizer_name == "adaQN":
if self.use_grad_diff or (self.optimizer.max_incr > 0 if self.optimizer.max_incr is not None else False):
save_batch = True
if save_batch:
self.stored_samples_X.append(X)
self.stored_samples_y.append(y)
self.stored_samples_w.append(sample_weight)
self._fit_batch(X, y, sample_weight, additional_kwargs, is_user_batch=True)
return self
def _fit_batch(self, X_batch, y_batch, w_batch, additional_kwargs, is_user_batch=False,
X_full=None, y_full=None, w_full=None, X_val=None, y_val=None, w_val=None, batch=None):
while True:
if self.req["task"] == "calc_grad" or self.req["task"] == "calc_grad_same_batch":
self.optimizer.update_gradient(self.grad_fun(self.req["requested_on"], X_batch, y_batch, sample_weight=w_batch, **additional_kwargs))
else:
if self.req["task"] == "calc_fun_val_batch" and X_val is not None:
self.optimizer.update_function(self.obj_fun(self.req["requested_on"], X_val, y_val, sample_weight=w_val, **additional_kwargs))
else:
if is_user_batch:
X_long, y_long, w_long = self._get_stored_batch()
else:
X_long, y_long, w_long = self._get_long_batch(X_full, y_full, w_full, batch)
if self.req["task"] == "calc_grad_big_batch":
self.optimizer.update_gradient(self.grad_fun(self.req["requested_on"], X_long, y_long, sample_weight=w_long, **additional_kwargs))
elif self.req["task"] == "calc_hess_vec":
self.optimizer.update_hess_vec(self.hess_vec_fun(self.req["requested_on"][0], self.req["requested_on"][1], X_long, y_long, sample_weight=w_long, **additional_kwargs))
elif self.req["task"] == "calc_fun_val_batch":
self.optimizer.update_function(self.obj_fun(self.req["requested_on"], X_long, y_long, sample_weight=w_long, **additional_kwargs))
else:
raise ValueError("Unexpected error. Please open an issue in GitHub explaining what you were doing.")
if is_user_batch:
step_size = self.decr_step_size(self.step_size, self.niter)
else:
step_size = self.decr_step_size(self.step_size, self.epoch)
self.req = self.optimizer.run_optimizer(self.x, step_size)
if self.verbose:
if self.req["info"]["iteration_info"] != "no_problems_encountered":
if is_user_batch:
print( (self.optimizer_name + " - at iteration %3d: " + self.req["info"]["iteration_info"]) % self.niter )
else:
print( (self.optimizer_name + " - at iteration %3d, epoch %2d: " + self.req["info"]["iteration_info"]) % (self.niter, self.epoch + 1) )
if self.req["task"] == "calc_grad":
if self.callback_iter is not None:
self.callback_iter(self.x, **self.kwargs_cb)
break
def predict(self, X, additional_kwargs={}):
"""
Make predictions on new data
Note
----
Using this method requires passing 'pred_fun' in the initialization.
Parameters
----------
X : array(n_samples, m)
New data to pass to user-provided predict function.
additional_kwargs : dict
Additional keyword arguments to pass to user-provided predict function.
"""
if self.pred_fun is None:
raise ValueError("Must supply predict function in order to call this method.")
else:
return self.pred_fun(self.x, X, **additional_kwargs)
def get_x(self):
"""
Get a copy of current values of the variables
Returns
-------
x : array(n, )
Current variable values.
"""
return self.x.copy()
#### Optimizers
[docs]class oLBFGS(_StochQN):
"""
oLBFGS optimizer
Optimizes an empirical (convex) loss function over batches of sample data.
Parameters
----------
x0 : array (m, )
Initial values of the variables to optimize (refered hereafter as 'x').
grad_fun : function(x, X, y, sample_weight, **kwargs) --> array(m, )
Function that calculates the empirical gradient at values 'x' on data 'X' and 'y'.
Note: output must be one-dimensional and with the same number of entries as 'x',
otherwise the Python session might segfault.
(The extra keyword arguments are passed in the 'fit' method, not here)
obj_fun : function(x, X, y, sample_weight, **kwargs) --> float
Function that calculates the empirical objective value at values 'x' on data 'X' and 'y'.
Only used when using a validation set ('valset_frac' not None, or 'valset' passed to fit).
Ignored when fitting the data in user-provided batches.
(The extra keyword arguments are passed in the 'fit' method, not here)
pred_fun : None or function(xopt, X)
Prediction function taking as input the optimal 'x' values as obtained by the
optimization procedure, and new observation 'X' on which to make predictions.
If passed, will have an additional method oLBFGS.predict(X, *args) that calls
this function with current values of 'x'.
batches_per_epoch : int
Number of batches per epoch (each batch will have the same number of observations except for the
last one which might be smaller).
step_size : float
Initial step size to use.
(Can be modified after object is already initialized)
decr_step_size : str "auto", None, or function(initial_step_size, epoch) -> float
Function that determines the step size during each epoch, taking as input the initial
step size and the epoch number (starting at zero).
If "auto", will use 1/sqrt(iteration).
If None, will use constant step size.
For 'partial_fit', it will take as input the number of iterations of the algorithm rather than epoch,
so it's very recommended to provide a custom function when passing data in user-provided batches.
Can be modified after the object has been initialized (oLBFGS.decr_step_size)
shuffle_data : bool
Whether to shuffle the data at the beginning of each epoch.
random_state : int
Random seed to use for shuffling data and selecting validation set.
The algorithm is deterministic so it's not used for anything else.
nepochs : int
Number of epochs for which to run the optimization procedure.
Might terminate earlier if using a validation set for monitoring.
valset_frac : float(0, 1) or None
Percent of the data to use as validation set for early stopping.
Can also pass a user-provided validation set to 'fit', in which case it will
be ignored.
If passing None, will run for the number of epochs passed in 'nepochs'.
tol : float
If the objective function calculated on the validation set decrease by less than
'tol' upon completion of an epoch, will terminate the optimization procedure.
Ignored when not using a validation set.
callback_epoch : None or function*(x, **kwargs)
Callback function to call at the end of each epoch
callback_iter : None or function*(x, **kwargs)
Callback function to call at the end of each iteration
kwargs_cb : tuple
Additional arguments to pass to 'callback' and 'stop_crit'.
(Can be modified after object is already initialized)
verbose : bool
Whether to print messages when there is some problem during an iteration
(e.g. correction pair not meeting minum curvature).
mem_size : int
Number of correction pairs to store for approximation of Hessian-vector products.
hess_init : float or None
value to which to initialize the diagonal of H0.
If passing 0, will use the same initializion as for SQN (s_last*y_last / y_last*y_last).
min_curvature : float or None
Minimum value of s*y / s*s in order to accept a correction pair.
y_reg : float or None
regularizer for 'y' vector (gets added y_reg * s)
check_nan : bool
Whether to check for variables becoming NaN after each iteration, and reverting the step if they do
(will also reset BFGS memory).
nthreads : int
Number of parallel threads to use. If set to -1, will determine the number of available threads and use
all of them. Note however that not all the computations can be parallelized.
use_float : bool
Whether to use C 'float' type (np.float32). If 'False' (the default), will use 'double' type (np.float64).
The variables and gradient must be of this same dtype.
References
----------
.. [1] Schraudolph, N.N., Yu, J. and Günter, S., 2007, March.
"A stochastic quasi-Newton method for online convex optimization."
In Artificial Intelligence and Statistics (pp. 436-443).
"""
def __init__(self, x0, grad_fun, obj_fun=None, pred_fun=None, batches_per_epoch=25, step_size=1e-3, decr_step_size="auto",
shuffle_data=True, random_state=1, nepochs=25, valset_frac=None, tol=1e-1,
callback_epoch=None, callback_iter=None, kwargs_cb={}, verbose=True,
mem_size=10, hess_init=None, min_curvature=1e-4, y_reg=None, check_nan=True, nthreads=-1, use_float=False):
self.optimizer_name = "oLBFGS"
self.optimizer = oLBFGS_free(mem_size, hess_init, min_curvature, y_reg, check_nan, nthreads, use_float)
use_grad_diff = True
self._add_common_attributes(x0, batches_per_epoch, step_size, grad_fun, obj_fun, pred_fun, decr_step_size,
callback_epoch, callback_iter, valset_frac, tol, nepochs, kwargs_cb, random_state, shuffle_data,
verbose, use_grad_diff, use_float)
@property
def niter(self):
return self.optimizer._oLBFGS.niter
[docs]class SQN(_StochQN):
"""
SQN optimizer
Optimizes an empirical (convex) loss function over batches of sample data.
Parameters
----------
x0 : array (m, )
Initial values of the variables to optimize (refered hereafter as 'x').
grad_fun : function(x, X, y, sample_weight, **kwargs) --> array(m, )
Function that calculates the empirical gradient at values 'x' on data 'X' and 'y'.
Note: output must be one-dimensional and with the same number of entries as 'x',
otherwise the Python session might segfault.
(The extra keyword arguments are passed in the 'fit' method, not here)
obj_fun : function(x, X, y, sample_weight, **kwargs) --> float
Function that calculates the empirical objective value at values 'x' on data 'X' and 'y'.
Only used when using a validation set ('valset_frac' not None, or 'valset' passed to fit).
Ignored when fitting the data in user-provided batches.
(The extra keyword arguments are passed in the 'fit' method, not here)
hess_vec_fun : function(x, vec, X, y, sample_weight, **kwargs) --> array(m, )
Function that calculates the product of a vector the empirical Hessian at values 'x' on data 'X' and 'y'.
Ignored when using 'use_grad_diff=True'.
Note: output must be one-dimensional and with the same number of entries as 'x',
otherwise the Python session might segfault.
These products are calculated on a larger batch than the gradients (given by batch_size * bfgs_upd_freq).
(The extra keyword arguments are passed in the 'fit' method, not here)
pred_fun : None or function(xopt, X)
Prediction function taking as input the optimal 'x' values as obtained by the
optimization procedure, and new observation 'X' on which to make predictions.
If passed, will have an additional method oLBFGS.predict(X, *args) that calls
this function with current values of 'x'.
batches_per_epoch : int
Number of batches per epoch (each batch will have the same number of observations except for the
last one which might be smaller).
step_size : float
Initial step size to use.
(Can be modified after object is already initialized)
decr_step_size : str "auto", None, or function(initial_step_size, epoch) -> float
Function that determines the step size during each epoch, taking as input the initial
step size and the epoch number (starting at zero).
If "auto", will use 1/sqrt(iteration).
If None, will use constant step size.
For 'partial_fit', it will take as input the number of iterations of the algorithm rather than epoch,
so it's very recommended to provide a custom function when passing data in user-provided batches.
Can be modified after the object has been initialized (oLBFGS.decr_step_size)
shuffle_data : bool
Whether to shuffle the data at the beginning of each epoch.
random_state : int
Random seed to use for shuffling data and selecting validation set.
The algorithm is deterministic so it's not used for anything else.
nepochs : int
Number of epochs for which to run the optimization procedure.
Might terminate earlier if using a validation set for monitoring.
valset_frac : float(0, 1) or None
Percent of the data to use as validation set for early stopping.
Can also pass a user-provided validation set to 'fit', in which case it will
be ignored.
If passing None, will run for the number of epochs passed in 'nepochs'.
tol : float
If the objective function calculated on the validation set decrease by less than
'tol' upon completion of an epoch, will terminate the optimization procedure.
Ignored when not using a validation set.
callback_epoch : None or function*(x, **kwargs)
Callback function to call at the end of each epoch
callback_iter : None or function*(x, **kwargs)
Callback function to call at the end of each iteration
kwargs_cb : tuple
Additional arguments to pass to 'callback' and 'stop_crit'.
(Can be modified after object is already initialized)
verbose : bool
Whether to print messages when there is some problem during an iteration
(e.g. correction pair not meeting minum curvature).
mem_size : int
Number of correction pairs to store for approximation of Hessian-vector products.
bfgs_upd_freq : int
Number of iterations (batches) after which to generate a BFGS correction pair.
min_curvature : float or None
Minimum value of s*y / s*s in order to accept a correction pair.
y_reg : float or None
regularizer for 'y' vector (gets added y_reg * s)
use_grad_diff : bool
Whether to create the correction pairs using differences between gradients instead of Hessian-vector products.
These gradients are calculated on a larger batch than the regular ones (given by batch_size * bfgs_upd_freq).
check_nan : bool
Whether to check for variables becoming NaN after each iteration, and reverting the step if they do
(will also reset BFGS memory).
nthreads : int
Number of parallel threads to use. If set to -1, will determine the number of available threads and use
all of them. Note however that not all the computations can be parallelized.
use_float : bool
Whether to use C 'float' type (np.float32). If 'False' (the default), will use 'double' type (np.float64).
The variables, gradient, and hessian-vector must be of this same dtype.
References
----------
.. [1] Byrd, R.H., Hansen, S.L., Nocedal, J. and Singer, Y., 2016. "A stochastic quasi-Newton method for large-scale optimization."
SIAM Journal on Optimization, 26(2), pp.1008-1031.
.. [2] Wright, S. and Nocedal, J., 1999. "Numerical optimization." (ch 7)
Springer Science, 35(67-68), p.7.
"""
def __init__(self, x0, grad_fun, obj_fun=None, hess_vec_fun=None, pred_fun=None, batches_per_epoch=25, step_size=1e-3, decr_step_size="auto",
shuffle_data=True, random_state=1, nepochs=25, valset_frac=None, tol=1e-1,
callback_epoch=None, callback_iter=None, kwargs_cb={}, verbose=True,
mem_size=10, bfgs_upd_freq=20, min_curvature=1e-4, y_reg=None, use_grad_diff=False, check_nan=True, nthreads=-1, use_float=False):
if not use_grad_diff and hess_vec_fun is None:
raise ValueError("If not using 'use_grad_diff', must provide function that evaluates Hessian-vector product.")
if hess_vec_fun is not None:
if use_grad_diff:
warnings.warn("Hessian-vector function is ignored when passing 'use_grad_diff=True'.")
use_grad_diff = None
else:
if not callable(hess_vec_fun):
raise ValueError("'hess_vec_fun' must be a function that takes as input the values of 'x' and a vector, returning the Hessian-vector product.")
self.optimizer_name = "SQN"
self.optimizer = SQN_free(mem_size, bfgs_upd_freq, min_curvature, y_reg, use_grad_diff, check_nan, nthreads, use_float)
self._add_common_attributes(x0, batches_per_epoch, step_size, grad_fun, obj_fun, pred_fun, decr_step_size,
callback_epoch, callback_iter, valset_frac, tol, nepochs, kwargs_cb, random_state, shuffle_data,
verbose, use_grad_diff, use_float)
self.hess_vec_fun = hess_vec_fun
@property
def niter(self):
return self.optimizer._SQN.niter
[docs]class adaQN(_StochQN):
"""
adaQN optimizer
Optimizes an empirical (possibly non-convex) loss function over batches of sample data.
Parameters
----------
x0 : array (m, )
Initial values of the variables to optimize (refered hereafter as 'x').
grad_fun : function(x, X, y, sample_weight, **kwargs) --> array(m, )
Function that calculates the empirical gradient at values 'x' on data 'X' and 'y'.
Note: output must be one-dimensional and with the same number of entries as 'x',
otherwise the Python session might segfault.
(The extra keyword arguments are passed in the 'fit' method, not here)
obj_fun : function(x, X, y, sample_weight, **kwargs) --> float
Function that calculates the empirical objective value at values 'x' on data 'X' and 'y'.
Will be ignored if passing 'max_incr=None' and no validation set
('valset_frac=None', and no 'valset' passed to fit).
(The extra keyword arguments are passed in the 'fit' method, not here)
pred_fun : None or function(xopt, X)
Prediction function taking as input the optimal 'x' values as obtained by the
optimization procedure, and new observation 'X' on which to make predictions.
If passed, will have an additional method oLBFGS.predict(X, *args) that calls
this function with current values of 'x'.
batches_per_epoch : int
Number of batches per epoch (each batch will have the same number of observations except for the
last one which might be smaller).
step_size : float
Initial step size to use.
(Can be modified after object is already initialized)
decr_step_size : str "auto", None, or function(initial_step_size, epoch) -> float
Function that determines the step size during each epoch, taking as input the initial
step size and the epoch number (starting at zero).
If "auto", will use 1/sqrt(iteration).
If None, will use constant step size.
For 'partial_fit', it will take as input the number of iterations of the algorithm rather than epoch,
so it's very recommended to provide a custom function when passing data in user-provided batches.
Can be modified after the object has been initialized (oLBFGS.decr_step_size)
shuffle_data : bool
Whether to shuffle the data at the beginning of each epoch.
random_state : int
Random seed to use for shuffling data and selecting validation set.
The algorithm is deterministic so it's not used for anything else.
nepochs : int
Number of epochs for which to run the optimization procedure.
Might terminate earlier if using a validation set for monitoring.
valset_frac : float(0, 1) or None
Percent of the data to use as validation set for early stopping.
Can also pass a user-provided validation set to 'fit', in which case it will
be ignored.
If passing None, will run for the number of epochs passed in 'nepochs'.
tol : float
If the objective function calculated on the validation set decrease by less than
'tol' upon completion of an epoch, will terminate the optimization procedure.
Ignored when not using a validation set.
callback_epoch : None or function*(x, **kwargs)
Callback function to call at the end of each epoch
callback_iter : None or function*(x, **kwargs)
Callback function to call at the end of each iteration
kwargs_cb : tuple
Additional arguments to pass to 'callback' and 'stop_crit'.
(Can be modified after object is already initialized)
verbose : bool
Whether to print messages when there is some problem during an iteration
(e.g. correction pair not meeting minum curvature).
mem_size : int
Number of correction pairs to store for approximation of Hessian-vector products.
fisher_size : int or None
Number of gradients to store for calculation of the empirical Fisher product with gradients.
If passing 'None', will force 'use_grad_diff' to 'True'.
bfgs_upd_freq : int
Number of iterations (batches) after which to generate a BFGS correction pair.
max_incr : float or None
Maximum ratio of function values in the validation set under the average values of 'x' during current epoch
vs. previous epoch. If the ratio is above this threshold, the BFGS and Fisher memories will be reset, and 'x'
values reverted to their previous average.
If not using a validation set, will take a longer batch for function evaluations (same as used for gradients
when using 'use_grad_diff=True').
min_curvature : float or None
Minimum value of s*y / s*s in order to accept a correction pair.
y_reg : float or None
regularizer for 'y' vector (gets added y_reg * s)
scal_reg : float
Regularization parameter to use in the denominator for AdaGrad and RMSProp scaling.
rmsprop_weight : float(0,1) or None
If not 'None', will use RMSProp formula instead of AdaGrad for approximated inverse-Hessian initialization.
(Recommended to use lower initial step size + passing 'decr_step_size')
use_grad_diff : bool
Whether to create the correction pairs using differences between gradients instead of Fisher matrix.
These gradients are calculated on a larger batch than the regular ones (given by batch_size * bfgs_upd_freq).
If 'True', fisher_size will be set to None, and empirical Fisher matrix will not be used.
check_nan : bool
Whether to check for variables becoming NaN after each iteration, and reverting the step if they do
(will also reset BFGS memory).
nthreads : int
Number of parallel threads to use. If set to -1, will determine the number of available threads and use
all of them. Note however that not all the computations can be parallelized.
use_float : bool
Whether to use C 'float' type (np.float32). If 'False' (the default), will use 'double' type (np.float64).
The variables and gradient must be of this same dtype.
References
----------
.. [1] Keskar, N.S. and Berahas, A.S., 2016, September. "adaQN: An Adaptive Quasi-Newton Algorithm for Training RNNs."
In Joint European Conference on Machine Learning and Knowledge Discovery in Databases (pp. 1-16). Springer, Cham.
.. [2] Wright, S. and Nocedal, J., 1999. "Numerical optimization." (ch 7)
Springer Science, 35(67-68), p.7.
"""
def __init__(self, x0, grad_fun, obj_fun=None, pred_fun=None, batches_per_epoch=25, step_size=1e-1, decr_step_size=None,
shuffle_data=True, random_state=1, nepochs=25, valset_frac=None, tol=1e-1,
callback_epoch=None, callback_iter=None, kwargs_cb={}, verbose=True,
mem_size=10, fisher_size=100, bfgs_upd_freq=20, max_incr=1.01, min_curvature=1e-4, y_reg=None,
scal_reg=1e-4, rmsprop_weight=None, use_grad_diff=False, check_nan=True, nthreads=-1, use_float=False):
if max_incr is not None:
if obj_fun is None:
raise ValueError("Must provide objective function when passing 'max_incr'.")
if use_grad_diff and fisher_size is not None:
warnings.warn("'fisher_size' ignored when using 'use_grad_diff=True'.")
if fisher_size is None:
use_grad_diff = True
self.optimizer_name = "adaQN"
self.optimizer = adaQN_free(mem_size, fisher_size, bfgs_upd_freq, max_incr, min_curvature, scal_reg,
rmsprop_weight, y_reg, use_grad_diff, check_nan, nthreads, use_float)
self._add_common_attributes(x0, batches_per_epoch, step_size, grad_fun, obj_fun, pred_fun, decr_step_size,
callback_epoch, callback_iter, valset_frac, tol, nepochs, kwargs_cb, random_state, shuffle_data,
verbose, use_grad_diff, use_float)
@property
def niter(self):
return self.optimizer._adaQN.niter
#############################
class _BFGS_mem_holder:
def __init__(self, mem_size, n, min_curvature, y_reg, upd_freq, use_float):
c_real_t = ctypes.c_float if use_float else ctypes.c_double
self.s_mem = np.empty((n * mem_size), dtype = c_real_t)
self.y_mem = np.empty((n * mem_size), dtype = c_real_t)
self.buffer_rho = np.empty(mem_size, dtype = c_real_t)
self.buffer_alpha = np.empty(mem_size, dtype = c_real_t)
if min_curvature > 0:
self.s_bak = np.empty(n, dtype = c_real_t)
self.y_bak = np.empty(n, dtype = c_real_t)
else:
self.s_bak = np.empty(1, dtype = c_real_t)
self.y_bak = np.empty(1, dtype = c_real_t)
self.mem_size = ctypes.c_size_t(mem_size).value
self.mem_used = ctypes.c_size_t(0).value
self.mem_st_ix = ctypes.c_size_t(0).value
self.upd_freq = ctypes.c_size_t(upd_freq).value
self.y_reg = c_real_t(y_reg).value
self.min_curvature = c_real_t(min_curvature).value
class _Fisher_mem_holder:
def __init__(self, mem_size, n, use_float):
c_real_t = ctypes.c_float if use_float else ctypes.c_double
self.F = np.empty((n * mem_size), dtype = c_real_t)
self.buffer_y = np.empty(mem_size, dtype = c_real_t)
self.mem_size = ctypes.c_size_t(mem_size).value
self.mem_used = ctypes.c_size_t(0).value
self.mem_st_ix = ctypes.c_size_t(0).value
class _oLBFGS_holder:
def __init__(self, n, mem_size, hess_init, y_reg, min_curvature, check_nan, nthreads, use_float):
c_real_t = ctypes.c_float if use_float else ctypes.c_double
self.BFGS_mem = _BFGS_mem_holder(mem_size, n, min_curvature, y_reg, 1, use_float)
self.grad_prev = np.empty(n, dtype = c_real_t)
self.hess_init = c_real_t(hess_init).value
self.niter = ctypes.c_int(0).value
self.section = ctypes.c_int(0).value
self.nthreads = ctypes.c_int(nthreads).value
self.check_nan = ctypes.c_int(bool(check_nan)).value
self.n = ctypes.c_int(n).value
class _SQN_holder:
def __init__(self, n, mem_size, bfgs_upd_freq, min_curvature,
use_grad_diff, y_reg, check_nan, nthreads, use_float):
c_real_t = ctypes.c_float if use_float else ctypes.c_double
self.BFGS_mem = _BFGS_mem_holder(mem_size, n, min_curvature, y_reg, bfgs_upd_freq, use_float)
if use_grad_diff:
self.grad_prev = np.empty(n, dtype = c_real_t)
else:
self.grad_prev = np.empty(1, dtype = c_real_t)
self.x_sum = np.zeros(n, dtype = c_real_t)
self.x_avg_prev = np.empty(n, dtype = c_real_t)
self.use_grad_diff = ctypes.c_int(use_grad_diff).value
self.niter = ctypes.c_int(0).value
self.section = ctypes.c_int(0).value
self.nthreads = ctypes.c_int(nthreads).value
self.check_nan = ctypes.c_int(bool(check_nan)).value
self.n = ctypes.c_int(n).value
class _adaQN_holder:
def __init__(self, n, mem_size, fisher_size, bfgs_upd_freq,
max_incr, min_curvature, scal_reg, rmsprop_weight,
use_grad_diff, y_reg, check_nan, nthreads, use_float):
c_real_t = ctypes.c_float if use_float else ctypes.c_double
self.BFGS_mem = _BFGS_mem_holder(mem_size, n, min_curvature, y_reg, bfgs_upd_freq, use_float)
if use_grad_diff:
self.Fisher_mem = _Fisher_mem_holder(1, 1, use_float)
self.grad_prev = np.empty(n, dtype = c_real_t)
else:
self.Fisher_mem = _Fisher_mem_holder(fisher_size, n, use_float)
self.grad_prev = np.empty(1, dtype = c_real_t)
self.H0 = np.empty(n, dtype = c_real_t)
self.x_sum = np.zeros(n, dtype = c_real_t)
self.x_avg_prev = np.empty(n, dtype = c_real_t)
self.grad_sum_sq = np.zeros(n, dtype = c_real_t)
self.max_incr = c_real_t(max_incr).value
self.scal_reg = c_real_t(scal_reg).value
self.rmsprop_weight = c_real_t(rmsprop_weight).value
self.use_grad_diff = ctypes.c_int(use_grad_diff).value
self.f_prev = c_real_t(0).value
self.niter = ctypes.c_int(0).value
self.section = ctypes.c_int(0).value
self.nthreads = ctypes.c_int(nthreads).value
self.check_nan = ctypes.c_int(bool(check_nan)).value
self.n = ctypes.c_int(n).value
#####################
class _StochQN_free:
def _take_common_inputs(self, mem_size, min_curvature, y_reg, check_nan, nthreads, use_float):
assert mem_size > 0
assert isinstance(mem_size, int)
if min_curvature is not None:
assert min_curvature > 0
else:
min_curvature = 0
if y_reg is not None:
assert y_reg > 0
else:
y_reg = 0
if nthreads is None:
nthreads = 1
if nthreads <= 0:
nthreads = multiprocessing.cpu_count()
assert isinstance(nthreads, int)
assert nthreads >= 1
self.mem_size = mem_size
self.min_curvature = min_curvature
self.y_reg = y_reg
self.check_nan = bool(check_nan)
self.nthreads = nthreads
self.use_float = bool(use_float)
self.c_real_t = ctypes.c_float if self.use_float else ctypes.c_double
def update_gradient(self, gradient):
"""
Pass requested gradient to optimizer
Parameters
----------
gradient : array(m, )
Gradient calculated as requested, evaluated at values given in "requested_on",
calcualted either in a regular batch (task = "calc_grad"), same batch as before
(task = "calc_grad_same_batch" - oLBFGS only), or a larger batch of data (task = "calc_grad_big_batch"), perhaps
including all the cases from the last such calculation (SQN and adaQN with 'use_grad_diff=True').
"""
if gradient.dtype != self.c_real_t:
gradient = gradient.astype(self.c_real_t)
if len(gradient.shape) > 1:
gradient = gradient.reshape(-1)
self.gradient[:] = gradient
[docs]class oLBFGS_free(_StochQN_free):
"""
oLBFGS optimizer (free mode)
Optimizes an empirical (convex) loss function over batches of sample data. Compared to
class 'oLBFGS', this version lets the user do all the calculations from the outside, only
interacting with the object by means of a function that returns a request type and is fed the
required calculation through a method 'update_gradient'.
Order in which requests are made:
========== loop ===========
* calc_grad
* calc_grad_same_batch (might skip if using check_nan)
===========================
Parameters
----------
mem_size : int
Number of correction pairs to store for approximation of Hessian-vector products.
hess_init : float or None
value to which to initialize the diagonal of H0.
If passing 'None', will use the same initializion as for SQN (s_last*y_last / y_last*y_last).
min_curvature : float or None
Minimum value of s*y / s*s in order to accept a correction pair.
y_reg : float or None
Regularizer for 'y' vector (gets added y_reg * s).
check_nan : bool
Whether to check for variables becoming NaN after each iteration, and reverting the step if they do
(will also reset BFGS memory).
nthreads : int
Number of parallel threads to use. If set to -1, will determine the number of available threads and use
all of them. Note however that not all the computations can be parallelized.
use_float : bool
Whether to use C 'float' type (np.float32). If 'False' (the default), will use 'double' type (np.float64).
The variables and gradient must be of this same dtype.
"""
def __init__(self, mem_size=10, hess_init=None, min_curvature=1e-4, y_reg=None,
check_nan=True, nthreads=-1, use_float=False):
self._take_common_inputs(mem_size, min_curvature, y_reg, check_nan, nthreads, use_float)
if hess_init is not None:
assert hess_init > 0
else:
hess_init = 0
self.hess_init = hess_init
self.initialized = False
def _initialize(self, n):
self._oLBFGS = _oLBFGS_holder(n, self.mem_size, self.hess_init, self.y_reg, self.min_curvature,
self.check_nan, self.nthreads, self.use_float)
self.gradient = np.empty(n, dtype = self.c_real_t)
self.initialized = True
[docs] def run_optimizer(self, x, step_size):
"""
Continue optimization process after supplying the calculation requested from the last run
Continue the optimization process from where it was left since the last calculation was
requested. Will internally do all the updates that are possible until the moment some
calculation of function/gradient/hessian-vector is required.
Note
----
The first time this is run, no calculation needs to be supplied.
Parameters
----------
x : array(m, )
Current values of the variables. Will be modified in-place.
Do NOT modify the values between runs.
step_size : float
Step size for the next update (note that variables are not updated during all runs).
Returns
-------
request : dict
Dictionary with the calculation required to proceed and iteration information.
Structure:
* task : str - one of "calc_grad", "calc_grad_same_batch" (oLBFGS w. 'min_curvature' or 'check_nan'),
"calc_hess_vec" (SQN wo. 'use_grad_diff'), "calc_fun_val_batch" (adaQN w. 'max_incr'),
"calc_grad_big_batch" (SQN and adaQN w. 'use_grad_diff').
* requested_on : array(m, ) or tuple(array(m, ), array(m, )), containing the values on which
the request in "task" has to be evaluated. In the case of Hessian-vector products (SQN), the
first vector is the values of 'x' and the second is the vector with which the product is required.
* info : dict(x_changed_in_run : bool, iteration_number : int, iteration_info : str),
iteration_info can be one of "no_problems_encountered", "search_direction_was_nan",
"func_increased", "curvature_too_small".
"""
assert isinstance(x, np.ndarray)
if x.dtype != self.c_real_t:
raise ValueError("x' has wrong dtype.")
if not self.initialized:
self._initialize(x.shape[0])
c_funs = _wrapper_float if self.use_float else _wrapper_double
x_changed, niter, section, \
mem_used, mem_st_ix, \
task, iter_info, req_arr = c_funs.py_run_oLBFGS(self._oLBFGS, x, self.gradient, step_size)
self._oLBFGS.niter = ctypes.c_size_t(niter).value
self._oLBFGS.section = ctypes.c_int(section).value
self._oLBFGS.BFGS_mem.mem_used = ctypes.c_size_t(mem_used).value
self._oLBFGS.BFGS_mem.mem_st_ix = ctypes.c_size_t(mem_st_ix).value
out = {
"task" : task_dct[task],
"requested_on" : req_arr,
"info" : {
"x_changed_in_run" : bool(x_changed),
"iteration_number" : niter,
"iteration_info" : info_dct[iter_info]
}
}
return out
[docs]class SQN_free(_StochQN_free):
"""
SQN optimizer (free mode)
Optimizes an empirical (convex) loss function over batches of sample data. Compared to
class 'SQN', this version lets the user do all the calculations from the outside, only
interacting with the object by means of a function that returns a request type and is fed the
required calculation through methods 'update_gradient' and 'update_hess_vec'.
Order in which requests are made:
========== loop ===========
* calc_grad
... (repeat calc_grad)
if 'use_grad_diff':
* calc_grad_big_batch
else:
* calc_hess_vec
===========================
Parameters
----------
mem_size : int
Number of correction pairs to store for approximation of Hessian-vector products.
bfgs_upd_freq : int
Number of iterations (batches) after which to generate a BFGS correction pair.
min_curvature : float or None
Minimum value of s*y / s*s in order to accept a correction pair.
y_reg : float or None
Regularizer for 'y' vector (gets added y_reg * s).
use_grad_diff : bool
Whether to create the correction pairs using differences between gradients instead of Hessian-vector products.
These gradients are calculated on a larger batch than the regular ones (given by batch_size * bfgs_upd_freq).
check_nan : bool
Whether to check for variables becoming NaN after each iteration, and reverting the step if they do
(will also reset BFGS memory).
nthreads : int
Number of parallel threads to use. If set to -1, will determine the number of available threads and use
all of them. Note however that not all the computations can be parallelized.
use_float : bool
Whether to use C 'float' type (np.float32). If 'False' (the default), will use 'double' type (np.float64).
The variables and gradient must be of this same dtype.
"""
def __init__(self, mem_size=10, bfgs_upd_freq=20, min_curvature=1e-4, y_reg=None, use_grad_diff=False,
check_nan=True, nthreads=-1, use_float=False):
self._take_common_inputs(mem_size, min_curvature, y_reg, check_nan, nthreads, use_float)
assert bfgs_upd_freq > 0
self.bfgs_upd_freq = int(bfgs_upd_freq)
self.use_grad_diff = bool(use_grad_diff)
self.initialized = False
def _initialize(self, n):
self._SQN = _SQN_holder(n, self.mem_size, self.bfgs_upd_freq, self.min_curvature,
self.use_grad_diff, self.y_reg, self.check_nan, self.nthreads, self.use_float)
self.gradient = np.empty(n, dtype = self.c_real_t)
if not self.use_grad_diff:
self.hess_vec = np.empty(n, dtype = self.c_real_t)
else:
self.hess_vec = np.empty(1, dtype = self.c_real_t)
self.initialized = True
[docs] def update_hess_vec(self, hess_vec):
"""
Pass requested Hessian-vector product to optimizer (task = "calc_hess_vec")
Parameters
----------
hess_vec : array(m, )
Product of the Hessian evaluated at "requested_on"[0] with the vector "requested_on"[1],
calculated a larger batch of data than the gradient, perhaps including all the cases from the last such calculation.
"""
if hess_vec.dtype != self.c_real_t:
hess_vec = hess_vec.astype(self.c_real_t)
if len(hess_vec.shape) > 1:
hess_vec = hess_vec.reshape(-1)
self.hess_vec[:] = hess_vec
[docs] def run_optimizer(self, x, step_size):
"""
Continue optimization process after supplying the calculation requested from the last run
Continue the optimization process from where it was left since the last calculation was
requested. Will internally do all the updates that are possible until the moment some
calculation of function/gradient/hessian-vector is required.
Note
----
The first time this is run, no calculation needs to be supplied.
Parameters
----------
x : array(m, )
Current values of the variables. Will be modified in-place.
step_size : float
Step size for the next update (note that variables are not updated during all runs).
Returns
-------
request : dict
Dictionary with the calculation required to proceed and iteration information.
Structure:
* task : str - one of "calc_grad", "calc_grad_same_batch" (oLBFGS w. 'min_curvature' or 'check_nan'),
"calc_hess_vec" (SQN wo. 'use_grad_diff'), "calc_fun_val_batch" (adaQN w. 'max_incr'),
"calc_grad_big_batch" (SQN and adaQN w. 'use_grad_diff').
* requested_on : array(m, ) or tuple(array(m, ), array(m, )), containing the values on which
the request in "task" has to be evaluated. In the case of Hessian-vector products (SQN), the
first vector is the values of 'x' and the second is the vector with which the product is required.
* info : dict(x_changed_in_run : bool, iteration_number : int, iteration_info : str),
iteration_info can be one of "no_problems_encountered", "search_direction_was_nan",
"func_increased", "curvature_too_small".
"""
assert isinstance(x, np.ndarray)
if x.dtype != self.c_real_t:
raise ValueError("x' has wrong dtype.")
if not self.initialized:
self._initialize(x.shape[0])
c_funs = _wrapper_float if self.use_float else _wrapper_double
x_changed, niter, section, \
mem_used, mem_st_ix, \
task, iter_info, req, req_vec = c_funs.py_run_SQN(self._SQN, x, step_size, self.gradient, self.hess_vec)
self._SQN.niter = ctypes.c_size_t(niter).value
self._SQN.section = ctypes.c_int(section).value
self._SQN.BFGS_mem.mem_used = ctypes.c_size_t(mem_used).value
self._SQN.BFGS_mem.mem_st_ix = ctypes.c_size_t(mem_st_ix).value
out = {
"task" : task_dct[task],
"requested_on" : None,
"info" : {
"x_changed_in_run" : bool(x_changed),
"iteration_number" : niter,
"iteration_info" : info_dct[iter_info]
}
}
if req_vec is not None:
out["requested_on"] = (req, req_vec)
else:
out["requested_on"] = req
return out
[docs]class adaQN_free(_StochQN_free):
"""
adaQN optimizer (free mode)
Optimizes an empirical (perhaps non-convex) loss function over batches of sample data. Compared to
class 'adaQN', this version lets the user do all the calculations from the outside, only
interacting with the object by means of a function that returns a request type and is fed the
required calculation through methods 'update_gradient' and 'update_function'.
Order in which requests are made:
========== loop ===========
* calc_grad
... (repeat calc_grad)
if max_incr > 0:
* calc_fun_val_batch
if 'use_grad_diff':
* calc_grad_big_batch (skipped if below max_incr)
===========================
Parameters
----------
mem_size : int
Number of correction pairs to store for approximation of Hessian-vector products.
fisher_size : int or None
Number of gradients to store for calculation of the empirical Fisher product with gradients.
If passing 'None', will force 'use_grad_diff' to 'True'.
bfgs_upd_freq : int
Number of iterations (batches) after which to generate a BFGS correction pair.
max_incr : float or None
Maximum ratio of function values in the validation set under the average values of 'x' during current epoch
vs. previous epoch. If the ratio is above this threshold, the BFGS and Fisher memories will be reset, and 'x'
values reverted to their previous average.
If not using a validation set, will take a longer batch for function evaluations (same as used for gradients
when using 'use_grad_diff=True').
min_curvature : float or None
Minimum value of s*y / s*s in order to accept a correction pair.
scal_reg : float
Regularization parameter to use in the denominator for AdaGrad and RMSProp scaling.
rmsprop_weight : float(0,1) or None
If not 'None', will use RMSProp formula instead of AdaGrad for approximated inverse-Hessian initialization.
y_reg : float or None
Regularizer for 'y' vector (gets added y_reg * s).
use_grad_diff : bool
Whether to create the correction pairs using differences between gradients instead of Fisher matrix.
These gradients are calculated on a larger batch than the regular ones (given by batch_size * bfgs_upd_freq).
If 'True', fisher_size will be set to None, and empirical Fisher matrix will not be used.
check_nan : bool
Whether to check for variables becoming NaN after each iteration, and reverting the step if they do
(will also reset BFGS memory).
nthreads : int
Number of parallel threads to use. If set to -1, will determine the number of available threads and use
all of them. Note however that not all the computations can be parallelized.
use_float : bool
Whether to use C 'float' type (np.float32). If 'False' (the default), will use 'double' type (np.float64).
The variables and gradient must be of this same dtype.
"""
def __init__(self, mem_size=10, fisher_size=100, bfgs_upd_freq=20, max_incr=1.01, min_curvature=1e-4, scal_reg=1e-4,
rmsprop_weight=None, y_reg=None, use_grad_diff=False, check_nan=True, nthreads=-1, use_float=False):
self._take_common_inputs(mem_size, min_curvature, y_reg, check_nan, nthreads, use_float)
assert bfgs_upd_freq > 0
bfgs_upd_freq = int(bfgs_upd_freq)
if not use_grad_diff:
assert fisher_size > 0
fisher_size = int(fisher_size)
else:
fisher_size = 0
if max_incr is not None:
assert max_incr > 0
else:
max_incr = 0
assert scal_reg > 0
if rmsprop_weight is not None:
assert rmsprop_weight > 0
assert rmsprop_weight < 1
else:
rmsprop_weight = 0
self.fisher_size = fisher_size
self.bfgs_upd_freq = bfgs_upd_freq
self.max_incr = max_incr
self.scal_reg = scal_reg
self.rmsprop_weight = rmsprop_weight
self.use_grad_diff = bool(use_grad_diff)
self.initialized = False
def _initialize(self, n):
self._adaQN = _adaQN_holder(n, self.mem_size, self.fisher_size, self.bfgs_upd_freq, self.max_incr,
self.min_curvature, self.scal_reg, self.rmsprop_weight,
self.use_grad_diff, self.y_reg, self.check_nan, self.nthreads, self.use_float)
self.gradient = np.empty(n, dtype = self.c_real_t)
self.f = self.c_real_t(0.0).value
self.initialized = True
[docs] def update_function(self, fun):
"""
Pass requested function evaluation to optimizer (task = "calc_fun_val_batch")
Parameters
----------
fun : float
Function evaluated at "requested_on" under a validation set or a larger batch, perhaps
including all the cases from the last such calculation.
"""
self.f = self.c_real_t(float(fun)).value
[docs] def run_optimizer(self, x, step_size):
"""
Continue optimization process after supplying the calculation requested from the last run
Continue the optimization process from where it was left since the last calculation was
requested. Will internally do all the updates that are possible until the moment some
calculation of function/gradient/hessian-vector is required.
Note
----
The first time this is run, no calculation needs to be supplied.
Parameters
----------
x : array(m, )
Current values of the variables. Will be modified in-place.
Do NOT modify the values between runs.
step_size : float
Step size for the next update (note that variables are not updated during all runs).
Returns
-------
request : dict
Dictionary with the calculation required to proceed and iteration information.
Structure:
* task : str - one of "calc_grad", "calc_grad_same_batch" (oLBFGS w. 'min_curvature' or 'check_nan'),
"calc_hess_vec" (SQN wo. 'use_grad_diff'), "calc_fun_val_batch" (adaQN w. 'max_incr'),
"calc_grad_big_batch" (SQN and adaQN w. 'use_grad_diff').
* requested_on : array(m, ) or tuple(array(m, ), array(m, )), containing the values on which
the request in "task" has to be evaluated. In the case of Hessian-vector products (SQN), the
first vector is the values of 'x' and the second is the vector with which the product is required.
* info : dict(x_changed_in_run : bool, iteration_number : int, iteration_info : str),
iteration_info can be one of "no_problems_encountered", "search_direction_was_nan",
"func_increased", "curvature_too_small".
"""
assert isinstance(x, np.ndarray)
if x.dtype != self.c_real_t:
raise ValueError("x' has wrong dtype.")
if not self.initialized:
self._initialize(x.shape[0])
c_funs = _wrapper_float if self.use_float else _wrapper_double
x_changed, niter, section, \
mem_used, mem_st_ix, \
f_mem_used, f_mem_st_ix, f_prev, \
task, iter_info, req = c_funs.py_run_adaQN(self._adaQN, x, self.gradient, step_size, self.f)
self._adaQN.niter = ctypes.c_size_t(niter).value
self._adaQN.section = ctypes.c_int(section).value
self._adaQN.f_prev = self.c_real_t(f_prev).value
self._adaQN.BFGS_mem.mem_used = ctypes.c_size_t(mem_used).value
self._adaQN.BFGS_mem.mem_st_ix = ctypes.c_size_t(mem_st_ix).value
self._adaQN.Fisher_mem.mem_used = ctypes.c_size_t(f_mem_used).value
self._adaQN.Fisher_mem.mem_st_ix = ctypes.c_size_t(f_mem_st_ix).value
out = {
"task" : task_dct[task],
"requested_on" : req,
"info" : {
"x_changed_in_run" : bool(x_changed),
"iteration_number" : niter,
"iteration_info" : info_dct[iter_info]
}
}
return out