from caffe2.python import core, utils
import numpy as np


class RegularizationBy(object):
    AFTER_OPTIMIZER = "after_optimizer"
    ON_LOSS = "on_loss"


class Regularizer(object):
    """
    Adds regularization to train_net for a given parameter. The scaling
    factor applied to the regularization term is given at initialization.
    The param should be a BlobReference.
    """

    def __init__(self):
        # Small constant used to keep clipped values strictly inside open
        # intervals and to keep logarithms away from zero.
        self.kEpsilon = 1e-9

    def __call__(self, net, param_init_net, param, grad=None, by=None):
        assert isinstance(param, core.BlobReference)
        by_enum = utils.EnumClassKeyVals(RegularizationBy)
        assert by in by_enum.values(), (
            "Regularizer of type {} is called with invalid by={}, "
            "not in {}".format(self.__class__, by, by_enum.values())
        )
        run_func = "_run_" + by
        assert hasattr(
            self, run_func
        ), "Regularizer of type {} does not implement function {}".format(
            self.__class__, run_func
        )
        return getattr(self, run_func)(net, param_init_net, param, grad)
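
    # Illustrative usage sketch (commented out; not part of the original
    # module). `model` and `fc_w` are hypothetical names for a ModelHelper
    # and a parameter BlobReference:
    #
    #   reg = L1Norm(reg_lambda=1e-4)
    #   penalty = reg(
    #       model.net, model.param_init_net, fc_w, by=RegularizationBy.ON_LOSS
    #   )
    #
    # __call__ dispatches on `by`: "on_loss" regularizers emit a penalty blob
    # to add to the loss, while "after_optimizer" regularizers rewrite the
    # parameter in place after each update.
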

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        return None

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        return None

    def _feature_grouping(self, param, net):
        # Compute the l2 norm of the weights of each input feature:
        # pow( sum_i { pow(theta_i, 2) }, 0.5 )
        param_mul = net.Mul([param, param], [net.NextScopedBlob("param_mul")])
        param_reduced = net.ReduceFrontSum(
            [param_mul], [net.NextScopedBlob("param_reduced")]
        )
        grouped_feature_weight_vec = net.Pow(
            [param_reduced],
            [net.NextScopedBlob("grouped_feature_weight_vec")],
            exponent=0.5,
        )

        return grouped_feature_weight_vec
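
    # A rough numpy equivalent of _feature_grouping (illustrative only; `w`
    # is a hypothetical 2-D weight matrix of shape (output_dim, input_dim)):
    #
    #   import numpy as np
    #   grouped = np.sqrt((w * w).sum(axis=0))  # l2 norm of each input column
    #
    # ReduceFrontSum reduces over the first dimension, so each entry of the
    # result is the l2 norm of one input feature's weight column.
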

    def _ensure_clipped(
        self,
        net,
        param,
        grad=None,
        min=None,
        max=None,
        open_range=False,
        left_open=False,
        right_open=False,
    ):
        # Shift the bounds inward by kEpsilon when the corresponding side of
        # the interval is open, so clipped values stay strictly inside it.
        min = (
            min + self.kEpsilon
            if min is not None and (open_range or left_open)
            else min
        )
        max = (
            max - self.kEpsilon
            if max is not None and (open_range or right_open)
            else max
        )
        input_blobs = (
            [param, grad.indices, grad.values]
            if isinstance(grad, core.GradientSlice)
            else [param]
        )
        net.EnsureClipped(input_blobs, [param], min=min, max=max)


class L1Norm(Regularizer):
    def __init__(self, reg_lambda):
        super().__init__()
        assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"

        self.reg_lambda = reg_lambda

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_l1_regularization")
        net.LpNorm([param], [output_blob], p=1)
        net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
        return output_blob
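
# Hypothetical end-to-end sketch for L1Norm (commented out; the model and
# blob names are illustrative, not from the original file):
#
#   from caffe2.python import model_helper
#   model = model_helper.ModelHelper(name="demo")
#   fc_w = model.param_init_net.XavierFill([], "fc_w", shape=[1, 10])
#   penalty = L1Norm(reg_lambda=1e-4)(
#       model.net, model.param_init_net, fc_w, by="on_loss"
#   )
#   # `penalty` is a scalar blob that can be added to the training loss.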


class LpNorm(Regularizer):
    def __init__(self, reg_lambda, p_value=0.5):
        """
        reg_lambda: parameter to scale regularization by

        p_value:    determines which Lp norm to calculate. If p > 0,
                    the Lp norm is computed with the formula:
                    pow( sum_i { pow(theta_i, p) }, 1/p )
        """
        super().__init__()
        assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
        assert p_value > 0, "p_value factor should be greater than 0"
        self.p_value = p_value
        self.reg_lambda = reg_lambda

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_dense_feature_regularization")
        grouped_feature_weight_vec = self._feature_grouping(param, net)

        # Compute the Lp norm of the grouped feature weights:
        # pow( sum_i { pow(theta_i, p) }, 1/p )
        lp_vec_raised = net.Pow(
            [grouped_feature_weight_vec],
            [net.NextScopedBlob("lp_vec_raised")],
            exponent=self.p_value,
        )
        lp_vec_summed = net.ReduceFrontSum(
            [lp_vec_raised], [net.NextScopedBlob("lp_vec_summed")]
        )
        lp_norm = net.Pow(
            [lp_vec_summed],
            [net.NextScopedBlob("lp_vec")],
            exponent=(1 / self.p_value),
        )
        net.Scale([lp_norm], [output_blob], scale=self.reg_lambda)
        return output_blob
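
# A rough numpy equivalent of the LpNorm penalty (illustrative only; `w` is
# a hypothetical weight matrix):
#
#   import numpy as np
#   def lp_penalty(w, p=0.5, reg_lambda=1.0):
#       group = np.sqrt((w * w).sum(axis=0))  # _feature_grouping
#       return reg_lambda * (group ** p).sum() ** (1.0 / p)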


class L0ApproxNorm(Regularizer):
    def __init__(self, reg_lambda, alpha=0.01, budget=0):
        """
        reg_lambda: parameter to scale regularization by

        alpha:      hyperparameter to tune that is only used in the calculation
                    of the approximate L0 norm

        budget:     desired number of features. If the number of features is
                    greater than the budget amount, then the least important
                    features will be penalized. If there are fewer features
                    than the desired budget, no penalization will be applied.
                    Optional parameter; if 0, then no budget is used.
        """
        super().__init__()
        assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
        assert alpha > 0, "alpha factor must be a positive value greater than 0"
        assert budget >= 0, "budget factor must be greater than or equal to 0"
        self.reg_lambda = reg_lambda
        self.alpha = alpha
        self.budget = float(budget)  # budget must be a float for later calculations

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_dense_feature_regularization")
        grouped_feature_weight_vec = self._feature_grouping(param, net)

        # Compute the approximate L0 norm:
        # sum_i ( min( abs(theta_i), alpha ) ) / alpha
        l0_abs = net.Abs([grouped_feature_weight_vec], [net.NextScopedBlob("l0_abs")])
        l0_min = net.Clip([l0_abs], [net.NextScopedBlob("l0_min")], max=self.alpha)
        l0_summed = net.ReduceFrontSum([l0_min], [net.NextScopedBlob("l0_summed")])
        l0_norm = net.Scale(
            [l0_summed], [net.NextScopedBlob("l0_norm")], scale=(1 / self.alpha)
        )

        # Incorporate the budget factor:
        # regularization = reg_lambda * max(0, l0_norm - budget)
        if self.budget:
            budget_blob = net.ConstantFill([], "budget", shape=[1], value=self.budget)
            l0_sub_budget = net.Sub(
                [l0_norm, budget_blob], [net.NextScopedBlob("l0_budget")]
            )
            relu_l0_sub_budget = net.Relu(
                [l0_sub_budget], [net.NextScopedBlob("relu_l0_sub_budget")]
            )
            net.Scale([relu_l0_sub_budget], [output_blob], scale=self.reg_lambda)
        else:
            net.Scale([l0_norm], [output_blob], scale=self.reg_lambda)
        return output_blob
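
# A rough numpy equivalent of the approximate L0 penalty (illustrative only):
#
#   import numpy as np
#   def l0_approx_penalty(w, alpha=0.01, budget=0.0, reg_lambda=1.0):
#       group = np.sqrt((w * w).sum(axis=0))           # _feature_grouping
#       l0 = np.minimum(np.abs(group), alpha).sum() / alpha
#       if budget:
#           return reg_lambda * max(0.0, l0 - budget)  # penalize overage only
#       return reg_lambda * l0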


class L1NormTrimmed(Regularizer):
    """
    The Trimmed Lasso: Sparsity and Robustness. https://arxiv.org/abs/1708.04527
    """

    def __init__(self, reg_lambda, k):
        super().__init__()
        assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"
        assert isinstance(k, int), "k should be an integer, the expected number of features after selection"
        assert k >= 1, "k should be at least 1"

        self.reg_lambda = reg_lambda
        self.k = k

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_l1_trimmed_regularization")
        abs_blob = net.Abs([param], [net.NextScopedBlob("abs")])
        sum_abs = net.SumElements([abs_blob], [net.NextScopedBlob("sum_abs")], average=False)
        topk, _, _ = net.TopK(
            [abs_blob],
            [net.NextScopedBlob("topk"), net.NextScopedBlob("id"), net.NextScopedBlob("flat_id")],
            k=self.k,
        )
        topk_sum = net.SumElements([topk], [net.NextScopedBlob("topk_sum")], average=False)
        net.Sub([sum_abs, topk_sum], [output_blob])
        net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
        return output_blob
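
# A rough numpy equivalent of the trimmed-lasso penalty (illustrative only):
# the k largest-magnitude weights are exempt from the l1 penalty.
#
#   import numpy as np
#   def l1_trimmed_penalty(w, k, reg_lambda=1.0):
#       a = np.sort(np.abs(w).ravel())
#       return reg_lambda * a[:-k].sum()  # total |w| minus the top-k |w|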


class L2Norm(Regularizer):
    def __init__(self, reg_lambda):
        super().__init__()
        assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"

        self.reg_lambda = reg_lambda

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_l2_regularization")
        net.LpNorm([param], [output_blob], p=2)
        net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
        return output_blob


class ElasticNet(Regularizer):
    def __init__(self, l1, l2):
        super().__init__()
        self.l1 = l1
        self.l2 = l2

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_elastic_net_regularization")
        l2_blob = net.NextScopedBlob(param + "_l2_blob")
        l1_blob = net.NextScopedBlob(param + "_l1_blob")
        net.LpNorm([param], [l2_blob], p=2)
        net.LpNorm([param], [l1_blob], p=1)
        net.Scale([l2_blob], [l2_blob], scale=self.l2)
        net.Scale([l1_blob], [l1_blob], scale=self.l1)
        net.Add([l1_blob, l2_blob], [output_blob])
        return output_blob
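
# A rough numpy sketch of the combined penalty (illustrative; it assumes
# Caffe2's LpNorm op returns sum(|w|) for p=1 and sum(w**2) for p=2):
#
#   import numpy as np
#   def elastic_net_penalty(w, l1, l2):
#       return l1 * np.abs(w).sum() + l2 * (w ** 2).sum()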


class ElasticNetL1NormTrimmed(Regularizer):
    def __init__(self, l1, l2, k):
        super().__init__()
        self.l1 = l1
        self.l2 = l2
        self.k = k

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_elastic_net_l1_trimmed_regularization")
        l2_blob = net.NextScopedBlob(param + "_l2_blob")
        net.LpNorm([param], [l2_blob], p=2)
        net.Scale([l2_blob], [l2_blob], scale=self.l2)

        l1_blob = net.NextScopedBlob(param + "_l1_blob")
        abs_blob = net.Abs([param], [net.NextScopedBlob("abs")])
        sum_abs = net.SumElements([abs_blob], [net.NextScopedBlob("sum_abs")], average=False)
        topk, _, _ = net.TopK(
            [abs_blob],
            [net.NextScopedBlob("topk"), net.NextScopedBlob("id"), net.NextScopedBlob("flat_id")],
            k=self.k,
        )
        topk_sum = net.SumElements([topk], [net.NextScopedBlob("topk_sum")], average=False)
        net.Sub([sum_abs, topk_sum], [l1_blob])
        net.Scale([l1_blob], [l1_blob], scale=self.l1)

        net.Add([l1_blob, l2_blob], [output_blob])
        return output_blob


class MaxNorm(Regularizer):
    def __init__(self, norm=1.0, dtype=None):
        super().__init__()
        self.norm = norm
        self.dtype = dtype

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        assert self.norm > 0, "norm should be bigger than 0."
        if isinstance(grad, core.GradientSlice):
            if self.dtype and self.dtype == 'fp16':
                net.Float16SparseNormalize(
                    [param, grad.indices],
                    [param],
                    use_max_norm=True,
                    norm=self.norm,
                )
            else:
                net.SparseNormalize(
                    [param, grad.indices],
                    [param],
                    use_max_norm=True,
                    norm=self.norm,
                )
        else:
            raise NotImplementedError("MaxNorm is not supported for dense parameters")


class ConstantNorm(Regularizer):
    def __init__(self, norm=1.0):
        super().__init__()
        self.norm = norm

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        assert self.norm > 0, "norm should be bigger than 0."
        if isinstance(grad, core.GradientSlice):
            net.SparseNormalize(
                [param, grad.indices],
                [param],
                use_max_norm=False,
                norm=self.norm,
            )
        else:
            raise NotImplementedError(
                "ConstantNorm is not supported for dense parameters"
            )


class SparseLpNorm(Regularizer):
    def __init__(self, p, reg_lambda):
        super().__init__()
        assert p in (1.0, 2.0), "Sparse Lp regularization only implemented for p = 1.0 and p = 2.0."
        assert reg_lambda > 0, "factor ahead of regularization should be greater than 0."
        self.p = p
        self.reg_lambda = reg_lambda

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        if isinstance(grad, core.GradientSlice):
            net.SparseLpRegularizer(
                [param, grad.indices],
                [param],
                p=self.p,
                reg_lambda=self.reg_lambda,
            )
        else:
            raise NotImplementedError("SparseLpNorm is not supported for dense parameters")


class SparseL1Norm(SparseLpNorm):
    def __init__(self, reg_lambda):
        super().__init__(p=1.0, reg_lambda=reg_lambda)


class SparseL2Norm(SparseLpNorm):
    def __init__(self, reg_lambda):
        super().__init__(p=2.0, reg_lambda=reg_lambda)
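
# For sparse gradients, SparseLpRegularizer shrinks only the rows of `param`
# referenced by grad.indices. A loose numpy sketch of the idea (illustrative;
# the exact update is defined by the Caffe2 operator, and the shrinkage
# formulas below are assumptions, not its documented behavior):
#
#   import numpy as np
#   def sparse_lp_step(param, indices, p, reg_lambda):
#       for i in np.unique(indices):
#           if p == 1.0:  # soft-threshold each entry toward zero
#               param[i] = np.sign(param[i]) * np.maximum(
#                   np.abs(param[i]) - reg_lambda, 0.0
#               )
#           else:         # p == 2.0: multiplicative decay
#               param[i] *= 1.0 - reg_lambda
#       return param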


class LogBarrier(Regularizer):
    """
    Wright, S., & Nocedal, J. (1999). Numerical optimization. Springer Science,
    35(67-68), 7. Chapter 19
    """

    def __init__(self, reg_lambda, discount_policy="inv", discount_options=None):
        """
        discount is a positive weight that decreases over time; it is
        implemented similarly to the learning rate and is specified by a
        learning rate policy and the corresponding options.
        """
        super().__init__()
        assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
        self.reg_lambda = reg_lambda
        self.discount_policy = discount_policy
        self.discount_options = discount_options or {"gamma": 1.0, "power": 1.0}

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        iteration = utils.BuildUniqueMutexIter(param_init_net, net)
        # base_lr is negated because the barrier term is subtracted from the
        # loss when minimizing.
        discount = net.NextScopedBlob(param + "_log_barrier_discount")
        net.LearningRate(
            [iteration],
            [discount],
            base_lr=-self.reg_lambda,
            policy=self.discount_policy,
            **self.discount_options
        )
        # Clip param to be at least kEpsilon so the log below is well defined.
        param_non_neg = net.NextScopedBlob(param + "_non_neg")
        net.Clip([param], [param_non_neg], min=self.kEpsilon)
        param_log = net.NextScopedBlob(param + "_log")
        net.Log([param_non_neg], [param_log])
        param_log_sum = net.NextScopedBlob(param + "_log_sum")
        net.SumElements([param_log], [param_log_sum])
        output_blob = net.NextScopedBlob(param + "_log_barrier")
        net.Mul([param_log_sum, discount], [output_blob], broadcast=1)
        return output_blob

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        self._ensure_clipped(net, param, grad, min=0, open_range=True)
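
# The loss term built by _run_on_loss is discount_t * sum_i log(theta_i),
# where discount_t comes from LearningRate with base_lr=-reg_lambda, i.e. a
# negative coefficient whose magnitude decays with the iteration count, so
# minimizing the loss pushes parameters away from the boundary theta_i = 0.
# A rough numpy sketch (illustrative only):
#
#   import numpy as np
#   def log_barrier_penalty(theta, discount, eps=1e-9):
#       return discount * np.log(np.clip(theta, eps, None)).sum()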


class BoundedGradientProjection(Regularizer):
    """
    Wright, S., & Nocedal, J. (1999). Numerical optimization. Springer Science,
    35(67-68), 7. Chapter 16
    """

    def __init__(
        self, lb=None, ub=None, left_open=False, right_open=False, epsilon=None
    ):
        super().__init__()
        lb = float(lb) if lb is not None else None
        ub = float(ub) if ub is not None else None
        epsilon = float(epsilon) if epsilon is not None else self.kEpsilon
        assert epsilon > 0, "Bounded Gradient Projection with invalid eps={eps}".format(
            eps=epsilon
        )
        assert (
            lb is None
            or ub is None
            or (
                lb + (epsilon if left_open else 0.)
                <= ub - (epsilon if right_open else 0.)
            )
        ), (
            "Bounded Gradient Projection with invalid "
            "{lp}ub={ub}, lb={lb}{rp}, eps={eps}".format(
                lb=lb,
                ub=ub,
                lp="(" if left_open else "[",
                rp=")" if right_open else "]",
                eps=epsilon,
            )
        )
        self.left_open = left_open
        self.right_open = right_open
        self.kEpsilon = epsilon
        self.lb = lb
        self.ub = ub

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        self._ensure_clipped(
            net,
            param,
            grad,
            min=self.lb,
            max=self.ub,
            left_open=self.left_open,
            right_open=self.right_open,
        )
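
# Hypothetical usage sketch (commented out; `model`, `fc_w`, and `fc_w_grad`
# are illustrative names): keep a parameter within (0, 1] after every
# optimizer step.
#
#   proj = BoundedGradientProjection(lb=0.0, ub=1.0, left_open=True)
#   proj(model.net, model.param_init_net, fc_w, grad=fc_w_grad,
#        by="after_optimizer")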


class GroupL1Norm(Regularizer):
    """
    Scardapane, Simone, et al. "Group sparse regularization for deep neural networks."
    Neurocomputing 241 (2017): 81-89.

    This regularizer computes the l1 norm of a weight matrix based on groups.
    There are essentially three stages in the computation:
    1. Compute the l2 norm of all the members of each group.
    2. Scale each l2 norm by the square root of its group size.
    3. Compute the l1 norm of the scaled l2 norms.
    A numpy sketch of the full computation follows the class definition.
    """

    def __init__(self, reg_lambda, groups, stabilizing_val=0):
        """
        Args:
            reg_lambda: The weight of the regularization term.
            groups: A list of integers describing the size of each group.
                The length of the list is the number of groups.

        Optional Args:
            stabilizing_val: The computation of GroupL1Norm involves the Sqrt
                operator. When values are small, its gradient can be
                numerically unstable, causing gradient explosion. Adding this
                term stabilizes the gradient calculation. The recommended
                value is 1e-8, but it depends on the specific scenario. If the
                implementation of the gradient operator of Sqrt already takes
                stability into consideration, this term is not necessary.
        """
        super().__init__()
        assert reg_lambda >= 0, "regularization weight should be 0 or positive"
        assert isinstance(groups, list), "groups needs to be a list"

        self.reg_lambda = reg_lambda
        self.groups = groups
        self.stabilizing_val = stabilizing_val

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        """
        Args:
            param: The input blob to regularize. It should be a weight matrix
                blob with shape (output_dim, input_dim). input_dim should be
                equal to the sum of self.groups.

        Returns:
            group_l1_norm: The output blob after applying regularization.

        These are the steps of computation:
            1. square all elements
            2. sum over the output dimension (axis 0)
            3. lengthssum by group
            4. square_root all elements
            5. normalize each group based on group size
            6. compute the l1 norm of each group
            7. scale the result by the regularization lambda
        """
        squared = net.Sqr(param)
        reduced_sum = net.ReduceSum(squared, axes=[0], keepdims=0)
        lengths_sum = net.LengthsSum(
            [
                reduced_sum,
                net.GivenTensorIntFill(
                    [], 1, shape=[len(self.groups)], values=self.groups
                ),
            ]
        )

        if self.stabilizing_val:
            net.Add(
                [lengths_sum, net.ConstantFill([], 1, value=self.stabilizing_val)],
                [lengths_sum],
                broadcast=1,
            )

        sqrt = net.Sqrt(lengths_sum)

        # Steps 5 and 7 are fused into one Mul: each group's l2 norm is
        # multiplied by sqrt(group size) * reg_lambda in a single pass.
        l2_scaled = net.Mul(
            [
                sqrt,
                net.GivenTensorFill(
                    [],
                    shape=[len(self.groups)],
                    values=np.sqrt(self.groups) * self.reg_lambda,
                ),
            ],
            ['normalized_l2_norm_scaled']
        )

        group_l1_norm = net.LpNorm(l2_scaled, ['group_l1_norm'], p=1)

        return group_l1_norm
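
# A rough numpy equivalent of the seven steps above (illustrative only):
#
#   import numpy as np
#   def group_l1_penalty(w, groups, reg_lambda, stabilizing_val=0.0):
#       per_col = (w ** 2).sum(axis=0)              # steps 1-2
#       splits = np.cumsum(groups)[:-1]
#       per_group = np.array(
#           [g.sum() for g in np.split(per_col, splits)]
#       ) + stabilizing_val                         # step 3
#       l2 = np.sqrt(per_group)                     # step 4
#       scaled = l2 * np.sqrt(groups) * reg_lambda  # steps 5 and 7
#       return np.abs(scaled).sum()                 # step 6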