DefaultQuantization Algorithm

Overview

The DefaultQuantization algorithm is designed to perform fast yet accurate 8-bit quantization of neural networks (NNs). It consists of three algorithms that are applied to a model sequentially:

This algorithm uses a two-stage statistics collection procedure, so the wall-clock time of quantization depends mainly on the size of the calibration subset used for it.

Parameters

The algorithm accepts all the parameters introduced by the three algorithms that it relies on. These parameters can be roughly divided into two groups: mandatory and optional.

"name": "DefaultQuantization", // optimization algorithm name
"params": {
"preset": "performance", // Preset [performance (default), accuracy] which controls the quantization mode (symmetric and asymmetric respectively)
"stat_subset_size": 300, // Size of subset to calculate activations statistics used for quantization. The whole dataset is used if no parameter specified
}
"name": "DefaultQuantization", // optimization algorithm name
"params": {
/* Preset is a collection of optimization algorithm parameters that will specify to the algorithm
to improve which metric the algorithm needs to concentrate. Each optimization algorithm supports
[performance, accuracy] presets which control the quantization mode (symmetric and asymmetric respectively)*/
"preset": "accuracy",
"stat_subset_size": 300, // Size of subset to calculate activations statistics that can be used
// For quantization parameters calculation.
"ignored": {
"scope": [
"<NODE_NAME>" // List of nodes that are excluded from optimization
],
"operations": [ // List of types that are excluded from optimization
{
"type": "<NODE_TYPE>", // Type of ignored operation
"attributes": { // If attributes are defined they will be considered during the ignorance
"<NAME>": "<VALUE>" // Lists of values to filter by
}
}
]
},
/* Manually specified quantization parameters */
/* Quantization parameters for weights */
"weights": { // Weights quantization parameters used by MinMaxAlgorithm
"bits": 8, // Bit-width, default is 8
"mode": "symmetric", // Quantization mode, default is "symmetric"
"level_low": 0, // Minimum level in the integer range in which we quantize to, default is 0 for unsigned range, -2^(bit-1) - for signed
"level_high": 255, // Maximum level in the integer range in which we quantize to, default is 2^bits-1 for unsigned range, 2^(bit-1)-1 - for signed
"granularity": "perchannel", // Quantization scale granularity: ["pertensor" (default), "perchannel"]
"range_estimator": { // Range estimator that is used to get the quantization ranges and filter outliers based on the statistics
"max": { // Parameters to estimate top quantization border
"type": "quantile", // Estimator type: ["max" (default), "quantile"]
"outlier_prob": 0.0001 // Outlier probability used in the "quantile" estimator
},
"min": { // Parameters to estimate bottom quantization border (used only in asymmetric mode)
"type": "quantile", // Estimator type: ["max" (default), "quantile"]
"outlier_prob": 0.0001 // Outlier probability used in the "quantile" estimator
}
}
},
/* Quantization parameters for activations */
"activations": {
"range_estimator": { // Range estimator that is used to get the quantization ranges and filter outliers based on the statistics
"preset": "quantile",
/* OR */
/* minimum of quantization range */
/* maximum of quantization range */
"max": { // Parameters to estimate top quantization border
"aggregator": "mean", // Batch aggregation type: ["mean" (default), "max", "min", "median", "mean_no_outliers", "median_no_outliers", "hl_estimator"]
"type": "quantile", // Estimator type: ["max" (default), "quantile"]
"outlier_prob": 0.0001 // Outlier probability used in the "quantile" estimator
},
"min": { // Parameters to estimate top quantization border
"aggregator": "mean", // Batch aggregation type: ["mean" (default), "max", "min", "median", "mean_no_outliers", "median_no_outliers", "hl_estimator"]
"type": "quantile", // Estimator type [min, max, abs_max, quantile, abs_quantile]
"outlier_prob": 0.0001 // Outlier probability used in the "quantile" estimator
}
}
}
}