diff --git a/examples/quantumnat/quantize.py b/examples/quantumnat/quantize.py
new file mode 100644
index 00000000..ff305952
--- /dev/null
+++ b/examples/quantumnat/quantize.py
@@ -0,0 +1,298 @@
+'''
+Description:
+Author: Jiaqi Gu (jqgu@utexas.edu)
+Date: 2021-05-09 21:28:08
+LastEditors: Jiaqi Gu (jqgu@utexas.edu)
+LastEditTime: 2021-05-11 01:19:11
+'''
+from typing import Optional
+
+import torch
+from torch import Tensor
+from torch.types import Device
+from torchpack.utils.logging import logger
+
+
+__all__ = ["PACTActivationQuantizer"]
+
+
+# PACT activation: https://arxiv.org/pdf/1805.06085.pdf
+
+
+class PACTQuantFunc(torch.autograd.Function):
+    r"""PACT (PArametrized Clipping acTivation) quantization function for activations.
+    Implements a :py:class:`torch.autograd.Function` that quantizes activations to ``level`` uniformly
+    spaced values using a PACT-style strategy. In forward propagation, the function is defined as
+
+    .. math::
+        \mathbf{y} = f(\mathbf{x}) = \mathrm{round}\left(\frac{\mathrm{clip}_{[l, u]}(\mathbf{x}) - l}{\varepsilon}\right) \cdot \varepsilon + l
+
+    where :math:`l` and :math:`u` are the lower and upper clipping bounds and :math:`\varepsilon` is the
+    quantization step for :math:`L` levels:
+
+    .. math::
+        \varepsilon = (u - l) / (L - 1)
+
+    (The original PACT formulation clips to :math:`[0, \alpha)` with :math:`\varepsilon = \alpha / (2^Q - 1)`.)
+    In backward propagation, using the Straight-Through Estimator, the gradient of the function is defined as
+
+    .. math::
+        \mathbf{\nabla}_\mathbf{x} \mathcal{L} \doteq \mathbf{\nabla}_\mathbf{y} \mathcal{L}
+
+    It can be applied by using its static `.apply` method:
+
+    :param input: the tensor containing :math:`x`, the activations to be quantized.
+    :type input: `torch.Tensor`
+    :param level: the number of quantization levels :math:`L`.
+    :type level: int
+    :param alpha: the value of :math:`\alpha` (kept for the PACT formulation; unused in the forward pass).
+    :type alpha: `torch.Tensor` or float
+    :param quant_noise_mask: boolean mask of elements kept at full precision (QuantNoise); `None` quantizes all elements.
+    :type quant_noise_mask: `torch.Tensor` or None
+    :param lower_bound: the lower clipping bound :math:`l`.
+    :type lower_bound: float
+    :param upper_bound: the upper clipping bound :math:`u`.
+    :type upper_bound: float
+
+    :return: The quantized input activations tensor.
+    :rtype: `torch.Tensor`
+    """
+
+    @staticmethod
+    def forward(ctx, input, level, alpha, quant_noise_mask, lower_bound,
+                upper_bound):
+        # where_input_clipped = (input < -1) | (input > alpha)
+        # where_input_ltalpha = (input < alpha)
+        # ctx.save_for_backward(where_input_clipped, where_input_ltalpha)
+        # upper_thres = alpha.data[0]-eps.data[0]
+        input = input.clamp(lower_bound, upper_bound)
+        input = input - lower_bound
+        eps = (upper_bound - lower_bound) / (level - 1)
+        input_q = (input / eps).round() * eps + lower_bound
+
+        # input_q = input.div(eps).floor_().mul_(eps)
+
+        if quant_noise_mask is not None:
+            return input_q.data.sub_(input.data).masked_fill_(quant_noise_mask, 0).add_(input)
+        else:
+            return input_q
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # see Hubara et al., Section 2.3
+        # where_input_clipped, where_input_ltalpha = ctx.saved_tensors
+        # grad_input = grad_output.masked_fill(where_input_clipped, 0)
+        # if ctx.needs_input_grad[2]:
+        #     grad_alpha = grad_output.masked_fill(
+        #         where_input_ltalpha, 0).sum().expand(1)
+        # else:
+        #     grad_alpha = None
+        grad_input = grad_output
+        return grad_input, None, None, None, None, None
+
+
+pact_quantize = PACTQuantFunc.apply
+
+
+class PACTActivationQuantizer(torch.nn.Module):
+    r"""PACT (PArametrized Clipping acTivation) activation quantizer.
+    A :py:class:`torch.nn.Module` implementing PACT-style activation quantization. It is meant to replace
+    :py:class:`torch.nn.ReLU`, :py:class:`torch.nn.ReLU6` and
+    similar activations in a PACT-quantized network.
+    This layer can also operate in a special mode, defined by the `statistics_only` member, in which the layer runs in
+    forward-prop without quantization, collecting statistics on the activations that can then be
+    used to reset the value of :math:`\alpha`.
+    In this mode, the layer collects:
+    - tensor-wise maximum value ever seen
+    - running average with momentum 0.9
+    - running variance with momentum 0.9
+    """
+
+    def __init__(self, module: torch.nn.Module, precision: Optional[float]=None, level=None, alpha: float = 1.0, backprop_alpha: bool = True,
+                 statistics_only: bool = False, leaky: Optional[float] =
+                 None, quant_ratio: float = 1.0, device: Device =
+                 torch.device("cuda"), lower_bound=-2, upper_bound=2) -> None:
+        r"""Constructor. Initializes a :py:class:`torch.nn.Parameter` for :math:`\alpha` and sets
+        up the initial value of the `statistics_only` member.
+        :param module: the module whose output activations are quantized via a forward hook.
+        :type module: `torch.nn.Module`
+        :param precision: quantization bitwidth :math:`Q` (default `None`; ignored if `level` is given).
+        :type precision: int
+        :param level: number of quantization levels; overrides `precision` when set (default `None`).
+        :type level: int
+        :param alpha: the value of :math:`\alpha`.
+        :type alpha: `torch.Tensor` or float
+        :param backprop_alpha: default `True`; if `False`, do not update the value of :math:`\alpha` with backpropagation.
+        :type backprop_alpha: bool
+        :param statistics_only: initialization value of the `statistics_only` member.
+        :type statistics_only: bool
+        :param leaky: negative slope of the leaky ReLU used in statistics mode; `None` (default) uses a plain ReLU.
+        :type leaky: float
+        :param quant_ratio: quantization ratio used in QuantNoise [ICLR'21].
+        :type quant_ratio: float
+        :param lower_bound: lower clipping bound for quantization.
+        :type lower_bound: float
+        :param upper_bound: upper clipping bound for quantization.
+        :type upper_bound: float
+        """
+
+        super().__init__()
+        self.module = module
+        self.precision = precision
+        self.level = level
+        self.device = device
+        self.alpha = torch.nn.Parameter(torch.tensor(
+            (alpha,), device=device), requires_grad=backprop_alpha)
+        self.alpha_p = alpha
+        self.statistics_only = statistics_only
+        self.deployment = False
+        self.eps_in = None
+        self.leaky = leaky
+        self.lower_bound = lower_bound
+        self.upper_bound = upper_bound
+
+        # these are only used to gather statistics
+        self.max = torch.nn.Parameter(torch.zeros_like(
+            self.alpha.data), requires_grad=False)
+        self.min = torch.nn.Parameter(torch.zeros_like(
+            self.alpha.data), requires_grad=False)
+        self.running_mean = torch.nn.Parameter(
+            torch.zeros_like(self.alpha.data), requires_grad=False)
+        self.running_var = torch.nn.Parameter(
+            torch.ones_like(self.alpha.data), requires_grad=False)
+
+        self.precise = False
+
+        # set quant noise ratio
+        self.set_quant_ratio(quant_ratio)
+
+        ## quantization hook
+        self.handle = None
+        # self.register_hook()
+
+    def set_static_precision(self, limit_at_32_bits: bool = True, **kwargs) -> None:
+        r"""Sets static parameters used only for deployment.
+        """
+        # item() --> conversion to float
+        # apparently causes a slight, but not invisible, numerical divergence
+        # between FQ and QD stages
+        self.eps_static = self.alpha.clone().detach()/(2.0**(self.precision)-1)
+        self.alpha_static = self.alpha.clone().detach()
+        # D is selected as a power-of-two
+        D = 2.0**torch.ceil(torch.log2(self.requantization_factor *
+                                       self.eps_static / self.eps_in))
+        if not limit_at_32_bits:
+            self.D = D
+        else:
+            self.D = min(D, 2.0**(32-1-(self.precision)))
+
+    def get_output_eps(self, eps_in: Tensor) -> Tensor:
+        r"""Get the output quantum (:math:`\varepsilon`) given the input one.
+        :param eps_in: input quantum :math:`\varepsilon_{in}`.
+        :type eps_in: :py:class:`torch.Tensor`
+        :return: output quantum :math:`\varepsilon_{out}`.
+        :rtype: :py:class:`torch.Tensor`
+        """
+
+        return self.alpha/(2.0**(self.precision)-1)
+
+    def reset_alpha(self, use_max: bool = True, nb_std: float = 5.0) -> None:
+        r"""Reset the value of :math:`\alpha`. If `use_max` is `True`, then the highest tensor-wise value collected
+        in the statistics collection phase is used. If `False`, the collected standard deviation multiplied by
+        `nb_std` is used instead.
+        :param use_max: if True, use the tensor-wise maximum value collected in the statistics run as new :math:`\alpha` (default True).
+        :type use_max: bool
+        :param nb_std: number of standard deviations to be used to initialize :math:`\alpha` if `use_max` is False.
+        :type nb_std: float
+        """
+
+        if use_max:
+            self.alpha.data.copy_(self.max)
+        else:
+            self.alpha.data.copy_(self.running_var.data.sqrt().mul(nb_std))
+
+    def get_statistics(self):
+        r"""Returns the statistics collected up to now.
+
+        :return: The collected statistics (maximum, running average, running variance).
+        :rtype: tuple of floats
+        """
+        return self.max.item(), self.running_mean.item(), self.running_var.item()
+
+    def set_quant_ratio(self, quant_ratio=None):
+        if(quant_ratio is None):
+            # get recommended value
+            quant_ratio = [None, 0.2, 0.3, 0.4, 0.5, 0.55, 0.6, 0.7, 0.8, 0.83,
+                           0.86, 0.89, 0.92, 0.95, 0.98, 0.99, 1][min(self.precision, 16)]
+        assert 0 <= quant_ratio <= 1, logger.error(
+            f"Wrong quant ratio. Must be in [0, 1], but got {quant_ratio}")
+        self.quant_ratio = quant_ratio
+
+    def register_hook(self):
+
+        def quantize_hook(module, x, y):
+            r"""Forward hook for PACT-quantized activations.
+
+            See :py:class:`PACTQuantFunc` for details on the quantization performed by this hook.
+            In statistics mode, it uses a normal ReLU and collects statistics in the background.
+            :param x: input activations tensor.
+            :type x: :py:class:`torch.Tensor`
+
+            :return: output activations tensor.
+            :rtype: :py:class:`torch.Tensor`
+            """
+
+            if self.statistics_only:
+                if self.leaky is None:
+                    y = torch.nn.functional.relu(y, inplace=True)
+                else:
+                    y = torch.nn.functional.leaky_relu(y, self.leaky, inplace=True)
+                with torch.no_grad():
+                    self.max[:] = max(self.max.item(), y.max())
+                    self.min[:] = min(self.min.item(), y.min())
+                    self.running_mean[:] = 0.9 * \
+                        self.running_mean.item() + 0.1 * y.mean()
+                    self.running_var[:] = 0.9 * \
+                        self.running_var.item() + 0.1 * y.std()*y.std()
+                return y
+            else:
+                # QuantNoise ICLR 2021
+                if(self.quant_ratio < 1 and module.training):
+                    # implementation from fairseq
+                    # must fully quantize during inference
+                    quant_noise_mask = torch.empty_like(
+                        y, dtype=torch.bool).bernoulli_(1-self.quant_ratio)
+                else:
+                    quant_noise_mask = None
+                if self.level is not None:
+                    level = self.level
+                else:
+                    level = 2 ** self.precision
+                # eps = self.alpha/(2.0**(self.precision)-1)
+                return pact_quantize(y, level, self.alpha, quant_noise_mask,
+                                     self.lower_bound, self.upper_bound)
+
+        # register hook
+        self.handle = self.module.register_forward_hook(quantize_hook)
+        return self.handle
+
+    def remove_hook(self) -> None:
+        ## remove the forward hook
+        if(self.handle is not None):
+            self.handle.remove()
+
+
+if __name__ == "__main__":
+    import pdb
+    pdb.set_trace()
+    device = torch.device("cuda")
+    class Model(torch.nn.Module):
+        def __init__(self) -> None:
+            super().__init__()
+
+        def forward(self, x):
+            y = x + 0.3
+            return y
+    model = Model().to(device)
+    model.train()
+    quantizer = PACTActivationQuantizer(module=model, precision=4,
+                                        quant_ratio=0.1, device=device,
+                                        backprop_alpha=False)
+    quantizer.set_quant_ratio(0.8)
+    torch.manual_seed(10)
+    torch.cuda.manual_seed_all(10)
+    x = torch.randn(4,4, device=device, requires_grad=True)
+    y = model(x)
+    loss = y.sum()
+    loss.backward()
+    print(x)
+    print(y)
+    print(quantizer.alpha.data)
+    print(quantizer.alpha.grad)
+
+
\ No newline at end of file
diff --git a/examples/quantumnat/quantumnat.py b/examples/quantumnat/quantumnat.py
index 14ca8182..35a7042e 100644
--- a/examples/quantumnat/quantumnat.py
+++ b/examples/quantumnat/quantumnat.py
@@ -54,6 +54,8 @@ import random
 
 import numpy as np
 
+from quantize import PACTActivationQuantizer
+
 
 class QFCModel(tq.QuantumModule):
     class QLayer(tq.QuantumModule):
@@ -101,6 +103,7 @@ def __init__(self):
 
         self.q_layer = self.QLayer()
         self.measure = tq.MeasureAll(tq.PauliZ)
+        self.norm = torch.nn.BatchNorm1d(self.n_wires)
 
     def forward(self, x, use_qiskit=False):
         qdev = tq.QuantumDevice(n_wires=self.n_wires, bsz=x.shape[0], device=x.device, record_op=True)
@@ -125,9 +128,9 @@ def forward(self, x, use_qiskit=False):
             self.encoder(qdev, x)
             self.q_layer(qdev)
             x = self.measure(qdev)
-
-        x = x.reshape(bsz, 2, 2).sum(-1).squeeze()
-        x = F.log_softmax(x, dim=1)
+
+        # simplified version of post-measurement normalization, implemented with batch norm
+        x = self.norm(x)
 
         return x
 
@@ -138,6 +141,11 @@ def train(dataflow, model, device, optimizer):
         targets = feed_dict["digit"].to(device)
 
         outputs = model(inputs)
+
+        bsz = outputs.shape[0]
+        outputs = outputs.reshape(bsz, 2, 2).sum(-1).squeeze()
+        outputs = F.log_softmax(outputs, dim=1)
+
         loss = F.nll_loss(outputs, targets)
         optimizer.zero_grad()
         loss.backward()
@@ -154,6 +162,9 @@ def valid_test(dataflow, split, model, device, qiskit=False):
             targets = feed_dict["digit"].to(device)
 
             outputs = model(inputs, use_qiskit=qiskit)
+            bsz = outputs.shape[0]
+            outputs = outputs.reshape(bsz, 2, 2).sum(-1).squeeze()
+            outputs = F.log_softmax(outputs, dim=1)
 
             target_all.append(targets)
             output_all.append(outputs)
 
@@ -177,9 +188,7 @@ def main():
         "--static", action="store_true", help="compute with " "static mode"
     )
     parser.add_argument("--pdb", action="store_true", help="debug with pdb")
-    parser.add_argument(
-        "--wires-per-block", type=int, default=2, help="wires per block int static mode"
-    )
+
     parser.add_argument(
         "--epochs", type=int, default=30, help="number of training epochs"
    )
@@ -244,77 +253,54 @@ def main():
    model.measure.set_v_c_reg_mapping(get_v_c_reg_mapping(circ_transpiled))
    model.q_layer = q_layer
 
+    # noise injection: initialize the noise model, which will inject noise into the gates
    noise_model_tq = tq.NoiseModelTQ(
        noise_model_name="ibmq_quito",
        n_epochs=n_epochs,
        # noise_total_prob=0.5,
        # ignored_ops=configs.trainer.ignored_noise_ops,
-        factor=1,
+        factor=10,
        add_thermal=True,
    )
    noise_model_tq.is_add_noise = True
-    # noise_model_tq.v_c_reg_mapping = {'v2c': {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6},
-    #                                   'c2v': {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6},
-    #                                   }
-    # noise_model_tq.p_c_reg_mapping = {'p2c': {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6},
-    #                                   'c2p': {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6},
-    #                                   }
-    # noise_model_tq.p_v_reg_mapping ={'p2v': {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6},
-    #                                  'v2p': {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6},
-    #                                  }
    noise_model_tq.v_c_reg_mapping = get_v_c_reg_mapping(circ_transpiled)
    noise_model_tq.p_c_reg_mapping = get_p_c_reg_mapping(circ_transpiled)
    noise_model_tq.p_v_reg_mapping = get_p_v_reg_mapping(circ_transpiled)
-    model.set_noise_model_tq(noise_model_tq)
+    # model.set_noise_model_tq(noise_model_tq)
 
    optimizer = optim.Adam(model.parameters(), lr=5e-3, weight_decay=1e-4)
    scheduler = CosineAnnealingLR(optimizer, T_max=n_epochs)
 
-    if args.static:
-        # optionally to switch to the static mode, which can bring speedup
-        # on training
-        model.q_layer.static_on(wires_per_block=args.wires_per_block)
+    # post-training quantization: in this model there is only one node, i.e. the output of the quantum
+    # layer is not encoded again into a later quantum layer; post-training quantization is more
+    # effective for multi-node models.
+    quantizer = PACTActivationQuantizer(
+        module=model,
+        precision=4,
+        alpha=1.0,
+        backprop_alpha=False,
+        device=device,
+        lower_bound=-5,
+        upper_bound=5,
+    )
 
    for epoch in range(1, n_epochs + 1):
        # train
        print(f"Epoch {epoch}:")
+        quantizer.register_hook()
        train(dataflow, model, device, optimizer)
        print(optimizer.param_groups[0]["lr"])
 
        # valid
        valid_test(dataflow, "valid", model, device)
        scheduler.step()
+        quantizer.remove_hook()
+        print(noise_model_tq.noise_counter)
 
    # test
    valid_test(dataflow, "test", model, device, qiskit=False)
 
-    # run on Qiskit simulator and real Quantum Computers
-    try:
-        from qiskit import IBMQ
-
-        # firstly perform simulate
-        print(f"\nTest with Qiskit Simulator")
-        backend_name = "ibmq_quito"
-        processor_simulation = QiskitProcessor(use_real_qc=False, noise_model_name=backend_name)
-        model.set_qiskit_processor(processor_simulation)
-        valid_test(dataflow, "test", model, device, qiskit=True)
-        # valid_test(dataflow, "valid", model, device, qiskit=True)
-
-        # then try to run on REAL QC
-        print(f"\nTest on Real Quantum Computer {backend_name}")
-        processor_real_qc = QiskitProcessor(use_real_qc=True, backend_name=backend_name)
-        model.set_qiskit_processor(processor_real_qc)
-        valid_test(dataflow, "test", model, device, qiskit=True)
-    except ImportError:
-        print(
-            "Please install qiskit, create an IBM Q Experience Account and "
-            "save the account token according to the instruction at "
-            "'https://github.com/Qiskit/qiskit-ibmq-provider', "
-            "then try again."
-        )
-
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
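
Editor's note: for readers who want the numerics without tracing the autograd plumbing, below is a minimal standalone sketch of the fake-quantization step that PACTQuantFunc.forward applies to the measurement outputs: clamp to [lower_bound, upper_bound], snap to one of `level` evenly spaced values, and, when QuantNoise is active during training, leave a random subset of elements at full precision. The helper name fake_quantize and the sample values are illustrative only, and the sketch omits the straight-through backward pass that the real torch.autograd.Function provides.

import torch

def fake_quantize(x, level=16, lower_bound=-5.0, upper_bound=5.0,
                  quant_ratio=1.0, training=True):
    # quantization step for `level` uniformly spaced values in [lower_bound, upper_bound]
    eps = (upper_bound - lower_bound) / (level - 1)
    # clamp, shift so the grid starts at 0, round onto the grid, shift back
    x_c = x.clamp(lower_bound, upper_bound) - lower_bound
    x_q = (x_c / eps).round() * eps + lower_bound
    # QuantNoise: during training, keep each element at full precision with
    # probability (1 - quant_ratio); at inference everything is quantized
    if quant_ratio < 1 and training:
        keep_full_precision = torch.rand_like(x) < (1 - quant_ratio)
        x_q = torch.where(keep_full_precision, x, x_q)
    return x_q

# 4-bit-style quantization of a batch of measurement-like values
x = torch.randn(4, 4)
print(fake_quantize(x, level=2 ** 4, quant_ratio=0.8))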
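
The quantizer attaches itself to an existing model through a forward hook rather than by wrapping the model's forward method: when a hook registered with register_forward_hook returns a tensor, that tensor replaces the module's output, which is how register_hook() injects pact_quantize after the measurement and how main() can toggle quantization per epoch with register_hook() / remove_hook(). A small sketch of that pattern follows; the Toy module and the rounding hook are illustrative stand-ins for the quantized model and quantize_hook.

import torch

class Toy(torch.nn.Module):
    def forward(self, x):
        return x + 0.3

model = Toy()

def round_output_hook(module, inputs, output):
    # returning a tensor from a forward hook replaces the module's output
    return output.round()

handle = model.register_forward_hook(round_output_hook)  # cf. quantizer.register_hook()
print(model(torch.randn(3)))  # output passes through the hook (rounded here)
handle.remove()               # cf. quantizer.remove_hook()
print(model(torch.randn(3)))  # original, un-hooked output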