diff --git a/examples/quantumnat/quantize.py b/examples/quantumnat/quantize.py
new file mode 100644
index 00000000..ff305952
--- /dev/null
+++ b/examples/quantumnat/quantize.py
@@ -0,0 +1,298 @@
+'''
+Description:
+Author: Jiaqi Gu (jqgu@utexas.edu)
+Date: 2021-05-09 21:28:08
+LastEditors: Jiaqi Gu (jqgu@utexas.edu)
+LastEditTime: 2021-05-11 01:19:11
+'''
+from typing import Optional
+
+import torch
+from torch import Tensor
+from torch.types import Device
+from torchpack.utils.logging import logger
+
+
+__all__ = ["PACTActivationQuantizer"]
+
+
+# PACT activation: https://arxiv.org/pdf/1805.06085.pdf
+
+
+class PACTQuantFunc(torch.autograd.Function):
+    r"""PACT (PArametrized Clipping acTivation) quantization function for activations.
+    Implements a :py:class:`torch.autograd.Function` that quantizes activations to ``level`` uniformly
+    spaced values using a PACT-style strategy. In forward propagation, the function is defined as
+
+    .. math::
+        \mathbf{y} = f(\mathbf{x}) = \mathrm{round}\left(\frac{\mathrm{clip}_{[l, u]}(\mathbf{x}) - l}{\varepsilon}\right) \cdot \varepsilon + l
+
+    where :math:`l` and :math:`u` are the lower and upper clipping bounds and :math:`\varepsilon` is the
+    quantization step for :math:`L` levels:
+
+    .. math::
+        \varepsilon = (u - l) / (L - 1)
+
+    (The original PACT formulation clips to :math:`[0, \alpha)` with :math:`\varepsilon = \alpha / (2^Q - 1)`.)
+    In backward propagation, using the Straight-Through Estimator, the gradient of the function is defined as
+
+    .. math::
+        \mathbf{\nabla}_\mathbf{x} \mathcal{L} \doteq \mathbf{\nabla}_\mathbf{y} \mathcal{L}
+
+    It can be applied by using its static `.apply` method:
+
+    :param input: the tensor containing :math:`x`, the activations to be quantized.
+    :type input: `torch.Tensor`
+    :param level: the number of quantization levels :math:`L`.
+    :type level: int
+    :param alpha: the value of :math:`\alpha` (kept for the PACT formulation; unused in the forward pass).
+    :type alpha: `torch.Tensor` or float
+    :param quant_noise_mask: boolean mask of elements kept at full precision (QuantNoise); `None` quantizes all elements.
+    :type quant_noise_mask: `torch.Tensor` or None
+    :param lower_bound: the lower clipping bound :math:`l`.
+    :type lower_bound: float
+    :param upper_bound: the upper clipping bound :math:`u`.
+    :type upper_bound: float
+
+    :return: The quantized input activations tensor.
+    :rtype: `torch.Tensor`
+    """
+
+    @staticmethod
+    def forward(ctx, input, level, alpha, quant_noise_mask, lower_bound,
+                upper_bound):
+        # where_input_clipped = (input < -1) | (input > alpha)
+        # where_input_ltalpha = (input < alpha)
+        # ctx.save_for_backward(where_input_clipped, where_input_ltalpha)
+        # upper_thres = alpha.data[0]-eps.data[0]
+        input = input.clamp(lower_bound, upper_bound)
+        input = input - lower_bound
+        eps = (upper_bound - lower_bound) / (level - 1)
+        input_q = (input / eps).round() * eps + lower_bound
+
+        # input_q = input.div(eps).floor_().mul_(eps)
+
+        if quant_noise_mask is not None:
+            return input_q.data.sub_(input.data).masked_fill_(quant_noise_mask, 0).add_(input)
+        else:
+            return input_q
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # see Hubara et al., Section 2.3
+        # where_input_clipped, where_input_ltalpha = ctx.saved_tensors
+        # grad_input = grad_output.masked_fill(where_input_clipped, 0)
+        # if ctx.needs_input_grad[2]:
+        #     grad_alpha = grad_output.masked_fill(
+        #         where_input_ltalpha, 0).sum().expand(1)
+        # else:
+        #     grad_alpha = None
+        grad_input = grad_output
+        return grad_input, None, None, None, None, None
+
+
+pact_quantize = PACTQuantFunc.apply
+
+
+class PACTActivationQuantizer(torch.nn.Module):
+    r"""PACT (PArametrized Clipping acTivation) activation quantizer.
+    A :py:class:`torch.nn.Module` implementing PACT-style activation quantization. It is meant to replace
+    :py:class:`torch.nn.ReLU`, :py:class:`torch.nn.ReLU6` and
+    similar activations in a PACT-quantized network.
+    This layer can also operate in a special mode, defined by the `statistics_only` member, in which the layer runs in
+    forward-prop without quantization, collecting statistics on the activations that can then be
+    used to reset the value of :math:`\alpha`.
+    In this mode, the layer collects:
+    - tensor-wise maximum value ever seen
+    - running average with momentum 0.9
+    - running variance with momentum 0.9
+    """
+
+    def __init__(self, module: torch.nn.Module, precision: Optional[float]=None, level=None, alpha: float = 1.0, backprop_alpha: bool = True,
+                 statistics_only: bool = False, leaky: Optional[float] =
+                 None, quant_ratio: float = 1.0, device: Device =
+                 torch.device("cuda"), lower_bound=-2, upper_bound=2) -> None:
+        r"""Constructor. Initializes a :py:class:`torch.nn.Parameter` for :math:`\alpha` and sets
+        up the initial value of the `statistics_only` member.
+        :param module: the module whose output activations are quantized via a forward hook.
+        :type module: `torch.nn.Module`
+        :param precision: quantization bitwidth :math:`Q` (default `None`; ignored if `level` is given).
+        :type precision: int
+        :param level: number of quantization levels; overrides `precision` when set (default `None`).
+        :type level: int
+        :param alpha: the value of :math:`\alpha`.
+        :type alpha: `torch.Tensor` or float
+        :param backprop_alpha: default `True`; if `False`, do not update the value of :math:`\alpha` with backpropagation.
+        :type backprop_alpha: bool
+        :param statistics_only: initialization value of the `statistics_only` member.
+        :type statistics_only: bool
+        :param leaky: negative slope of the leaky ReLU used in statistics mode; `None` (default) uses a plain ReLU.
+        :type leaky: float
+        :param quant_ratio: quantization ratio used in QuantNoise [ICLR'21].
+        :type quant_ratio: float
+        :param lower_bound: lower clipping bound for quantization.
+        :type lower_bound: float
+        :param upper_bound: upper clipping bound for quantization.
+        :type upper_bound: float
+        """
+
+        super().__init__()
+        self.module = module
+        self.precision = precision
+        self.level = level
+        self.device = device
+        self.alpha = torch.nn.Parameter(torch.tensor(
+            (alpha,), device=device), requires_grad=backprop_alpha)
+        self.alpha_p = alpha
+        self.statistics_only = statistics_only
+        self.deployment = False
+        self.eps_in = None
+        self.leaky = leaky
+        self.lower_bound = lower_bound
+        self.upper_bound = upper_bound
+
+        # these are only used to gather statistics
+        self.max = torch.nn.Parameter(torch.zeros_like(
+            self.alpha.data), requires_grad=False)
+        self.min = torch.nn.Parameter(torch.zeros_like(
+            self.alpha.data), requires_grad=False)
+        self.running_mean = torch.nn.Parameter(
+            torch.zeros_like(self.alpha.data), requires_grad=False)
+        self.running_var = torch.nn.Parameter(
+            torch.ones_like(self.alpha.data), requires_grad=False)
+
+        self.precise = False
+
+        # set quant noise ratio
+        self.set_quant_ratio(quant_ratio)
+
+        ## quantization hook
+        self.handle = None
+        # self.register_hook()
+
+    def set_static_precision(self, limit_at_32_bits: bool = True, **kwargs) -> None:
+        r"""Sets static parameters used only for deployment.
+        """
+        # item() --> conversion to float
+        # apparently causes a slight, but not invisible, numerical divergence
+        # between FQ and QD stages
+        self.eps_static = self.alpha.clone().detach()/(2.0**(self.precision)-1)
+        self.alpha_static = self.alpha.clone().detach()
+        # D is selected as a power-of-two
+        D = 2.0**torch.ceil(torch.log2(self.requantization_factor *
+                                       self.eps_static / self.eps_in))
+        if not limit_at_32_bits:
+            self.D = D
+        else:
+            self.D = min(D, 2.0**(32-1-(self.precision)))
+
+    def get_output_eps(self, eps_in: Tensor) -> Tensor:
+        r"""Get the output quantum (:math:`\varepsilon`) given the input one.
+        :param eps_in: input quantum :math:`\varepsilon_{in}`.
+        :type eps_in: :py:class:`torch.Tensor`
+        :return: output quantum :math:`\varepsilon_{out}`.
+        :rtype: :py:class:`torch.Tensor`
+        """
+
+        return self.alpha/(2.0**(self.precision)-1)
+
+    def reset_alpha(self, use_max: bool = True, nb_std: float = 5.0) -> None:
+        r"""Reset the value of :math:`\alpha`. If `use_max` is `True`, then the highest tensor-wise value collected
+        in the statistics collection phase is used. If `False`, the collected standard deviation multiplied by
+        `nb_std` is used instead.
+        :param use_max: if True, use the tensor-wise maximum value collected in the statistics run as new :math:`\alpha` (default True).
+        :type use_max: bool
+        :param nb_std: number of standard deviations to be used to initialize :math:`\alpha` if `use_max` is False.
+        :type nb_std: float
+        """
+
+        if use_max:
+            self.alpha.data.copy_(self.max)
+        else:
+            self.alpha.data.copy_(self.running_var.data.sqrt().mul(nb_std))
+
+    def get_statistics(self):
+        r"""Returns the statistics collected up to now.
+
+        :return: The collected statistics (maximum, running average, running variance).
+        :rtype: tuple of floats
+        """
+        return self.max.item(), self.running_mean.item(), self.running_var.item()
+
+    def set_quant_ratio(self, quant_ratio=None):
+        if(quant_ratio is None):
+            # get recommended value
+            quant_ratio = [None, 0.2, 0.3, 0.4, 0.5, 0.55, 0.6, 0.7, 0.8, 0.83,
+                           0.86, 0.89, 0.92, 0.95, 0.98, 0.99, 1][min(self.precision, 16)]
+        assert 0 <= quant_ratio <= 1, logger.error(
+            f"Wrong quant ratio. Must be in [0, 1], but got {quant_ratio}")
+        self.quant_ratio = quant_ratio
+
+    def register_hook(self):
+
+        def quantize_hook(module, x, y):
+            r"""Forward hook for PACT-quantized activations.
+
+            See :py:class:`PACTQuantFunc` for details on the quantization performed by this hook.
+            In statistics mode, it uses a normal ReLU and collects statistics in the background.
+            :param x: input activations tensor.
+            :type x: :py:class:`torch.Tensor`
+
+            :return: output activations tensor.
+            :rtype: :py:class:`torch.Tensor`
+            """
+
+            if self.statistics_only:
+                if self.leaky is None:
+                    y = torch.nn.functional.relu(y, inplace=True)
+                else:
+                    y = torch.nn.functional.leaky_relu(y, self.leaky, inplace=True)
+                with torch.no_grad():
+                    self.max[:] = max(self.max.item(), y.max())
+                    self.min[:] = min(self.min.item(), y.min())
+                    self.running_mean[:] = 0.9 * \
+                        self.running_mean.item() + 0.1 * y.mean()
+                    self.running_var[:] = 0.9 * \
+                        self.running_var.item() + 0.1 * y.std()*y.std()
+                return y
+            else:
+                # QuantNoise ICLR 2021
+                if(self.quant_ratio < 1 and module.training):
+                    # implementation from fairseq
+                    # must fully quantize during inference
+                    quant_noise_mask = torch.empty_like(
+                        y, dtype=torch.bool).bernoulli_(1-self.quant_ratio)
+                else:
+                    quant_noise_mask = None
+                if self.level is not None:
+                    level = self.level
+                else:
+                    level = 2 ** self.precision
+                # eps = self.alpha/(2.0**(self.precision)-1)
+                return pact_quantize(y, level, self.alpha, quant_noise_mask,
+                                     self.lower_bound, self.upper_bound)
+
+        # register hook
+        self.handle = self.module.register_forward_hook(quantize_hook)
+        return self.handle
+
+    def remove_hook(self) -> None:
+        ## remove the forward hook
+        if(self.handle is not None):
+            self.handle.remove()
+
+
+if __name__ == "__main__":
+    import pdb
+    pdb.set_trace()
+    device = torch.device("cuda")
+    class Model(torch.nn.Module):
+        def __init__(self) -> None:
+            super().__init__()
+
+        def forward(self, x):
+            y = x + 0.3
+            return y
+    model = Model().to(device)
+    model.train()
+    quantizer = PACTActivationQuantizer(module=model, precision=4,
+                                        quant_ratio=0.1, device=device,
+                                        backprop_alpha=False)
+    quantizer.set_quant_ratio(0.8)
+    torch.manual_seed(10)
+    torch.cuda.manual_seed_all(10)
+    x = torch.randn(4,4, device=device, requires_grad=True)
+    y = model(x)
+    loss = y.sum()
+    loss.backward()
+    print(x)
+    print(y)
+    print(quantizer.alpha.data)
+    print(quantizer.alpha.grad)
+
+
\ No newline at end of file
diff --git a/examples/quantumnat/quantumnat.py b/examples/quantumnat/quantumnat.py
index 14ca8182..35a7042e 100644
--- a/examples/quantumnat/quantumnat.py
+++ b/examples/quantumnat/quantumnat.py
@@ -54,6 +54,8 @@ import random
 
 import numpy as np
 
+from quantize import PACTActivationQuantizer
+
 
 class QFCModel(tq.QuantumModule):
     class QLayer(tq.QuantumModule):
@@ -101,6 +103,7 @@ def __init__(self):
 
         self.q_layer = self.QLayer()
         self.measure = tq.MeasureAll(tq.PauliZ)
+        self.norm = torch.nn.BatchNorm1d(self.n_wires)
 
     def forward(self, x, use_qiskit=False):
         qdev = tq.QuantumDevice(n_wires=self.n_wires, bsz=x.shape[0], device=x.device, record_op=True)
@@ -125,9 +128,9 @@ def forward(self, x, use_qiskit=False):
             self.encoder(qdev, x)
             self.q_layer(qdev)
             x = self.measure(qdev)
-
-        x = x.reshape(bsz, 2, 2).sum(-1).squeeze()
-        x = F.log_softmax(x, dim=1)
+
+        # simplified version of post-measurement normalization, implemented with batch norm
+        x = self.norm(x)
 
         return x
 
@@ -138,6 +141,11 @@ def train(dataflow, model, device, optimizer):
         targets = feed_dict["digit"].to(device)
 
         outputs = model(inputs)
+
+        bsz = outputs.shape[0]
+        outputs = outputs.reshape(bsz, 2, 2).sum(-1).squeeze()
+        outputs = F.log_softmax(outputs, dim=1)
+
         loss = F.nll_loss(outputs, targets)
         optimizer.zero_grad()
         loss.backward()
@@ -154,6 +162,9 @@ def valid_test(dataflow, split, model, device, qiskit=False):
             targets = feed_dict["digit"].to(device)
 
             outputs = model(inputs, use_qiskit=qiskit)
+            bsz = outputs.shape[0]
+            outputs = outputs.reshape(bsz, 2, 2).sum(-1).squeeze()
+            outputs = F.log_softmax(outputs, dim=1)
 
             target_all.append(targets)
             output_all.append(outputs)
 
@@ -177,9 +188,7 @@ def main():
         "--static", action="store_true", help="compute with " "static mode"
     )
     parser.add_argument("--pdb", action="store_true", help="debug with pdb")
-    parser.add_argument(
-        "--wires-per-block", type=int, default=2, help="wires per block int static mode"
-    )
+
     parser.add_argument(
         "--epochs", type=int, default=30, help="number of training epochs"
    )
@@ -244,77 +253,54 @@ def main():
    model.measure.set_v_c_reg_mapping(get_v_c_reg_mapping(circ_transpiled))
    model.q_layer = q_layer
 
+    # noise injection: initialize the noise model, which will inject noise into the gates
    noise_model_tq = tq.NoiseModelTQ(
        noise_model_name="ibmq_quito",
        n_epochs=n_epochs,
        # noise_total_prob=0.5,
        # ignored_ops=configs.trainer.ignored_noise_ops,
-        factor=1,
+        factor=10,
        add_thermal=True,
    )
    noise_model_tq.is_add_noise = True
-    # noise_model_tq.v_c_reg_mapping = {'v2c': {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6},
-    #                                   'c2v': {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6},
-    #                                   }
-    # noise_model_tq.p_c_reg_mapping = {'p2c': {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6},
-    #                                   'c2p': {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6},
-    #                                   }
-    # noise_model_tq.p_v_reg_mapping ={'p2v': {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6},
-    #                                  'v2p': {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6},
-    #                                  }
    noise_model_tq.v_c_reg_mapping = get_v_c_reg_mapping(circ_transpiled)
    noise_model_tq.p_c_reg_mapping = get_p_c_reg_mapping(circ_transpiled)
    noise_model_tq.p_v_reg_mapping = get_p_v_reg_mapping(circ_transpiled)
-    model.set_noise_model_tq(noise_model_tq)
+    # model.set_noise_model_tq(noise_model_tq)
 
    optimizer = optim.Adam(model.parameters(), lr=5e-3, weight_decay=1e-4)
    scheduler = CosineAnnealingLR(optimizer, T_max=n_epochs)
 
-    if args.static:
-        # optionally to switch to the static mode, which can bring speedup
-        # on training
-        model.q_layer.static_on(wires_per_block=args.wires_per_block)
+    # post-training quantization: in this model there is only one node, i.e. the output of the quantum
+    # layer is not encoded again into a later quantum layer; post-training quantization is more
+    # effective for multi-node models.
+    quantizer = PACTActivationQuantizer(
+        module=model,
+        precision=4,
+        alpha=1.0,
+        backprop_alpha=False,
+        device=device,
+        lower_bound=-5,
+        upper_bound=5,
+    )
 
    for epoch in range(1, n_epochs + 1):
        # train
        print(f"Epoch {epoch}:")
+        quantizer.register_hook()
        train(dataflow, model, device, optimizer)
        print(optimizer.param_groups[0]["lr"])
 
        # valid
        valid_test(dataflow, "valid", model, device)
        scheduler.step()
+        quantizer.remove_hook()
+        print(noise_model_tq.noise_counter)
 
    # test
    valid_test(dataflow, "test", model, device, qiskit=False)
 
-    # run on Qiskit simulator and real Quantum Computers
-    try:
-        from qiskit import IBMQ
-
-        # firstly perform simulate
-        print(f"\nTest with Qiskit Simulator")
-        backend_name = "ibmq_quito"
-        processor_simulation = QiskitProcessor(use_real_qc=False, noise_model_name=backend_name)
-        model.set_qiskit_processor(processor_simulation)
-        valid_test(dataflow, "test", model, device, qiskit=True)
-        # valid_test(dataflow, "valid", model, device, qiskit=True)
-
-        # then try to run on REAL QC
-        print(f"\nTest on Real Quantum Computer {backend_name}")
-        processor_real_qc = QiskitProcessor(use_real_qc=True, backend_name=backend_name)
-        model.set_qiskit_processor(processor_real_qc)
-        valid_test(dataflow, "test", model, device, qiskit=True)
-    except ImportError:
-        print(
-            "Please install qiskit, create an IBM Q Experience Account and "
-            "save the account token according to the instruction at "
-            "'https://github.com/Qiskit/qiskit-ibmq-provider', "
-            "then try again."
-        )
-
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
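
Editor's note: for readers who want the numerics without tracing the autograd plumbing, below is a minimal standalone sketch of the fake-quantization step that PACTQuantFunc.forward applies to the measurement outputs: clamp to [lower_bound, upper_bound], snap to one of `level` evenly spaced values, and, when QuantNoise is active during training, leave a random subset of elements at full precision. The helper name fake_quantize and the sample values are illustrative only, and the sketch omits the straight-through backward pass that the real torch.autograd.Function provides.

import torch

def fake_quantize(x, level=16, lower_bound=-5.0, upper_bound=5.0,
                  quant_ratio=1.0, training=True):
    # quantization step for `level` uniformly spaced values in [lower_bound, upper_bound]
    eps = (upper_bound - lower_bound) / (level - 1)
    # clamp, shift so the grid starts at 0, round onto the grid, shift back
    x_c = x.clamp(lower_bound, upper_bound) - lower_bound
    x_q = (x_c / eps).round() * eps + lower_bound
    # QuantNoise: during training, keep each element at full precision with
    # probability (1 - quant_ratio); at inference everything is quantized
    if quant_ratio < 1 and training:
        keep_full_precision = torch.rand_like(x) < (1 - quant_ratio)
        x_q = torch.where(keep_full_precision, x, x_q)
    return x_q

# 4-bit-style quantization of a batch of measurement-like values
x = torch.randn(4, 4)
print(fake_quantize(x, level=2 ** 4, quant_ratio=0.8))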
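
The quantizer attaches itself to an existing model through a forward hook rather than by wrapping the model's forward method: when a hook registered with register_forward_hook returns a tensor, that tensor replaces the module's output, which is how register_hook() injects pact_quantize after the measurement and how main() can toggle quantization per epoch with register_hook() / remove_hook(). A small sketch of that pattern follows; the Toy module and the rounding hook are illustrative stand-ins for the quantized model and quantize_hook.

import torch

class Toy(torch.nn.Module):
    def forward(self, x):
        return x + 0.3

model = Toy()

def round_output_hook(module, inputs, output):
    # returning a tensor from a forward hook replaces the module's output
    return output.round()

handle = model.register_forward_hook(round_output_hook)  # cf. quantizer.register_hook()
print(model(torch.randn(3)))  # output passes through the hook (rounded here)
handle.remove()               # cf. quantizer.remove_hook()
print(model(torch.randn(3)))  # original, un-hooked output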