From 6667caa56e98c3e5ba0dafd798e590d87572fb0d Mon Sep 17 00:00:00 2001 From: tankya2 Date: Wed, 24 Jan 2024 11:47:32 +0800 Subject: [PATCH] Format with black --- src/qibotn/QiboCircuitConvertor.py | 85 +++---- src/qibotn/backends.py | 41 ++-- src/qibotn/cutn.py | 368 +++++++++++++++++------------ 3 files changed, 277 insertions(+), 217 deletions(-) diff --git a/src/qibotn/QiboCircuitConvertor.py b/src/qibotn/QiboCircuitConvertor.py index 11aaa716..d3a0569a 100644 --- a/src/qibotn/QiboCircuitConvertor.py +++ b/src/qibotn/QiboCircuitConvertor.py @@ -95,8 +95,7 @@ def init_intermediate_circuit(self, circuit): required_shape = self.op_shape_from_qubits(len(gate_qubits)) self.gate_tensors.append( ( - cp.asarray(gate.matrix(), dtype=self.dtype).reshape( - required_shape), + cp.asarray(gate.matrix(), dtype=self.dtype).reshape(required_shape), gate_qubits, ) ) @@ -111,7 +110,6 @@ def init_basis_map(self, backend, dtype): self.basis_map = {"0": state_0, "1": state_1} - def init_inverse_circuit(self, circuit): self.gate_tensors_inverse = [] gates_qubits_inverse = [] @@ -132,14 +130,13 @@ def init_inverse_circuit(self, circuit): # self.active_qubits is to identify qubits with at least 1 gate acting on it in the whole circuit. self.active_qubits_inverse = np.unique(gates_qubits_inverse) - - - def get_pauli_gates(self, pauli_map, dtype='complex128', backend=cp): + + def get_pauli_gates(self, pauli_map, dtype="complex128", backend=cp): """ Populate the gates for all pauli operators. Args: - pauli_map: A dictionary mapping qubits to pauli operators. + pauli_map: A dictionary mapping qubits to pauli operators. dtype: Data type for the tensor operands. backend: The package the tensor operands belong to. @@ -147,70 +144,74 @@ def get_pauli_gates(self, pauli_map, dtype='complex128', backend=cp): A sequence of pauli gates. """ asarray = backend.asarray - pauli_i = asarray([[1,0], [0,1]], dtype=dtype) - pauli_x = asarray([[0,1], [1,0]], dtype=dtype) - pauli_y = asarray([[0,-1j], [1j,0]], dtype=dtype) - pauli_z = asarray([[1,0], [0,-1]], dtype=dtype) - - operand_map = {'I': pauli_i, - 'X': pauli_x, - 'Y': pauli_y, - 'Z': pauli_z} + pauli_i = asarray([[1, 0], [0, 1]], dtype=dtype) + pauli_x = asarray([[0, 1], [1, 0]], dtype=dtype) + pauli_y = asarray([[0, -1j], [1j, 0]], dtype=dtype) + pauli_z = asarray([[1, 0], [0, -1]], dtype=dtype) + + operand_map = {"I": pauli_i, "X": pauli_x, "Y": pauli_y, "Z": pauli_z} gates = [] for qubit, pauli_char in pauli_map.items(): operand = operand_map.get(pauli_char) if operand is None: - raise ValueError('pauli string character must be one of I/X/Y/Z') + raise ValueError("pauli string character must be one of I/X/Y/Z") gates.append((operand, (qubit,))) return gates def expectation_operands(self, pauli_string): - #assign pauli string to qubit - #_get_forward_inverse_metadata() - input_bitstring = "0" * self.circuit.nqubits #Need all qubits! + # assign pauli string to qubit + # _get_forward_inverse_metadata() + input_bitstring = "0" * self.circuit.nqubits # Need all qubits! 
input_operands = self._get_bitstring_tensors(input_bitstring) - pauli_string = dict(zip(range(self.circuit.nqubits), pauli_string)) + pauli_string = dict(zip(range(self.circuit.nqubits), pauli_string)) pauli_map = pauli_string coned_qubits = pauli_map.keys() - + ( mode_labels, qubits_frontier, next_frontier, ) = self._init_mode_labels_from_qubits(range(self.circuit.nqubits)) - + gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands( self.gate_tensors, qubits_frontier, next_frontier ) - + operands = input_operands + gate_operands mode_labels += gate_mode_labels - + self.init_inverse_circuit(self.circuit.invert()) - - + next_frontier = max(qubits_frontier.values()) + 1 - #input_mode_labels, input_operands, qubits_frontier, next_frontier, inverse_gates = self._get_forward_inverse_metadata(coned_qubits) + # input_mode_labels, input_operands, qubits_frontier, next_frontier, inverse_gates = self._get_forward_inverse_metadata(coned_qubits) + + pauli_gates = self.get_pauli_gates( + pauli_map, dtype=self.dtype, backend=self.backend + ) - pauli_gates = self.get_pauli_gates(pauli_map, dtype=self.dtype, backend=self.backend) - - gates_inverse = pauli_gates + self.gate_tensors_inverse - - gate_mode_labels_inverse, gate_operands_inverse = self._parse_gates_to_mode_labels_operands( + + ( + gate_mode_labels_inverse, + gate_operands_inverse, + ) = self._parse_gates_to_mode_labels_operands( gates_inverse, qubits_frontier, next_frontier ) - mode_labels = mode_labels + gate_mode_labels_inverse + [[qubits_frontier[ix]] for ix in range(self.circuit.nqubits)] - operands = operands + gate_operands_inverse + operands[:self.circuit.nqubits] - + mode_labels = ( + mode_labels + + gate_mode_labels_inverse + + [[qubits_frontier[ix]] for ix in range(self.circuit.nqubits)] + ) + operands = operands + gate_operands_inverse + operands[: self.circuit.nqubits] + operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y] - - #expec = contract(*operand_exp_interleave) - #print(expec) - ''' + # expec = contract(*operand_exp_interleave) + # print(expec) + + """ gate_mode_labels, gate_operands = circ_utils.parse_gates_to_mode_labels_operands(gates, qubits_frontier, next_frontier) @@ -220,5 +221,5 @@ def expectation_operands(self, pauli_string): output_mode_labels = [] expression = circ_utils.convert_mode_labels_to_expression(mode_labels, output_mode_labels) - ''' - return operand_exp_interleave \ No newline at end of file + """ + return operand_exp_interleave diff --git a/src/qibotn/backends.py b/src/qibotn/backends.py index 4b28431d..3728a999 100644 --- a/src/qibotn/backends.py +++ b/src/qibotn/backends.py @@ -19,8 +19,6 @@ def __init__(self, platform): or platform == "cu_tensornet_expectation" or platform == "cu_tensornet_nccl" or platform == "cu_tensornet_nccl_expectation" - - ): # pragma: no cover self.platform = platform else: @@ -72,45 +70,44 @@ def execute_circuit( state = cutn.eval_mps(circuit, gate_algo, self.dtype) if self.platform == "qu_tensornet": - - #init_state = np.random.random(2**circuit.nqubits) + 1j * np.random.random(2**circuit.nqubits) - #init_state = init_state / np.sqrt((np.abs(init_state) ** 2).sum()) + # init_state = np.random.random(2**circuit.nqubits) + 1j * np.random.random(2**circuit.nqubits) + # init_state = init_state / np.sqrt((np.abs(init_state) ** 2).sum()) init_state = np.zeros(2**circuit.nqubits, dtype=self.dtype) init_state[0] = 1.0 state = quimb.eval(circuit.to_qasm(), init_state, backend="numpy") - + if self.platform == "cu_tensornet_mpi": if 
initial_state is not None: raise_error(NotImplementedError, "QiboTN cannot support initial state.") - #state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) - state, rank = cutn.eval_tn_MPI_2(circuit, self.dtype,32) + # state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) + state, rank = cutn.eval_tn_MPI_2(circuit, self.dtype, 32) if rank > 0: state = np.array(0) - + if self.platform == "cu_tensornet_nccl": if initial_state is not None: raise_error(NotImplementedError, "QiboTN cannot support initial state.") - #state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) - state, rank = cutn.eval_tn_nccl(circuit, self.dtype,32) + # state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) + state, rank = cutn.eval_tn_nccl(circuit, self.dtype, 32) if rank > 0: state = np.array(0) - + if self.platform == "cu_tensornet_expectation": if initial_state is not None: raise_error(NotImplementedError, "QiboTN cannot support initial state.") - + state = cutn.eval_expectation(circuit, self.dtype) - + if self.platform == "cu_tensornet_mpi_expectation": if initial_state is not None: raise_error(NotImplementedError, "QiboTN cannot support initial state.") - #state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) - #state, rank = cutn.eval_tn_MPI_expectation(circuit, self.dtype,32) - state, rank = cutn.eval_tn_MPI_2_expectation(circuit, self.dtype,32) - + # state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) + # state, rank = cutn.eval_tn_MPI_expectation(circuit, self.dtype,32) + state, rank = cutn.eval_tn_MPI_2_expectation(circuit, self.dtype, 32) + if rank > 0: state = np.array(0) @@ -118,10 +115,10 @@ def execute_circuit( if initial_state is not None: raise_error(NotImplementedError, "QiboTN cannot support initial state.") - #state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) - #state, rank = cutn.eval_tn_MPI_expectation(circuit, self.dtype,32) - state, rank = cutn.eval_tn_nccl_expectation(circuit, self.dtype,32) - + # state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) + # state, rank = cutn.eval_tn_MPI_expectation(circuit, self.dtype,32) + state, rank = cutn.eval_tn_nccl_expectation(circuit, self.dtype, 32) + if rank > 0: state = np.array(0) diff --git a/src/qibotn/cutn.py b/src/qibotn/cutn.py index 67d70c49..aca33ff1 100644 --- a/src/qibotn/cutn.py +++ b/src/qibotn/cutn.py @@ -13,9 +13,13 @@ def eval(qibo_circ, datatype): myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) return contract(*myconvertor.state_vector_operands()) + def eval_expectation(qibo_circ, datatype): myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - return contract(*myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))) + return contract( + *myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits)) + ) + def eval_tn_MPI_2(qibo_circ, datatype, n_samples=8): from mpi4py import MPI # this line initializes MPI @@ -23,73 +27,79 @@ def eval_tn_MPI_2(qibo_circ, datatype, n_samples=8): from cuquantum import Network # Get the hostname - #hostname = socket.gethostname() - + # hostname = socket.gethostname() + root = 0 comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) device_id = rank % getDeviceCount() - - + # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - #mem_avail = 
cp.cuda.Device().mem_info[0] - #print("Mem avail: aft convetor",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft convetor",mem_avail, "rank =",rank) operands = myconvertor.state_vector_operands() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) - + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + # Broadcast the operand data. - #operands = comm.bcast(operands, root) - + # operands = comm.bcast(operands, root) + # Assign the device for each process. device_id = rank % getDeviceCount() - - #dev = cp.cuda.Device(device_id) - #free_mem, total_mem = dev.mem_info - #print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank) + + # dev = cp.cuda.Device(device_id) + # free_mem, total_mem = dev.mem_info + # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank) # Create network object. - network = Network(*operands, options={'device_id' : device_id}) + network = Network(*operands, options={"device_id": device_id}) # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. - path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}}) - #print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") + path, info = network.contract_path( + optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}} + ) + # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") # Select the best path from all ranks. opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) - #if rank == root: + # if rank == root: # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") # Broadcast info from the sender to all other ranks. info = comm.bcast(info, sender) # Set path and slices. - path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices}) + path, info = network.contract_path( + optimize={"path": info.path, "slicing": info.slices} + ) # Calculate this process's share of the slices. num_slices = info.num_slices chunk, extra = num_slices // size, num_slices % size slice_begin = rank * chunk + min(rank, extra) - slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + slice_end = ( + num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + ) slices = range(slice_begin, slice_end) - #print(f"Process {rank} is processing slice range: {slices}.") + # print(f"Process {rank} is processing slice range: {slices}.") # Contract the group of slices the process is responsible for. result = network.contract(slices=slices) - #print(f"Process {rank} result shape is : {result.shape}.") - #print(f"Process {rank} result size is : {result.nbytes}.") + # print(f"Process {rank} result shape is : {result.shape}.") + # print(f"Process {rank} result size is : {result.nbytes}.") # Sum the partial contribution from each process on root. 
result = comm.reduce(sendobj=result, op=MPI.SUM, root=root) - + return result, rank + def eval_tn_nccl(qibo_circ, datatype, n_samples=8): from mpi4py import MPI # this line initializes MPI import socket @@ -97,18 +107,18 @@ def eval_tn_nccl(qibo_circ, datatype, n_samples=8): from cupy.cuda import nccl # Get the hostname - #hostname = socket.gethostname() - + # hostname = socket.gethostname() + root = 0 comm_mpi = MPI.COMM_WORLD rank = comm_mpi.Get_rank() size = comm_mpi.Get_size() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) device_id = rank % getDeviceCount() - + cp.cuda.Device(device_id).use() - + # Set up the NCCL communicator. nccl_id = nccl.get_unique_id() if rank == root else None nccl_id = comm_mpi.bcast(nccl_id, root) @@ -116,51 +126,66 @@ def eval_tn_nccl(qibo_circ, datatype, n_samples=8): # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft convetor",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft convetor",mem_avail, "rank =",rank) operands = myconvertor.state_vector_operands() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) network = Network(*operands) # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. - path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}}) + path, info = network.contract_path( + optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}} + ) - #print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") + # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") # Select the best path from all ranks. opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) - #if rank == root: + # if rank == root: # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") # Broadcast info from the sender to all other ranks. info = comm_mpi.bcast(info, sender) # Set path and slices. - path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices}) + path, info = network.contract_path( + optimize={"path": info.path, "slicing": info.slices} + ) # Calculate this process's share of the slices. num_slices = info.num_slices chunk, extra = num_slices // size, num_slices % size slice_begin = rank * chunk + min(rank, extra) - slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + slice_end = ( + num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + ) slices = range(slice_begin, slice_end) - #print(f"Process {rank} is processing slice range: {slices}.") + # print(f"Process {rank} is processing slice range: {slices}.") # Contract the group of slices the process is responsible for. 
result = network.contract(slices=slices) - #print(f"Process {rank} result shape is : {result.shape}.") - #print(f"Process {rank} result size is : {result.nbytes}.") + # print(f"Process {rank} result shape is : {result.shape}.") + # print(f"Process {rank} result size is : {result.nbytes}.") # Sum the partial contribution from each process on root. stream_ptr = cp.cuda.get_current_stream().ptr - comm_nccl.reduce(result.data.ptr, result.data.ptr, result.size, nccl.NCCL_FLOAT64, nccl.NCCL_SUM, root, stream_ptr) - + comm_nccl.reduce( + result.data.ptr, + result.data.ptr, + result.size, + nccl.NCCL_FLOAT64, + nccl.NCCL_SUM, + root, + stream_ptr, + ) + return result, rank + def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8): from mpi4py import MPI # this line initializes MPI import socket @@ -168,18 +193,18 @@ def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8): from cupy.cuda import nccl # Get the hostname - #hostname = socket.gethostname() - + # hostname = socket.gethostname() + root = 0 comm_mpi = MPI.COMM_WORLD rank = comm_mpi.Get_rank() size = comm_mpi.Get_size() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) device_id = rank % getDeviceCount() - + cp.cuda.Device(device_id).use() - + # Set up the NCCL communicator. nccl_id = nccl.get_unique_id() if rank == root else None nccl_id = comm_mpi.bcast(nccl_id, root) @@ -187,50 +212,64 @@ def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8): # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft convetor",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft convetor",mem_avail, "rank =",rank) operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits)) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) network = Network(*operands) # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. - path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}}) + path, info = network.contract_path( + optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}} + ) - #print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") + # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") # Select the best path from all ranks. opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) - #if rank == root: + # if rank == root: # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") # Broadcast info from the sender to all other ranks. info = comm_mpi.bcast(info, sender) # Set path and slices. - path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices}) + path, info = network.contract_path( + optimize={"path": info.path, "slicing": info.slices} + ) # Calculate this process's share of the slices. 
num_slices = info.num_slices chunk, extra = num_slices // size, num_slices % size slice_begin = rank * chunk + min(rank, extra) - slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + slice_end = ( + num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + ) slices = range(slice_begin, slice_end) - #print(f"Process {rank} is processing slice range: {slices}.") + # print(f"Process {rank} is processing slice range: {slices}.") # Contract the group of slices the process is responsible for. result = network.contract(slices=slices) - #print(f"Process {rank} result shape is : {result.shape}.") - #print(f"Process {rank} result size is : {result.nbytes}.") + # print(f"Process {rank} result shape is : {result.shape}.") + # print(f"Process {rank} result size is : {result.nbytes}.") # Sum the partial contribution from each process on root. stream_ptr = cp.cuda.get_current_stream().ptr - comm_nccl.reduce(result.data.ptr, result.data.ptr, result.size, nccl.NCCL_FLOAT64, nccl.NCCL_SUM, root, stream_ptr) - + comm_nccl.reduce( + result.data.ptr, + result.data.ptr, + result.size, + nccl.NCCL_FLOAT64, + nccl.NCCL_SUM, + root, + stream_ptr, + ) + return result, rank @@ -240,128 +279,144 @@ def eval_tn_MPI_2_expectation(qibo_circ, datatype, n_samples=8): from cuquantum import Network # Get the hostname - #hostname = socket.gethostname() - + # hostname = socket.gethostname() + root = 0 comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) device_id = rank % getDeviceCount() - - + # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft convetor",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft convetor",mem_avail, "rank =",rank) operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits)) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) - + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + # Broadcast the operand data. - #operands = comm.bcast(operands, root) - + # operands = comm.bcast(operands, root) + # Assign the device for each process. device_id = rank % getDeviceCount() - - #dev = cp.cuda.Device(device_id) - #free_mem, total_mem = dev.mem_info - #print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank) + + # dev = cp.cuda.Device(device_id) + # free_mem, total_mem = dev.mem_info + # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank) # Create network object. - network = Network(*operands, options={'device_id' : device_id}) + network = Network(*operands, options={"device_id": device_id}) # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. 
- path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}}) - #print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") + path, info = network.contract_path( + optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}} + ) + # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") # Select the best path from all ranks. opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) - #if rank == root: + # if rank == root: # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") # Broadcast info from the sender to all other ranks. info = comm.bcast(info, sender) # Set path and slices. - path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices}) + path, info = network.contract_path( + optimize={"path": info.path, "slicing": info.slices} + ) # Calculate this process's share of the slices. num_slices = info.num_slices chunk, extra = num_slices // size, num_slices % size slice_begin = rank * chunk + min(rank, extra) - slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + slice_end = ( + num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + ) slices = range(slice_begin, slice_end) - #print(f"Process {rank} is processing slice range: {slices}.") + # print(f"Process {rank} is processing slice range: {slices}.") # Contract the group of slices the process is responsible for. result = network.contract(slices=slices) - #print(f"Process {rank} result shape is : {result.shape}.") - #print(f"Process {rank} result size is : {result.nbytes}.") + # print(f"Process {rank} result shape is : {result.shape}.") + # print(f"Process {rank} result size is : {result.nbytes}.") # Sum the partial contribution from each process on root. 
result = comm.reduce(sendobj=result, op=MPI.SUM, root=root) - + return result, rank def eval_tn_MPI_expectation(qibo_circ, datatype, n_samples=8): from mpi4py import MPI # this line initializes MPI import socket + # Get the hostname - #hostname = socket.gethostname() - + # hostname = socket.gethostname() + ncpu_threads = multiprocessing.cpu_count() // 2 - + comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) device_id = rank % getDeviceCount() cp.cuda.Device(device_id).use() handle = cutn.create() network_opts = cutn.NetworkOptions(handle=handle, blocking="auto") - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft network opts",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft network opts",mem_avail, "rank =",rank) cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm)) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank) # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - operands_interleave = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits)) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft convetor",mem_avail, "rank =",rank) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + operands_interleave = myconvertor.expectation_operands( + PauliStringGen(qibo_circ.nqubits) + ) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft convetor",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) # Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object. 
network = cutn.Network(*operands_interleave, options=network_opts) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank) - path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}}) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft contract path",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank) + path, opt_info = network.contract_path( + optimize={ + "samples": n_samples, + "threads": ncpu_threads, + "slicing": {"min_slices": max(16, size)}, + } + ) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft contract path",mem_avail, "rank =",rank) # Execution: To execute the contraction using the optimal path found previously - #print("opt_cost",opt_info.opt_cost, "Process =",rank) - - - num_slices = opt_info.num_slices#Andy - chunk, extra = num_slices // size, num_slices % size#Andy - slice_begin = rank * chunk + min(rank, extra)#Andy - slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy - slices = range(slice_begin, slice_end)#Andy + # print("opt_cost",opt_info.opt_cost, "Process =",rank) + + num_slices = opt_info.num_slices # Andy + chunk, extra = num_slices // size, num_slices % size # Andy + slice_begin = rank * chunk + min(rank, extra) # Andy + slice_end = ( + num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + ) # Andy + slices = range(slice_begin, slice_end) # Andy result = network.contract(slices=slices) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft contract",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft contract",mem_avail, "rank =",rank) cutn.destroy(handle) return result, rank + def eval_tn_MPI(qibo_circ, datatype, n_samples=8): """Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through MPI. The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution. 
@@ -371,45 +426,52 @@ def eval_tn_MPI(qibo_circ, datatype, n_samples=8): from mpi4py import MPI # this line initializes MPI import socket + # Get the hostname - #hostname = socket.gethostname() - + # hostname = socket.gethostname() + ncpu_threads = multiprocessing.cpu_count() // 2 - + comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) device_id = rank % getDeviceCount() cp.cuda.Device(device_id).use() handle = cutn.create() network_opts = cutn.NetworkOptions(handle=handle, blocking="auto") - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft network opts",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft network opts",mem_avail, "rank =",rank) cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm)) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank) # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft convetor",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft convetor",mem_avail, "rank =",rank) operands_interleave = myconvertor.state_vector_operands() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) # Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object. 
network = cutn.Network(*operands_interleave, options=network_opts) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank) - network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}}) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft contract path",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank) + network.contract_path( + optimize={ + "samples": n_samples, + "threads": ncpu_threads, + "slicing": {"min_slices": max(16, size)}, + } + ) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft contract path",mem_avail, "rank =",rank) # Execution: To execute the contraction using the optimal path found previously - #print("opt_cost",opt_info.opt_cost, "Process =",rank) + # print("opt_cost",opt_info.opt_cost, "Process =",rank) - ''' + """ path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}}) num_slices = opt_info.num_slices#Andy @@ -418,16 +480,16 @@ def eval_tn_MPI(qibo_circ, datatype, n_samples=8): slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy slices = range(slice_begin, slice_end)#Andy result = network.contract(slices=slices) - ''' + """ result = network.contract() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft contract",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft contract",mem_avail, "rank =",rank) cutn.destroy(handle) return result, rank - + def eval_mps(qibo_circ, gate_algo, datatype): myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype) mps_helper = MPSContractionHelper(myconvertor.num_qubits) @@ -436,18 +498,18 @@ def eval_mps(qibo_circ, gate_algo, datatype): myconvertor.mps_tensors, {"handle": myconvertor.handle} ) + def PauliStringGen(nqubits): - if nqubits <= 0: return "Invalid input. N should be a positive integer." - #characters = 'IXYZ' - characters = 'XXXZ' + # characters = 'IXYZ' + characters = "XXXZ" - result = '' + result = "" for i in range(nqubits): char_to_add = characters[i % len(characters)] result += char_to_add - return result \ No newline at end of file + return result
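
The expectation_operands() method in QiboCircuitConvertor.py interleaves the |0...0> input tensors, the forward gates, one Pauli operator per qubit, the inverted circuit, and the input tensors again, so contracting the resulting list evaluates <0|U† P U|0>. A minimal NumPy check of that identity for a single qubit, assuming U = H and P = Z (values chosen only for illustration, not taken from the patch):

import numpy as np

zero = np.array([1.0, 0.0], dtype=complex)           # |0>
H = np.array([[1, 1], [1, -1]], dtype=complex) / np.sqrt(2)
Z = np.diag([1.0, -1.0]).astype(complex)              # Pauli Z

psi = H @ zero                                        # forward circuit applied to |0>
expval = np.vdot(psi, Z @ psi)                        # <0| H Z H |0>
print(expval.real)                                    # 0.0 for this choice of U and P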
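
Every MPI/NCCL evaluator in cutn.py repeats the same path-selection handshake: each rank runs the hyperoptimizer independently, the cheapest path is located with an MPI.MINLOC reduction, and the winning rank broadcasts its path and slicing so all ranks contract the same sliced network. A sketch of that pattern as a standalone helper, assuming a cuquantum Network already built from the operands; find_common_path is a hypothetical name, not part of the patch:

from mpi4py import MPI


def find_common_path(network, comm, min_slices=32):
    # Hypothetical helper mirroring the pattern in eval_tn_MPI_2 and the NCCL
    # variants; `network` is a cuquantum.Network built from the interleaved operands.
    rank, size = comm.Get_rank(), comm.Get_size()

    # Each rank searches independently; slicing is forced so the contraction
    # can later be split across ranks.
    path, info = network.contract_path(
        optimize={"samples": 8, "slicing": {"min_slices": max(min_slices, size)}}
    )

    # MINLOC selects the (FLOP count, rank) pair with the lowest cost ...
    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)

    # ... and the winner broadcasts its result so every rank uses the same
    # path and the same slice decomposition.
    info = comm.bcast(info, sender)
    return network.contract_path(optimize={"path": info.path, "slicing": info.slices})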
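
After the common path is fixed, each rank contracts only its share of the slices. The partitioning arithmetic that recurs in the distributed evaluators can be isolated as below; the values in the usage example are illustrative only:

def slice_range(rank, size, num_slices):
    # The first `extra` ranks take one additional slice each; for the last
    # rank the explicit `num_slices` endpoint is equivalent to the general formula.
    chunk, extra = num_slices // size, num_slices % size
    slice_begin = rank * chunk + min(rank, extra)
    slice_end = (
        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
    )
    return range(slice_begin, slice_end)


if __name__ == "__main__":
    num_slices, size = 34, 4
    # Every slice is assigned to exactly one rank.
    assert sum(len(slice_range(r, size, num_slices)) for r in range(size)) == num_slices
    for r in range(size):
        print(r, list(slice_range(r, size, num_slices)))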
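
PauliStringGen() builds the observable for the expectation evaluators by cycling through the fixed pattern "XXXZ" (the "IXYZ" variant is left commented out), one character per qubit. A compact equivalent with an illustrative call; raising on invalid input is shown as an alternative to the error string returned in the source:

def pauli_string(nqubits, characters="XXXZ"):
    if nqubits <= 0:
        raise ValueError("nqubits must be a positive integer")
    # Repeat the pattern until every qubit has a Pauli label.
    return "".join(characters[i % len(characters)] for i in range(nqubits))


print(pauli_string(6))  # -> "XXXZXX"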