From 6667caa56e98c3e5ba0dafd798e590d87572fb0d Mon Sep 17 00:00:00 2001 From: tankya2 Date: Wed, 24 Jan 2024 11:47:32 +0800 Subject: [PATCH] Format with black --- src/qibotn/QiboCircuitConvertor.py | 85 +++---- src/qibotn/backends.py | 41 ++-- src/qibotn/cutn.py | 368 +++++++++++++++++------------ 3 files changed, 277 insertions(+), 217 deletions(-) diff --git a/src/qibotn/QiboCircuitConvertor.py b/src/qibotn/QiboCircuitConvertor.py index 11aaa716..d3a0569a 100644 --- a/src/qibotn/QiboCircuitConvertor.py +++ b/src/qibotn/QiboCircuitConvertor.py @@ -95,8 +95,7 @@ def init_intermediate_circuit(self, circuit): required_shape = self.op_shape_from_qubits(len(gate_qubits)) self.gate_tensors.append( ( - cp.asarray(gate.matrix(), dtype=self.dtype).reshape( - required_shape), + cp.asarray(gate.matrix(), dtype=self.dtype).reshape(required_shape), gate_qubits, ) ) @@ -111,7 +110,6 @@ def init_basis_map(self, backend, dtype): self.basis_map = {"0": state_0, "1": state_1} - def init_inverse_circuit(self, circuit): self.gate_tensors_inverse = [] gates_qubits_inverse = [] @@ -132,14 +130,13 @@ def init_inverse_circuit(self, circuit): # self.active_qubits is to identify qubits with at least 1 gate acting on it in the whole circuit. self.active_qubits_inverse = np.unique(gates_qubits_inverse) - - - def get_pauli_gates(self, pauli_map, dtype='complex128', backend=cp): + + def get_pauli_gates(self, pauli_map, dtype="complex128", backend=cp): """ Populate the gates for all pauli operators. Args: - pauli_map: A dictionary mapping qubits to pauli operators. + pauli_map: A dictionary mapping qubits to pauli operators. dtype: Data type for the tensor operands. backend: The package the tensor operands belong to. @@ -147,70 +144,74 @@ def get_pauli_gates(self, pauli_map, dtype='complex128', backend=cp): A sequence of pauli gates. """ asarray = backend.asarray - pauli_i = asarray([[1,0], [0,1]], dtype=dtype) - pauli_x = asarray([[0,1], [1,0]], dtype=dtype) - pauli_y = asarray([[0,-1j], [1j,0]], dtype=dtype) - pauli_z = asarray([[1,0], [0,-1]], dtype=dtype) - - operand_map = {'I': pauli_i, - 'X': pauli_x, - 'Y': pauli_y, - 'Z': pauli_z} + pauli_i = asarray([[1, 0], [0, 1]], dtype=dtype) + pauli_x = asarray([[0, 1], [1, 0]], dtype=dtype) + pauli_y = asarray([[0, -1j], [1j, 0]], dtype=dtype) + pauli_z = asarray([[1, 0], [0, -1]], dtype=dtype) + + operand_map = {"I": pauli_i, "X": pauli_x, "Y": pauli_y, "Z": pauli_z} gates = [] for qubit, pauli_char in pauli_map.items(): operand = operand_map.get(pauli_char) if operand is None: - raise ValueError('pauli string character must be one of I/X/Y/Z') + raise ValueError("pauli string character must be one of I/X/Y/Z") gates.append((operand, (qubit,))) return gates def expectation_operands(self, pauli_string): - #assign pauli string to qubit - #_get_forward_inverse_metadata() - input_bitstring = "0" * self.circuit.nqubits #Need all qubits! + # assign pauli string to qubit + # _get_forward_inverse_metadata() + input_bitstring = "0" * self.circuit.nqubits # Need all qubits! 
input_operands = self._get_bitstring_tensors(input_bitstring) - pauli_string = dict(zip(range(self.circuit.nqubits), pauli_string)) + pauli_string = dict(zip(range(self.circuit.nqubits), pauli_string)) pauli_map = pauli_string coned_qubits = pauli_map.keys() - + ( mode_labels, qubits_frontier, next_frontier, ) = self._init_mode_labels_from_qubits(range(self.circuit.nqubits)) - + gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands( self.gate_tensors, qubits_frontier, next_frontier ) - + operands = input_operands + gate_operands mode_labels += gate_mode_labels - + self.init_inverse_circuit(self.circuit.invert()) - - + next_frontier = max(qubits_frontier.values()) + 1 - #input_mode_labels, input_operands, qubits_frontier, next_frontier, inverse_gates = self._get_forward_inverse_metadata(coned_qubits) + # input_mode_labels, input_operands, qubits_frontier, next_frontier, inverse_gates = self._get_forward_inverse_metadata(coned_qubits) + + pauli_gates = self.get_pauli_gates( + pauli_map, dtype=self.dtype, backend=self.backend + ) - pauli_gates = self.get_pauli_gates(pauli_map, dtype=self.dtype, backend=self.backend) - - gates_inverse = pauli_gates + self.gate_tensors_inverse - - gate_mode_labels_inverse, gate_operands_inverse = self._parse_gates_to_mode_labels_operands( + + ( + gate_mode_labels_inverse, + gate_operands_inverse, + ) = self._parse_gates_to_mode_labels_operands( gates_inverse, qubits_frontier, next_frontier ) - mode_labels = mode_labels + gate_mode_labels_inverse + [[qubits_frontier[ix]] for ix in range(self.circuit.nqubits)] - operands = operands + gate_operands_inverse + operands[:self.circuit.nqubits] - + mode_labels = ( + mode_labels + + gate_mode_labels_inverse + + [[qubits_frontier[ix]] for ix in range(self.circuit.nqubits)] + ) + operands = operands + gate_operands_inverse + operands[: self.circuit.nqubits] + operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y] - - #expec = contract(*operand_exp_interleave) - #print(expec) - ''' + # expec = contract(*operand_exp_interleave) + # print(expec) + + """ gate_mode_labels, gate_operands = circ_utils.parse_gates_to_mode_labels_operands(gates, qubits_frontier, next_frontier) @@ -220,5 +221,5 @@ def expectation_operands(self, pauli_string): output_mode_labels = [] expression = circ_utils.convert_mode_labels_to_expression(mode_labels, output_mode_labels) - ''' - return operand_exp_interleave \ No newline at end of file + """ + return operand_exp_interleave diff --git a/src/qibotn/backends.py b/src/qibotn/backends.py index 4b28431d..3728a999 100644 --- a/src/qibotn/backends.py +++ b/src/qibotn/backends.py @@ -19,8 +19,6 @@ def __init__(self, platform): or platform == "cu_tensornet_expectation" or platform == "cu_tensornet_nccl" or platform == "cu_tensornet_nccl_expectation" - - ): # pragma: no cover self.platform = platform else: @@ -72,45 +70,44 @@ def execute_circuit( state = cutn.eval_mps(circuit, gate_algo, self.dtype) if self.platform == "qu_tensornet": - - #init_state = np.random.random(2**circuit.nqubits) + 1j * np.random.random(2**circuit.nqubits) - #init_state = init_state / np.sqrt((np.abs(init_state) ** 2).sum()) + # init_state = np.random.random(2**circuit.nqubits) + 1j * np.random.random(2**circuit.nqubits) + # init_state = init_state / np.sqrt((np.abs(init_state) ** 2).sum()) init_state = np.zeros(2**circuit.nqubits, dtype=self.dtype) init_state[0] = 1.0 state = quimb.eval(circuit.to_qasm(), init_state, backend="numpy") - + if self.platform == "cu_tensornet_mpi": if 
initial_state is not None: raise_error(NotImplementedError, "QiboTN cannot support initial state.") - #state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) - state, rank = cutn.eval_tn_MPI_2(circuit, self.dtype,32) + # state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) + state, rank = cutn.eval_tn_MPI_2(circuit, self.dtype, 32) if rank > 0: state = np.array(0) - + if self.platform == "cu_tensornet_nccl": if initial_state is not None: raise_error(NotImplementedError, "QiboTN cannot support initial state.") - #state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) - state, rank = cutn.eval_tn_nccl(circuit, self.dtype,32) + # state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) + state, rank = cutn.eval_tn_nccl(circuit, self.dtype, 32) if rank > 0: state = np.array(0) - + if self.platform == "cu_tensornet_expectation": if initial_state is not None: raise_error(NotImplementedError, "QiboTN cannot support initial state.") - + state = cutn.eval_expectation(circuit, self.dtype) - + if self.platform == "cu_tensornet_mpi_expectation": if initial_state is not None: raise_error(NotImplementedError, "QiboTN cannot support initial state.") - #state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) - #state, rank = cutn.eval_tn_MPI_expectation(circuit, self.dtype,32) - state, rank = cutn.eval_tn_MPI_2_expectation(circuit, self.dtype,32) - + # state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) + # state, rank = cutn.eval_tn_MPI_expectation(circuit, self.dtype,32) + state, rank = cutn.eval_tn_MPI_2_expectation(circuit, self.dtype, 32) + if rank > 0: state = np.array(0) @@ -118,10 +115,10 @@ def execute_circuit( if initial_state is not None: raise_error(NotImplementedError, "QiboTN cannot support initial state.") - #state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) - #state, rank = cutn.eval_tn_MPI_expectation(circuit, self.dtype,32) - state, rank = cutn.eval_tn_nccl_expectation(circuit, self.dtype,32) - + # state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) + # state, rank = cutn.eval_tn_MPI_expectation(circuit, self.dtype,32) + state, rank = cutn.eval_tn_nccl_expectation(circuit, self.dtype, 32) + if rank > 0: state = np.array(0) diff --git a/src/qibotn/cutn.py b/src/qibotn/cutn.py index 67d70c49..aca33ff1 100644 --- a/src/qibotn/cutn.py +++ b/src/qibotn/cutn.py @@ -13,9 +13,13 @@ def eval(qibo_circ, datatype): myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) return contract(*myconvertor.state_vector_operands()) + def eval_expectation(qibo_circ, datatype): myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - return contract(*myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))) + return contract( + *myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits)) + ) + def eval_tn_MPI_2(qibo_circ, datatype, n_samples=8): from mpi4py import MPI # this line initializes MPI @@ -23,73 +27,79 @@ def eval_tn_MPI_2(qibo_circ, datatype, n_samples=8): from cuquantum import Network # Get the hostname - #hostname = socket.gethostname() - + # hostname = socket.gethostname() + root = 0 comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) device_id = rank % getDeviceCount() - - + # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - #mem_avail = 
cp.cuda.Device().mem_info[0] - #print("Mem avail: aft convetor",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft convetor",mem_avail, "rank =",rank) operands = myconvertor.state_vector_operands() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) - + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + # Broadcast the operand data. - #operands = comm.bcast(operands, root) - + # operands = comm.bcast(operands, root) + # Assign the device for each process. device_id = rank % getDeviceCount() - - #dev = cp.cuda.Device(device_id) - #free_mem, total_mem = dev.mem_info - #print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank) + + # dev = cp.cuda.Device(device_id) + # free_mem, total_mem = dev.mem_info + # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank) # Create network object. - network = Network(*operands, options={'device_id' : device_id}) + network = Network(*operands, options={"device_id": device_id}) # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. - path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}}) - #print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") + path, info = network.contract_path( + optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}} + ) + # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") # Select the best path from all ranks. opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) - #if rank == root: + # if rank == root: # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") # Broadcast info from the sender to all other ranks. info = comm.bcast(info, sender) # Set path and slices. - path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices}) + path, info = network.contract_path( + optimize={"path": info.path, "slicing": info.slices} + ) # Calculate this process's share of the slices. num_slices = info.num_slices chunk, extra = num_slices // size, num_slices % size slice_begin = rank * chunk + min(rank, extra) - slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + slice_end = ( + num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + ) slices = range(slice_begin, slice_end) - #print(f"Process {rank} is processing slice range: {slices}.") + # print(f"Process {rank} is processing slice range: {slices}.") # Contract the group of slices the process is responsible for. result = network.contract(slices=slices) - #print(f"Process {rank} result shape is : {result.shape}.") - #print(f"Process {rank} result size is : {result.nbytes}.") + # print(f"Process {rank} result shape is : {result.shape}.") + # print(f"Process {rank} result size is : {result.nbytes}.") # Sum the partial contribution from each process on root. 
result = comm.reduce(sendobj=result, op=MPI.SUM, root=root) - + return result, rank + def eval_tn_nccl(qibo_circ, datatype, n_samples=8): from mpi4py import MPI # this line initializes MPI import socket @@ -97,18 +107,18 @@ def eval_tn_nccl(qibo_circ, datatype, n_samples=8): from cupy.cuda import nccl # Get the hostname - #hostname = socket.gethostname() - + # hostname = socket.gethostname() + root = 0 comm_mpi = MPI.COMM_WORLD rank = comm_mpi.Get_rank() size = comm_mpi.Get_size() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) device_id = rank % getDeviceCount() - + cp.cuda.Device(device_id).use() - + # Set up the NCCL communicator. nccl_id = nccl.get_unique_id() if rank == root else None nccl_id = comm_mpi.bcast(nccl_id, root) @@ -116,51 +126,66 @@ def eval_tn_nccl(qibo_circ, datatype, n_samples=8): # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft convetor",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft convetor",mem_avail, "rank =",rank) operands = myconvertor.state_vector_operands() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) network = Network(*operands) # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. - path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}}) + path, info = network.contract_path( + optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}} + ) - #print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") + # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") # Select the best path from all ranks. opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) - #if rank == root: + # if rank == root: # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") # Broadcast info from the sender to all other ranks. info = comm_mpi.bcast(info, sender) # Set path and slices. - path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices}) + path, info = network.contract_path( + optimize={"path": info.path, "slicing": info.slices} + ) # Calculate this process's share of the slices. num_slices = info.num_slices chunk, extra = num_slices // size, num_slices % size slice_begin = rank * chunk + min(rank, extra) - slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + slice_end = ( + num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + ) slices = range(slice_begin, slice_end) - #print(f"Process {rank} is processing slice range: {slices}.") + # print(f"Process {rank} is processing slice range: {slices}.") # Contract the group of slices the process is responsible for. 
result = network.contract(slices=slices) - #print(f"Process {rank} result shape is : {result.shape}.") - #print(f"Process {rank} result size is : {result.nbytes}.") + # print(f"Process {rank} result shape is : {result.shape}.") + # print(f"Process {rank} result size is : {result.nbytes}.") # Sum the partial contribution from each process on root. stream_ptr = cp.cuda.get_current_stream().ptr - comm_nccl.reduce(result.data.ptr, result.data.ptr, result.size, nccl.NCCL_FLOAT64, nccl.NCCL_SUM, root, stream_ptr) - + comm_nccl.reduce( + result.data.ptr, + result.data.ptr, + result.size, + nccl.NCCL_FLOAT64, + nccl.NCCL_SUM, + root, + stream_ptr, + ) + return result, rank + def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8): from mpi4py import MPI # this line initializes MPI import socket @@ -168,18 +193,18 @@ def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8): from cupy.cuda import nccl # Get the hostname - #hostname = socket.gethostname() - + # hostname = socket.gethostname() + root = 0 comm_mpi = MPI.COMM_WORLD rank = comm_mpi.Get_rank() size = comm_mpi.Get_size() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) device_id = rank % getDeviceCount() - + cp.cuda.Device(device_id).use() - + # Set up the NCCL communicator. nccl_id = nccl.get_unique_id() if rank == root else None nccl_id = comm_mpi.bcast(nccl_id, root) @@ -187,50 +212,64 @@ def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8): # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft convetor",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft convetor",mem_avail, "rank =",rank) operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits)) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) network = Network(*operands) # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. - path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}}) + path, info = network.contract_path( + optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}} + ) - #print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") + # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") # Select the best path from all ranks. opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) - #if rank == root: + # if rank == root: # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") # Broadcast info from the sender to all other ranks. info = comm_mpi.bcast(info, sender) # Set path and slices. - path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices}) + path, info = network.contract_path( + optimize={"path": info.path, "slicing": info.slices} + ) # Calculate this process's share of the slices. 
num_slices = info.num_slices chunk, extra = num_slices // size, num_slices % size slice_begin = rank * chunk + min(rank, extra) - slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + slice_end = ( + num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + ) slices = range(slice_begin, slice_end) - #print(f"Process {rank} is processing slice range: {slices}.") + # print(f"Process {rank} is processing slice range: {slices}.") # Contract the group of slices the process is responsible for. result = network.contract(slices=slices) - #print(f"Process {rank} result shape is : {result.shape}.") - #print(f"Process {rank} result size is : {result.nbytes}.") + # print(f"Process {rank} result shape is : {result.shape}.") + # print(f"Process {rank} result size is : {result.nbytes}.") # Sum the partial contribution from each process on root. stream_ptr = cp.cuda.get_current_stream().ptr - comm_nccl.reduce(result.data.ptr, result.data.ptr, result.size, nccl.NCCL_FLOAT64, nccl.NCCL_SUM, root, stream_ptr) - + comm_nccl.reduce( + result.data.ptr, + result.data.ptr, + result.size, + nccl.NCCL_FLOAT64, + nccl.NCCL_SUM, + root, + stream_ptr, + ) + return result, rank @@ -240,128 +279,144 @@ def eval_tn_MPI_2_expectation(qibo_circ, datatype, n_samples=8): from cuquantum import Network # Get the hostname - #hostname = socket.gethostname() - + # hostname = socket.gethostname() + root = 0 comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) device_id = rank % getDeviceCount() - - + # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft convetor",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft convetor",mem_avail, "rank =",rank) operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits)) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) - + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + # Broadcast the operand data. - #operands = comm.bcast(operands, root) - + # operands = comm.bcast(operands, root) + # Assign the device for each process. device_id = rank % getDeviceCount() - - #dev = cp.cuda.Device(device_id) - #free_mem, total_mem = dev.mem_info - #print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank) + + # dev = cp.cuda.Device(device_id) + # free_mem, total_mem = dev.mem_info + # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank) # Create network object. - network = Network(*operands, options={'device_id' : device_id}) + network = Network(*operands, options={"device_id": device_id}) # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. 
- path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}}) - #print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") + path, info = network.contract_path( + optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}} + ) + # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") # Select the best path from all ranks. opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) - #if rank == root: + # if rank == root: # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") # Broadcast info from the sender to all other ranks. info = comm.bcast(info, sender) # Set path and slices. - path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices}) + path, info = network.contract_path( + optimize={"path": info.path, "slicing": info.slices} + ) # Calculate this process's share of the slices. num_slices = info.num_slices chunk, extra = num_slices // size, num_slices % size slice_begin = rank * chunk + min(rank, extra) - slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + slice_end = ( + num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + ) slices = range(slice_begin, slice_end) - #print(f"Process {rank} is processing slice range: {slices}.") + # print(f"Process {rank} is processing slice range: {slices}.") # Contract the group of slices the process is responsible for. result = network.contract(slices=slices) - #print(f"Process {rank} result shape is : {result.shape}.") - #print(f"Process {rank} result size is : {result.nbytes}.") + # print(f"Process {rank} result shape is : {result.shape}.") + # print(f"Process {rank} result size is : {result.nbytes}.") # Sum the partial contribution from each process on root. 
result = comm.reduce(sendobj=result, op=MPI.SUM, root=root) - + return result, rank def eval_tn_MPI_expectation(qibo_circ, datatype, n_samples=8): from mpi4py import MPI # this line initializes MPI import socket + # Get the hostname - #hostname = socket.gethostname() - + # hostname = socket.gethostname() + ncpu_threads = multiprocessing.cpu_count() // 2 - + comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) device_id = rank % getDeviceCount() cp.cuda.Device(device_id).use() handle = cutn.create() network_opts = cutn.NetworkOptions(handle=handle, blocking="auto") - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft network opts",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft network opts",mem_avail, "rank =",rank) cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm)) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank) # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - operands_interleave = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits)) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft convetor",mem_avail, "rank =",rank) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + operands_interleave = myconvertor.expectation_operands( + PauliStringGen(qibo_circ.nqubits) + ) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft convetor",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) # Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object. 
network = cutn.Network(*operands_interleave, options=network_opts) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank) - path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}}) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft contract path",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank) + path, opt_info = network.contract_path( + optimize={ + "samples": n_samples, + "threads": ncpu_threads, + "slicing": {"min_slices": max(16, size)}, + } + ) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft contract path",mem_avail, "rank =",rank) # Execution: To execute the contraction using the optimal path found previously - #print("opt_cost",opt_info.opt_cost, "Process =",rank) - - - num_slices = opt_info.num_slices#Andy - chunk, extra = num_slices // size, num_slices % size#Andy - slice_begin = rank * chunk + min(rank, extra)#Andy - slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy - slices = range(slice_begin, slice_end)#Andy + # print("opt_cost",opt_info.opt_cost, "Process =",rank) + + num_slices = opt_info.num_slices # Andy + chunk, extra = num_slices // size, num_slices % size # Andy + slice_begin = rank * chunk + min(rank, extra) # Andy + slice_end = ( + num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + ) # Andy + slices = range(slice_begin, slice_end) # Andy result = network.contract(slices=slices) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft contract",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft contract",mem_avail, "rank =",rank) cutn.destroy(handle) return result, rank + def eval_tn_MPI(qibo_circ, datatype, n_samples=8): """Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through MPI. The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution. 
@@ -371,45 +426,52 @@ def eval_tn_MPI(qibo_circ, datatype, n_samples=8): from mpi4py import MPI # this line initializes MPI import socket + # Get the hostname - #hostname = socket.gethostname() - + # hostname = socket.gethostname() + ncpu_threads = multiprocessing.cpu_count() // 2 - + comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) device_id = rank % getDeviceCount() cp.cuda.Device(device_id).use() handle = cutn.create() network_opts = cutn.NetworkOptions(handle=handle, blocking="auto") - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft network opts",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft network opts",mem_avail, "rank =",rank) cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm)) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank) # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft convetor",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft convetor",mem_avail, "rank =",rank) operands_interleave = myconvertor.state_vector_operands() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) # Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object. 
network = cutn.Network(*operands_interleave, options=network_opts) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank) - network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}}) - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft contract path",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank) + network.contract_path( + optimize={ + "samples": n_samples, + "threads": ncpu_threads, + "slicing": {"min_slices": max(16, size)}, + } + ) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft contract path",mem_avail, "rank =",rank) # Execution: To execute the contraction using the optimal path found previously - #print("opt_cost",opt_info.opt_cost, "Process =",rank) + # print("opt_cost",opt_info.opt_cost, "Process =",rank) - ''' + """ path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}}) num_slices = opt_info.num_slices#Andy @@ -418,16 +480,16 @@ def eval_tn_MPI(qibo_circ, datatype, n_samples=8): slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy slices = range(slice_begin, slice_end)#Andy result = network.contract(slices=slices) - ''' + """ result = network.contract() - #mem_avail = cp.cuda.Device().mem_info[0] - #print("Mem avail: aft contract",mem_avail, "rank =",rank) + # mem_avail = cp.cuda.Device().mem_info[0] + # print("Mem avail: aft contract",mem_avail, "rank =",rank) cutn.destroy(handle) return result, rank - + def eval_mps(qibo_circ, gate_algo, datatype): myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype) mps_helper = MPSContractionHelper(myconvertor.num_qubits) @@ -436,18 +498,18 @@ def eval_mps(qibo_circ, gate_algo, datatype): myconvertor.mps_tensors, {"handle": myconvertor.handle} ) + def PauliStringGen(nqubits): - if nqubits <= 0: return "Invalid input. N should be a positive integer." - #characters = 'IXYZ' - characters = 'XXXZ' + # characters = 'IXYZ' + characters = "XXXZ" - result = '' + result = "" for i in range(nqubits): char_to_add = characters[i % len(characters)] result += char_to_add - return result \ No newline at end of file + return result
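
The expectation_operands() method in QiboCircuitConvertor.py interleaves the |0...0> input tensors, the forward gates, one Pauli operator per qubit, the inverted circuit, and the input tensors again, so contracting the resulting list evaluates <0|U† P U|0>. A minimal NumPy check of that identity for a single qubit, assuming U = H and P = Z (values chosen only for illustration, not taken from the patch):

import numpy as np

zero = np.array([1.0, 0.0], dtype=complex)           # |0>
H = np.array([[1, 1], [1, -1]], dtype=complex) / np.sqrt(2)
Z = np.diag([1.0, -1.0]).astype(complex)              # Pauli Z

psi = H @ zero                                        # forward circuit applied to |0>
expval = np.vdot(psi, Z @ psi)                        # <0| H Z H |0>
print(expval.real)                                    # 0.0 for this choice of U and P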
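
Every MPI/NCCL evaluator in cutn.py repeats the same path-selection handshake: each rank runs the hyperoptimizer independently, the cheapest path is located with an MPI.MINLOC reduction, and the winning rank broadcasts its path and slicing so all ranks contract the same sliced network. A sketch of that pattern as a standalone helper, assuming a cuquantum Network already built from the operands; find_common_path is a hypothetical name, not part of the patch:

from mpi4py import MPI


def find_common_path(network, comm, min_slices=32):
    # Hypothetical helper mirroring the pattern in eval_tn_MPI_2 and the NCCL
    # variants; `network` is a cuquantum.Network built from the interleaved operands.
    rank, size = comm.Get_rank(), comm.Get_size()

    # Each rank searches independently; slicing is forced so the contraction
    # can later be split across ranks.
    path, info = network.contract_path(
        optimize={"samples": 8, "slicing": {"min_slices": max(min_slices, size)}}
    )

    # MINLOC selects the (FLOP count, rank) pair with the lowest cost ...
    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)

    # ... and the winner broadcasts its result so every rank uses the same
    # path and the same slice decomposition.
    info = comm.bcast(info, sender)
    return network.contract_path(optimize={"path": info.path, "slicing": info.slices})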
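
After the common path is fixed, each rank contracts only its share of the slices. The partitioning arithmetic that recurs in the distributed evaluators can be isolated as below; the values in the usage example are illustrative only:

def slice_range(rank, size, num_slices):
    # The first `extra` ranks take one additional slice each; for the last
    # rank the explicit `num_slices` endpoint is equivalent to the general formula.
    chunk, extra = num_slices // size, num_slices % size
    slice_begin = rank * chunk + min(rank, extra)
    slice_end = (
        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
    )
    return range(slice_begin, slice_end)


if __name__ == "__main__":
    num_slices, size = 34, 4
    # Every slice is assigned to exactly one rank.
    assert sum(len(slice_range(r, size, num_slices)) for r in range(size)) == num_slices
    for r in range(size):
        print(r, list(slice_range(r, size, num_slices)))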
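
PauliStringGen() builds the observable for the expectation evaluators by cycling through the fixed pattern "XXXZ" (the "IXYZ" variant is left commented out), one character per qubit. A compact equivalent with an illustrative call; raising on invalid input is shown as an alternative to the error string returned in the source:

def pauli_string(nqubits, characters="XXXZ"):
    if nqubits <= 0:
        raise ValueError("nqubits must be a positive integer")
    # Repeat the pattern until every qubit has a Pauli label.
    return "".join(characters[i % len(characters)] for i in range(nqubits))


print(pauli_string(6))  # -> "XXXZXX"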