Skip to content

Commit

Permalink
Update NCCL
Browse files: browse the repository at this point in the history
  • Loading branch information
Tankya2 committed Jul 29, 2024
1 parent bbacc26 commit f358d0e
Showing 1 changed file with 96 additions and 6 deletions.
102 changes: 96 additions & 6 deletions src/qibotn/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
from cupy.cuda import nccl
from cuquantum import Network
from mpi4py import MPI
import cuquantum.cutensornet as cutn

root = 0
comm_mpi = MPI.COMM_WORLD
Expand All @@ -238,23 +239,30 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
device_id = rank % getDeviceCount()

cp.cuda.Device(device_id).use()
mempool = cp.get_default_memory_pool()

# Set up the NCCL communicator.
nccl_id = nccl.get_unique_id() if rank == root else None
nccl_id = comm_mpi.bcast(nccl_id, root)
comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank)

# Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
operands = myconvertor.expectation_operands(
pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
)
if rank==0:

myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
operands = myconvertor.expectation_operands(
pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
)
else:
operands = None

operands = comm_mpi.bcast(operands, root)

network = Network(*operands)

# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path(
optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size),"memory_model":cutn.MemoryModel.CUTENSOR}}
)

# Select the best path from all ranks.
Expand Down Expand Up @@ -291,7 +299,10 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
root,
stream_ptr,
)


del network
mempool.free_all_blocks()

return result, rank


Expand Down Expand Up @@ -426,3 +437,82 @@ def pauli_string_gen(nqubits, pauli_string_pattern):
char_to_add = pauli_string_pattern[i % len(pauli_string_pattern)]
result += char_to_add
return result

def expectation_pauli_tn_MPI_pathfinding(
    qibo_circ, datatype, pauli_string_pattern, n_samples=8
):
    """Benchmark contraction pathfinding for a Pauli expectation network over MPI.

    Rank ``root`` converts the qibo circuit into tensor network operands
    with QiboCircuitToEinsum() for the Pauli string generated from
    pauli_string_pattern, and broadcasts them to every rank. Each rank
    then runs the pathfinder independently on its own device and times
    it. No contraction is executed: only the pathfinding statistics of
    every rank are collected on the root rank.

    Parameters:
        qibo_circ: The quantum circuit object.
        datatype (str): Either single ("complex64") or double (complex128) precision.
        pauli_string_pattern(str): pauli string pattern.
        n_samples(int): Number of samples for pathfinding.

    Returns:
        tuple: ``(gathered_data, rank)``. On the root rank ``gathered_data``
        is a float array of shape ``(size, 4)`` with one row per rank holding
        ``[num_slices, opt_cost, largest_intermediate, elapsed_seconds]``;
        on every other rank it is ``None``. ``rank`` is the MPI rank of the
        calling process.
    """
    import time

    import cuquantum.cutensornet as cutn
    import numpy as np
    from cuquantum import Network
    from mpi4py import MPI  # this line initializes MPI

    root = 0
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    # Assign the device for each process.
    device_id = rank % getDeviceCount()
    cp.cuda.Device(device_id).use()
    mempool = cp.get_default_memory_pool()

    # Perform circuit conversion on the root rank only and share the result.
    if rank == root:
        converter = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
        operands = converter.expectation_operands(
            pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
        )
    else:
        operands = None
    operands = comm.bcast(operands, root)

    # Force slicing (at least 32 slices, or one per rank if there are more
    # ranks) so that the resulting path is usable for parallel contraction.
    optimize = {
        "samples": n_samples,
        "slicing": {
            "min_slices": max(32, size),
            "memory_model": cutn.MemoryModel.CUTENSOR,
        },
    }

    # The context manager releases the network resources even if the
    # pathfinder raises, instead of relying on `del` and garbage collection.
    with Network(*operands, options={"device_id": device_id}) as network:
        # perf_counter is monotonic, so the interval cannot be distorted by
        # system clock adjustments.
        start = time.perf_counter()
        _, info = network.contract_path(optimize=optimize)
        elapsed = time.perf_counter() - start

    # Explicit float64 so the send buffer always matches the receive buffer.
    local_data = np.array(
        [info.num_slices, info.opt_cost, info.largest_intermediate, elapsed],
        dtype=np.float64,
    )

    # Only the root rank needs a receive buffer: one row per rank.
    if rank == root:
        gathered_data = np.zeros((size, local_data.size), dtype=np.float64)
    else:
        gathered_data = None

    # Gather data from all ranks to the root rank.
    comm.Gather(local_data, gathered_data, root=root)

    mempool.free_all_blocks()

    return gathered_data, rank

0 comments on commit f358d0e

Please sign in to comment.