Add XLA support to moco benchmark. (#2292)
Summary:
This PR tweaks the `moco` benchmark so that it also runs on XLA devices. Previously, `moco` hardcoded the CUDA device in two ways:

- Initializing the `ProcessGroup` with the `nccl` backend only
- Explicitly moving intermediate tensors to `cuda`

In order to add XLA support, this PR (see the sketch after this list):

- Also checks for `xla*` devices and, if detected, initializes the `ProcessGroup` with the `xla` backend
- Moves intermediate tensors to the appropriate device
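
A minimal sketch of the resulting device dispatch (the helper name `init_process_group_for` is illustrative, not part of the PR; it mirrors the diff further down):

```python
import torch.distributed as dist


def init_process_group_for(device: str) -> None:
    """Initialize a single-process ProcessGroup matching the benchmark device."""
    if dist.is_initialized():
        return
    if device.startswith("cuda"):
        dist.init_process_group(
            backend="nccl",
            init_method="tcp://localhost:10001",
            world_size=1,
            rank=0,
        )
    elif device.startswith("xla"):
        # Importing this module registers the "xla" backend with torch.distributed.
        import torch_xla.distributed.xla_backend  # noqa: F401

        dist.init_process_group(backend="xla", init_method="xla://")
    else:
        raise NotImplementedError(f"{device} not supported")
```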

cc lezcano

Pull Request resolved: #2292

Reviewed By: aaronenyeshi

Differential Revision: D58787062

Pulled By: xuzhao9

fbshipit-source-id: 545d2c71296cc3e80958a51b3e48b335a1a72b59
ysiraichi authored and facebook-github-bot committed Jun 19, 2024
1 parent 5f6633d commit 612b3c8
Showing 2 changed files with 24 additions and 13 deletions.
torchbenchmark/models/moco/__init__.py (22 additions, 11 deletions):

```diff
@@ -56,18 +56,29 @@ def __init__(self, test, device, batch_size=None, extra_args=[]):
                 "distributed": True,
             }
         )
-        try:
-            dist.init_process_group(
-                backend="nccl",
-                init_method="tcp://localhost:10001",
-                world_size=1,
-                rank=0,
-            )
-        except RuntimeError:
-            pass  # already initialized?
 
+        if device == "cpu":
+            raise NotImplementedError("DistributedDataParallel/allgather requires cuda")
+        elif device == "cuda":
+            try:
+                dist.init_process_group(
+                    backend="nccl",
+                    init_method="tcp://localhost:10001",
+                    world_size=1,
+                    rank=0,
+                )
+            except RuntimeError:
+                pass  # already initialized?
+        elif device == "xla":
+            import torch_xla.distributed.xla_backend
+
+            try:
+                dist.init_process_group(backend="xla", init_method="xla://")
+            except RuntimeError:
+                pass  # already initialized?
+        else:
+            raise NotImplementedError(f"{device} not supported")
 
 
         self.model = MoCo(
             models.__dict__[self.opt.arch],
```
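
For context (not part of the diff): with `torch_xla` installed, the benchmark device is an XLA device whose string form starts with `xla`, which is what the `xla*` wording in the summary refers to. A quick illustration, assuming `torch_xla` is available:

```python
import torch_xla.core.xla_model as xm

dev = xm.xla_device()               # a torch.device such as device(type='xla', index=0)
print(str(dev))                     # "xla:0"
print(str(dev).startswith("xla"))   # True
```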
```diff
@@ -102,8 +113,8 @@ def collate_train_fn(data):
             range(2), collate_fn=collate_train_fn
         )
         for i, (images, _) in enumerate(self.example_inputs):
-            images[0] = images[0].cuda(device=0, non_blocking=True)
-            images[1] = images[1].cuda(device=0, non_blocking=True)
+            images[0] = images[0].to(device, non_blocking=True)
+            images[1] = images[1].to(device, non_blocking=True)
 
     def get_module(self):
         """Recommended
```
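
The input-staging change above replaces hardcoded `.cuda(device=0, ...)` calls with `.to(device, ...)`, which accepts any device string. A minimal sketch, with a made-up helper name (`move_batch`) and made-up shapes:

```python
import torch


def move_batch(images, device):
    # .to() works the same way for "cpu", "cuda", and "xla" devices
    # (XLA requires torch_xla); non_blocking mainly helps pinned-host-to-CUDA copies.
    return [img.to(device, non_blocking=True) for img in images]


batch = [torch.randn(2, 3, 224, 224), torch.randn(2, 3, 224, 224)]
batch = move_batch(batch, "cpu")
```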
torchbenchmark/models/moco/moco/builder.py (2 additions, 2 deletions):

```diff
@@ -79,7 +79,7 @@ def _batch_shuffle_ddp(self, x):
         num_gpus = batch_size_all // batch_size_this
 
         # random shuffle index
-        idx_shuffle = torch.randperm(batch_size_all).cuda()
+        idx_shuffle = torch.randperm(batch_size_all, device=x_gather.device)
 
         # broadcast to all gpus
         torch.distributed.broadcast(idx_shuffle, src=0)
@@ -152,7 +152,7 @@ def forward(self, im_q, im_k):
         logits /= self.T
 
         # labels: positive key indicators
-        labels = torch.zeros(logits.shape[0], dtype=torch.long).cuda()
+        labels = torch.zeros(logits.shape[0], dtype=torch.long, device=logits.device)
 
         # dequeue and enqueue
         self._dequeue_and_enqueue(k)
```
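
The `builder.py` changes follow the usual pattern of creating new tensors via the `device=` argument of factory functions rather than a trailing `.cuda()`, so they land on whatever device the surrounding tensors already live on. A small illustration (the shapes are made up):

```python
import torch

x_gather = torch.randn(8, 128)  # stand-in for the gathered keys; any device works
idx_shuffle = torch.randperm(x_gather.shape[0], device=x_gather.device)
labels = torch.zeros(x_gather.shape[0], dtype=torch.long, device=x_gather.device)
```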
