Merge pull request #56 from MPI-Dortmund/fix-u-limit-problem
Fix 'too many open files' problem
thorstenwagner authored Oct 17, 2023
2 parents 9e361f6 + 44cc4ef; commit 8bdaf84
Showing 2 changed files with 4 additions and 10 deletions.
tomotwin/embed_main.py (7 changes: 0 additions & 7 deletions)
@@ -378,8 +378,6 @@
 import hashlib
 import os
 import random
-import resource
-import sys
 from typing import List
 
 import numpy as np
@@ -600,11 +598,6 @@ def run_distr(config, world_size: int):
     Starts a distributed run using DistributedDataParallel
     """
     mp.set_sharing_strategy('file_system')
-    limit = resource.getrlimit(resource.RLIMIT_NOFILE)
-    if limit[0] < 65000:
-        print(
-            f"Your user limit ('ulimit -n') is too low ({limit[0]}). Please run 'ulimit -n 65000' before running tomotwin_embed.")
-        sys.exit(1)
     print(f"Found {world_size} GPU(s). Start DDP + Compiling.")
     os.environ['MASTER_ADDR'] = '127.0.0.1'
     os.environ['MASTER_PORT'] = '29' + str(random.randint(1, 500)).zfill(3)
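The deleted guard only detected a low descriptor limit and aborted, telling the user to run 'ulimit -n 65000' by hand; it never changed the limit itself. For context only: a process can also raise its own soft limit (up to the hard limit) with the same standard-library resource module. A minimal sketch of that alternative, not part of this commit:

    import resource

    # Raise the soft limit on open file descriptors toward the hard limit.
    # Only privileged processes may raise the hard limit itself.
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    target = 65000 if hard == resource.RLIM_INFINITY else min(65000, hard)
    if soft < target:
        resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))

With the deepcopy change in embedor.py below, neither the check nor such a workaround is presumably needed any longer.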
tomotwin/modules/inference/embedor.py (7 changes: 4 additions & 3 deletions)
@@ -374,7 +374,7 @@
 This Source Code Form is "Incompatible With Secondary Licenses", as
 defined by the Mozilla Public License, v. 2.0.
 """
-
+import copy
 from abc import ABC, abstractmethod
 
 import numpy as np
@@ -586,8 +586,9 @@ def embed(self, volume_data: VolumeDataset) -> np.array:
             with torch.autocast(device_type='cuda', dtype=torch.float16):
                 subvolume = self.model.forward(subvolume).type(torch.HalfTensor)
                 subvolume = subvolume.data.cpu()
-            items_indicis.append(item_index.data.cpu())
-            embeddings.append(subvolume.data.cpu())
+            items_indicis.append(copy.deepcopy(item_index.data.cpu()))
+            embeddings.append(copy.deepcopy(subvolume.data.cpu()))
+            del subvolume
 
         ## Sync items
         items_indicis = torch.cat(items_indicis)  # .to(self.rank) # necessary because of nccl
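run_distr already selects mp.set_sharing_strategy('file_system') for inter-process tensor sharing, and the embedding loop previously kept references to every batch tensor it received from the dataloader workers, which appears to be what exhausted the open-file limit over a long run. copy.deepcopy stores each result in fresh, private memory, and del subvolume drops the last reference to the shared buffer, so its backing file can be released after every batch instead of accumulating. A standalone sketch of the same pattern; the model and loader here are hypothetical stand-ins, not TomoTwin code:

    import copy

    import torch
    import torch.multiprocessing as mp
    from torch.utils.data import DataLoader, TensorDataset

    mp.set_sharing_strategy('file_system')

    # Hypothetical stand-ins for TomoTwin's embedding network and volume batches.
    model = torch.nn.Linear(8, 4)
    loader = DataLoader(TensorDataset(torch.randn(64, 8), torch.arange(64)),
                        batch_size=8, num_workers=2)

    embeddings, items_indicis = [], []
    for subvolume, item_index in loader:
        with torch.no_grad():
            out = model(subvolume).cpu()
        # deepcopy allocates fresh private storage, so the stored tensors no
        # longer reference worker-shared memory; del releases the originals
        # batch by batch rather than holding them until the loop finishes.
        embeddings.append(copy.deepcopy(out))
        items_indicis.append(copy.deepcopy(item_index.cpu()))
        del out, subvolume
    embeddings = torch.cat(embeddings)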
