Merge pull request #56 from MPI-Dortmund/fix-u-limit-problem
Fix 'too many open files' problem
thorstenwagner authored Oct 17, 2023
2 parents 9e361f6 + 44cc4ef; commit 8bdaf84
Showing 2 changed files with 4 additions and 10 deletions.
tomotwin/embed_main.py (7 changes: 0 additions & 7 deletions)
@@ -378,8 +378,6 @@
 import hashlib
 import os
 import random
-import resource
-import sys
 from typing import List
 
 import numpy as np
@@ -600,11 +598,6 @@ def run_distr(config, world_size: int):
     Starts a distributed run using DistributedDataParallel
     """
     mp.set_sharing_strategy('file_system')
-    limit = resource.getrlimit(resource.RLIMIT_NOFILE)
-    if limit[0] < 65000:
-        print(
-            f"Your user limit ('ulimit -n') is too low ({limit[0]}). Please run 'ulimit -n 65000' before running tomotwin_embed.")
-        sys.exit(1)
     print(f"Found {world_size} GPU(s). Start DDP + Compiling.")
     os.environ['MASTER_ADDR'] = '127.0.0.1'
     os.environ['MASTER_PORT'] = '29' + str(random.randint(1, 500)).zfill(3)
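The deleted guard only detected a low descriptor limit and aborted, telling the user to run 'ulimit -n 65000' by hand; it never changed the limit itself. For context only: a process can also raise its own soft limit (up to the hard limit) with the same standard-library resource module. A minimal sketch of that alternative, not part of this commit:

    import resource

    # Raise the soft limit on open file descriptors toward the hard limit.
    # Only privileged processes may raise the hard limit itself.
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    target = 65000 if hard == resource.RLIM_INFINITY else min(65000, hard)
    if soft < target:
        resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))

With the deepcopy change in embedor.py below, neither the check nor such a workaround is presumably needed any longer.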
tomotwin/modules/inference/embedor.py (7 changes: 4 additions & 3 deletions)
@@ -374,7 +374,7 @@
 This Source Code Form is "Incompatible With Secondary Licenses", as
 defined by the Mozilla Public License, v. 2.0.
 """
-
+import copy
 from abc import ABC, abstractmethod
 
 import numpy as np
@@ -586,8 +586,9 @@ def embed(self, volume_data: VolumeDataset) -> np.array:
             with torch.autocast(device_type='cuda', dtype=torch.float16):
                 subvolume = self.model.forward(subvolume).type(torch.HalfTensor)
                 subvolume = subvolume.data.cpu()
-            items_indicis.append(item_index.data.cpu())
-            embeddings.append(subvolume.data.cpu())
+            items_indicis.append(copy.deepcopy(item_index.data.cpu()))
+            embeddings.append(copy.deepcopy(subvolume.data.cpu()))
+            del subvolume
 
         ## Sync items
         items_indicis = torch.cat(items_indicis)  # .to(self.rank) # necessary because of nccl
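run_distr already selects mp.set_sharing_strategy('file_system') for inter-process tensor sharing, and the embedding loop previously kept references to every batch tensor it received from the dataloader workers, which appears to be what exhausted the open-file limit over a long run. copy.deepcopy stores each result in fresh, private memory, and del subvolume drops the last reference to the shared buffer, so its backing file can be released after every batch instead of accumulating. A standalone sketch of the same pattern; the model and loader here are hypothetical stand-ins, not TomoTwin code:

    import copy

    import torch
    import torch.multiprocessing as mp
    from torch.utils.data import DataLoader, TensorDataset

    mp.set_sharing_strategy('file_system')

    # Hypothetical stand-ins for TomoTwin's embedding network and volume batches.
    model = torch.nn.Linear(8, 4)
    loader = DataLoader(TensorDataset(torch.randn(64, 8), torch.arange(64)),
                        batch_size=8, num_workers=2)

    embeddings, items_indicis = [], []
    for subvolume, item_index in loader:
        with torch.no_grad():
            out = model(subvolume).cpu()
        # deepcopy allocates fresh private storage, so the stored tensors no
        # longer reference worker-shared memory; del releases the originals
        # batch by batch rather than holding them until the loop finishes.
        embeddings.append(copy.deepcopy(out))
        items_indicis.append(copy.deepcopy(item_index.cpu()))
        del out, subvolume
    embeddings = torch.cat(embeddings)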
