From 2979e1c52521cce7f393d89359e30b0a127fb411 Mon Sep 17 00:00:00 2001 From: Andre Sailer Date: Fri, 23 Apr 2021 12:03:11 +0200 Subject: [PATCH 1/6] HTCondorCE: Limit cleanup to a single run per minute per SiteDirector --- .../Computing/HTCondorCEComputingElement.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Resources/Computing/HTCondorCEComputingElement.py b/Resources/Computing/HTCondorCEComputingElement.py index e511f39392d..dd04c86f7f1 100644 --- a/Resources/Computing/HTCondorCEComputingElement.py +++ b/Resources/Computing/HTCondorCEComputingElement.py @@ -45,7 +45,9 @@ import os import tempfile import commands +import datetime import errno +import threading from DIRAC import S_OK, S_ERROR, gConfig from DIRAC.Resources.Computing.ComputingElement import ComputingElement @@ -160,6 +162,10 @@ class HTCondorCEComputingElement(ComputingElement): implementing the functions jobSubmit, getJobOutput """ + # static variables to ensure single cleanup every minute + _lastCleanupTime = datetime.datetime.utcnow() + _cleanupLock = threading.Lock() + ############################################################################# def __init__(self, ceUniqueID): """ Standard constructor. @@ -516,6 +522,16 @@ def __cleanup(self): # FIXME: again some issue with the working directory... # workingDirectory = self.ceParameters.get( 'WorkingDirectory', DEFAULT_WORKINGDIRECTORY ) + if not self._cleanupLock.acquire(False): + return + + now = datetime.datetime.utcnow() + if (self._lastCleanupTime - now).total_seconds < 60: + self._cleanupLock.release() + return + + self._lastCleanupTime = now + self.log.debug("Cleaning working directory: %s" % self.workingDirectory) # remove all files older than 120 minutes starting with DIRAC_ Condor will @@ -534,3 +550,4 @@ def __cleanup(self): findPars) if status: self.log.error("Failure during HTCondorCE __cleanup", stdout) + self._cleanupLock.release() From 2e24b8ea1d4aba81c97495f53d5cf2a5950dbe3b Mon Sep 17 00:00:00 2001 From: Andre Sailer Date: Fri, 23 Apr 2021 12:03:19 +0200 Subject: [PATCH 2/6] HTCondorCE: optimize find for DIRAC_ executables --- Resources/Computing/HTCondorCEComputingElement.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Resources/Computing/HTCondorCEComputingElement.py b/Resources/Computing/HTCondorCEComputingElement.py index dd04c86f7f1..26646caf3aa 100644 --- a/Resources/Computing/HTCondorCEComputingElement.py +++ b/Resources/Computing/HTCondorCEComputingElement.py @@ -537,7 +537,8 @@ def __cleanup(self): # remove all files older than 120 minutes starting with DIRAC_ Condor will # push files on submission, but it takes at least a few seconds until this # happens so we can't directly unlink after condor_submit - status, stdout = commands.getstatusoutput('find %s -mmin +120 -name "DIRAC_*" -delete ' % self.workingDirectory) + status, stdout = commands.getstatusoutput('find -O3 %s -maxdepth 1 -mmin +120 -name "DIRAC_*" -delete ' % + self.workingDirectory) if status: self.log.error("Failure during HTCondorCE __cleanup", stdout) From 55faac003c6d8b418e6a30bd96b280d38220f12b Mon Sep 17 00:00:00 2001 From: Andre Sailer Date: Fri, 23 Apr 2021 12:03:25 +0200 Subject: [PATCH 3/6] HTCondorCE: Fix cleanup timing check --- Resources/Computing/HTCondorCEComputingElement.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Resources/Computing/HTCondorCEComputingElement.py b/Resources/Computing/HTCondorCEComputingElement.py index 26646caf3aa..456ce743340 100644 --- a/Resources/Computing/HTCondorCEComputingElement.py +++ b/Resources/Computing/HTCondorCEComputingElement.py @@ -526,7 +526,7 @@ def __cleanup(self): return now = datetime.datetime.utcnow() - if (self._lastCleanupTime - now).total_seconds < 60: + if (now - self._lastCleanupTime).total_seconds() < 60: self._cleanupLock.release() return From 937b2988d801da3da29d5891b6b77bac131d1178 Mon Sep 17 00:00:00 2001 From: Andre Sailer Date: Fri, 23 Apr 2021 12:03:37 +0200 Subject: [PATCH 4/6] HTCondorCE: Clean up log/out/err files for all CEs as we run only once and not for all CEs --- Resources/Computing/HTCondorCEComputingElement.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Resources/Computing/HTCondorCEComputingElement.py b/Resources/Computing/HTCondorCEComputingElement.py index 456ce743340..4bb697e7e05 100644 --- a/Resources/Computing/HTCondorCEComputingElement.py +++ b/Resources/Computing/HTCondorCEComputingElement.py @@ -542,9 +542,9 @@ def __cleanup(self): if status: self.log.error("Failure during HTCondorCE __cleanup", stdout) - # remove all out/err/log files older than "DaysToKeepLogs" days in the CE part of the working Directory - workDir = os.path.join(self.workingDirectory, self.ceName) - findPars = dict(workDir=workDir, days=self.daysToKeepLogs) + # remove all out/err/log files older than "DaysToKeepLogs" days in the working directory + # not running this for each CE so we do global cleanup + findPars = dict(workDir=self.workingDirectory, days=self.daysToKeepLogs) # remove all out/err/log files older than "DaysToKeepLogs" days status, stdout = commands.getstatusoutput( r'find %(workDir)s -mtime +%(days)s -type f \( -name "*.out" -o -name "*.err" -o -name "*.log" \) -delete ' % From e81d0a49c86851e7f57f466c3656ce20c1f683f5 Mon Sep 17 00:00:00 2001 From: Andre Sailer Date: Fri, 23 Apr 2021 12:03:40 +0200 Subject: [PATCH 5/6] HTCondorCE: fix use of static variables --- Resources/Computing/HTCondorCEComputingElement.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Resources/Computing/HTCondorCEComputingElement.py b/Resources/Computing/HTCondorCEComputingElement.py index 4bb697e7e05..e11da0a90c8 100644 --- a/Resources/Computing/HTCondorCEComputingElement.py +++ b/Resources/Computing/HTCondorCEComputingElement.py @@ -522,15 +522,15 @@ def __cleanup(self): # FIXME: again some issue with the working directory... # workingDirectory = self.ceParameters.get( 'WorkingDirectory', DEFAULT_WORKINGDIRECTORY ) - if not self._cleanupLock.acquire(False): + if not HTCondorCEComputingElement._cleanupLock.acquire(False): return now = datetime.datetime.utcnow() - if (now - self._lastCleanupTime).total_seconds() < 60: - self._cleanupLock.release() + if (now - HTCondorCEComputingElement._lastCleanupTime).total_seconds() < 60: + HTCondorCEComputingElement._cleanupLock.release() return - self._lastCleanupTime = now + HTCondorCEComputingElement._lastCleanupTime = now self.log.debug("Cleaning working directory: %s" % self.workingDirectory) From d7af92e4a36cdd2cfdb22e07f92919f4a1556fe4 Mon Sep 17 00:00:00 2001 From: Andrei Tsaregorodtsev Date: Wed, 28 Apr 2021 22:57:06 +0200 Subject: [PATCH 6/6] v7r0p56 notes and tags --- __init__.py | 2 +- release.notes | 8 ++++++++ setup.py | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/__init__.py b/__init__.py index e4478be06af..b0ee99c0888 100755 --- a/__init__.py +++ b/__init__.py @@ -95,7 +95,7 @@ else: majorVersion = 7 minorVersion = 0 - patchLevel = 55 + patchLevel = 56 preVersion = 0 version = "v%sr%s" % (majorVersion, minorVersion) diff --git a/release.notes b/release.notes index 25b6d42df33..aa8f621b1b2 100644 --- a/release.notes +++ b/release.notes @@ -1,3 +1,11 @@ +[v7r0p56] + +*Resources +FIX: (#5119) HTCondorCE: Limit calls to actual cleanup (find and delete files on disk) to + once per minute per SiteDirector, fixes #5118 +CHANGE: (#5119) HTCondorCE cleanup: Run the DIRAC_ executable purge with -O3 and -maxdepth + 1 to speed up the find + [v7r0p55] *TS diff --git a/setup.py b/setup.py index e1e1c02b514..344322717d9 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ setup( name="DIRAC", - version="7.0.55", + version="7.0.56", url="https://github.com/DIRACGRID/DIRAC", license="GPLv3", package_dir=package_dir,