diff --git a/Resources/Computing/HTCondorCEComputingElement.py b/Resources/Computing/HTCondorCEComputingElement.py index 087eec1a70f..3fbe8bff320 100644 --- a/Resources/Computing/HTCondorCEComputingElement.py +++ b/Resources/Computing/HTCondorCEComputingElement.py @@ -54,7 +54,9 @@ import os import tempfile import commands +import datetime import errno +import threading from DIRAC import S_OK, S_ERROR, gConfig from DIRAC.Resources.Computing.ComputingElement import ComputingElement @@ -170,6 +172,10 @@ class HTCondorCEComputingElement(ComputingElement): implementing the functions jobSubmit, getJobOutput """ + # static variables to ensure single cleanup every minute + _lastCleanupTime = datetime.datetime.utcnow() + _cleanupLock = threading.Lock() + ############################################################################# def __init__(self, ceUniqueID): """ Standard constructor. @@ -535,21 +541,33 @@ def __cleanup(self): # FIXME: again some issue with the working directory... # workingDirectory = self.ceParameters.get( 'WorkingDirectory', DEFAULT_WORKINGDIRECTORY ) + if not HTCondorCEComputingElement._cleanupLock.acquire(False): + return + + now = datetime.datetime.utcnow() + if (now - HTCondorCEComputingElement._lastCleanupTime).total_seconds() < 60: + HTCondorCEComputingElement._cleanupLock.release() + return + + HTCondorCEComputingElement._lastCleanupTime = now + self.log.debug("Cleaning working directory: %s" % self.workingDirectory) # remove all files older than 120 minutes starting with DIRAC_ Condor will # push files on submission, but it takes at least a few seconds until this # happens so we can't directly unlink after condor_submit - status, stdout = commands.getstatusoutput('find %s -mmin +120 -name "DIRAC_*" -delete ' % self.workingDirectory) + status, stdout = commands.getstatusoutput('find -O3 %s -maxdepth 1 -mmin +120 -name "DIRAC_*" -delete ' % + self.workingDirectory) if status: self.log.error("Failure during HTCondorCE __cleanup", stdout) - # 
remove all out/err/log files older than "DaysToKeepLogs" days in the CE part of the working Directory - workDir = os.path.join(self.workingDirectory, self.ceName) - findPars = dict(workDir=workDir, days=self.daysToKeepLogs) + # remove all out/err/log files older than "DaysToKeepLogs" days in the working directory + # the rate-limited cleanup no longer runs for every CE, so clean the whole working directory here + findPars = dict(workDir=self.workingDirectory, days=self.daysToKeepLogs) # remove all out/err/log files older than "DaysToKeepLogs" days status, stdout = commands.getstatusoutput( r'find %(workDir)s -mtime +%(days)s -type f \( -name "*.out" -o -name "*.err" -o -name "*.log" \) -delete ' % findPars) if status: self.log.error("Failure during HTCondorCE __cleanup", stdout) + self._cleanupLock.release() diff --git a/release.notes b/release.notes index 65d1cf5d31b..5f86c37c0b7 100644 --- a/release.notes +++ b/release.notes @@ -632,6 +632,14 @@ FIX: (#4551) align ProxyDB test to current changes NEW: (#4289) Document how to run integration tests in docker NEW: (#4551) add DNProperties description to Registry/Users subsection +[v7r0p56] + +*Resources +FIX: (#5119) HTCondorCE: Limit calls to actual cleanup (find and delete files on disk) to + once per minute per SiteDirector, fixes #5118 +CHANGE: (#5119) HTCondorCE cleanup: Run the purge of DIRAC_ executables with the find options -O3 and + -maxdepth 1 to speed up the find + [v7r0p55] *TS