From d039a108f992f170d5323a26a6d6f7dafb14029b Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 16 Oct 2024 08:59:11 -0400 Subject: [PATCH] IQSS/10697 - Improve batch permission indexing (#10698) * reindex batches of 20 files instead of all at once * Also only keep 100 files in list at a time * release note * Just do collections/datasets as you go Avoids keeping everything in memory, also helps in tracking progress as you can see the permissionindextime getting updated per dataset. * fix merge issues, add logging * put comments back to how they were #10697 * reduce logging #10697 * rename release note and add PR number #10697 * fix logging - finest for per file, space in message * adding a space in log message - per review --------- Co-authored-by: Philip Durbin --- .../10697-improve-permission-indexing.md | 7 + .../search/SolrIndexServiceBean.java | 151 ++++++++++-------- 2 files changed, 91 insertions(+), 67 deletions(-) create mode 100644 doc/release-notes/10697-improve-permission-indexing.md diff --git a/doc/release-notes/10697-improve-permission-indexing.md b/doc/release-notes/10697-improve-permission-indexing.md new file mode 100644 index 00000000000..b232b1c4d3c --- /dev/null +++ b/doc/release-notes/10697-improve-permission-indexing.md @@ -0,0 +1,7 @@ +### Reindexing after a role assignment is less memory intensive + +Adding/removing a user from a role on a collection, particularly the root collection, could lead to a significant increase in memory use resulting in Dataverse itself failing with an out-of-memory condition. Such changes now consume much less memory. + +If you have experienced out-of-memory failures in Dataverse in the past that could have been caused by this problem, you may wish to run a [reindex in place](https://guides.dataverse.org/en/latest/admin/solr-search-index.html#reindex-in-place) to update any out-of-date information. + +For more information, see #10697 and #10698. diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index cfe29ea08c7..e4d885276d0 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -34,7 +34,7 @@ public class SolrIndexServiceBean { private static final Logger logger = Logger.getLogger(SolrIndexServiceBean.class.getCanonicalName()); - + @EJB DvObjectServiceBean dvObjectService; @EJB @@ -149,7 +149,7 @@ private List constructDatasetSolrDocs(Dataset dataset) { return solrDocs; } -// private List constructDatafileSolrDocs(DataFile dataFile) { + // private List constructDatafileSolrDocs(DataFile dataFile) { private List constructDatafileSolrDocs(DataFile dataFile, Map> permStringByDatasetVersion) { List datafileSolrDocs = new ArrayList<>(); Map desiredCards = searchPermissionsService.getDesiredCards(dataFile.getOwner()); @@ -166,14 +166,14 @@ private List constructDatafileSolrDocs(DataFile dataFile, Map constructDatafileSolrDocsFromDataset(Dataset datas } else { perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo); } + for (FileMetadata fileMetadata : datasetVersionFileIsAttachedTo.getFileMetadatas()) { Long fileId = fileMetadata.getDataFile().getId(); String solrIdStart = IndexServiceBean.solrDocIdentifierFile + fileId; String solrIdEnd = getDatasetOrDataFileSolrEnding(datasetVersionFileIsAttachedTo.getVersionState()); String solrId = solrIdStart + solrIdEnd; DvObjectSolrDoc dataFileSolrDoc = new DvObjectSolrDoc(fileId.toString(), solrId, datasetVersionFileIsAttachedTo.getId(), fileMetadata.getLabel(), perms); - logger.fine("adding fileid " + fileId); + logger.finest("adding fileid " + fileId); datafileSolrDocs.add(dataFileSolrDoc); } } @@ -361,20 +362,19 @@ private void persistToSolr(Collection docs) throws SolrServer public IndexResponse indexPermissionsOnSelfAndChildren(long definitionPointId) { DvObject definitionPoint = dvObjectService.findDvObject(definitionPointId); - if ( definitionPoint == null ) { + if (definitionPoint == null) { logger.log(Level.WARNING, "Cannot find a DvOpbject with id of {0}", definitionPointId); return null; } else { return indexPermissionsOnSelfAndChildren(definitionPoint); } } - + /** * We use the database to determine direct children since there is no * inheritance */ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) { - List dvObjectsToReindexPermissionsFor = new ArrayList<>(); List filesToReindexAsBatch = new ArrayList<>(); /** * @todo Re-indexing the definition point itself seems to be necessary @@ -383,27 +383,47 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) // We don't create a Solr "primary/content" doc for the root dataverse // so don't create a Solr "permission" doc either. + int i = 0; + int numObjects = 0; if (definitionPoint.isInstanceofDataverse()) { Dataverse selfDataverse = (Dataverse) definitionPoint; if (!selfDataverse.equals(dataverseService.findRootDataverse())) { - dvObjectsToReindexPermissionsFor.add(definitionPoint); + indexPermissionsForOneDvObject(definitionPoint); + numObjects++; } List directChildDatasetsOfDvDefPoint = datasetService.findByOwnerId(selfDataverse.getId()); for (Dataset dataset : directChildDatasetsOfDvDefPoint) { - dvObjectsToReindexPermissionsFor.add(dataset); + indexPermissionsForOneDvObject(dataset); + numObjects++; for (DataFile datafile : filesToReIndexPermissionsFor(dataset)) { filesToReindexAsBatch.add(datafile); + i++; + if (i % 100 == 0) { + reindexFilesInBatches(filesToReindexAsBatch); + filesToReindexAsBatch.clear(); + } + if (i % 1000 == 0) { + logger.fine("Progress: " +i + " files permissions reindexed"); + } } + logger.fine("Progress : dataset " + dataset.getId() + " permissions reindexed"); } } else if (definitionPoint.isInstanceofDataset()) { - dvObjectsToReindexPermissionsFor.add(definitionPoint); + indexPermissionsForOneDvObject(definitionPoint); + numObjects++; // index files Dataset dataset = (Dataset) definitionPoint; for (DataFile datafile : filesToReIndexPermissionsFor(dataset)) { filesToReindexAsBatch.add(datafile); + i++; + if (i % 100 == 0) { + reindexFilesInBatches(filesToReindexAsBatch); + filesToReindexAsBatch.clear(); + } } } else { - dvObjectsToReindexPermissionsFor.add(definitionPoint); + indexPermissionsForOneDvObject(definitionPoint); + numObjects++; } /** @@ -412,64 +432,64 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) * @todo Should update timestamps, probably, even thought these are * files, see https://github.com/IQSS/dataverse/issues/2421 */ - String response = reindexFilesInBatches(filesToReindexAsBatch); - - for (DvObject dvObject : dvObjectsToReindexPermissionsFor) { - /** - * @todo do something with this response - */ - IndexResponse indexResponse = indexPermissionsForOneDvObject(dvObject); - } - + reindexFilesInBatches(filesToReindexAsBatch); + logger.fine("Reindexed permissions for " + i + " files and " + numObjects + " datasets/collections"); return new IndexResponse("Number of dvObject permissions indexed for " + definitionPoint - + ": " + dvObjectsToReindexPermissionsFor.size() - ); + + ": " + numObjects); } private String reindexFilesInBatches(List filesToReindexPermissionsFor) { List docs = new ArrayList<>(); Map> byParentId = new HashMap<>(); Map> permStringByDatasetVersion = new HashMap<>(); - for (DataFile file : filesToReindexPermissionsFor) { - Dataset dataset = (Dataset) file.getOwner(); - Map desiredCards = searchPermissionsService.getDesiredCards(dataset); - for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersionsToBuildCardsFor(dataset)) { - boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState()); - if (cardShouldExist) { - List cachedPermission = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId()); - if (cachedPermission == null) { - logger.fine("no cached permission! Looking it up..."); - List fileSolrDocs = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion); - for (DvObjectSolrDoc fileSolrDoc : fileSolrDocs) { - Long datasetVersionId = fileSolrDoc.getDatasetVersionId(); - if (datasetVersionId != null) { - permStringByDatasetVersion.put(datasetVersionId, fileSolrDoc.getPermissions()); + int i = 0; + try { + for (DataFile file : filesToReindexPermissionsFor) { + Dataset dataset = (Dataset) file.getOwner(); + Map desiredCards = searchPermissionsService.getDesiredCards(dataset); + for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersionsToBuildCardsFor(dataset)) { + boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState()); + if (cardShouldExist) { + List cachedPermission = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId()); + if (cachedPermission == null) { + logger.finest("no cached permission! Looking it up..."); + List fileSolrDocs = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion); + for (DvObjectSolrDoc fileSolrDoc : fileSolrDocs) { + Long datasetVersionId = fileSolrDoc.getDatasetVersionId(); + if (datasetVersionId != null) { + permStringByDatasetVersion.put(datasetVersionId, fileSolrDoc.getPermissions()); + SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc); + docs.add(solrDoc); + i++; + } + } + } else { + logger.finest("cached permission is " + cachedPermission); + List fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion); + for (DvObjectSolrDoc fileSolrDoc : fileSolrDocsBasedOnCachedPermissions) { SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc); docs.add(solrDoc); + i++; } } - } else { - logger.fine("cached permission is " + cachedPermission); - List fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion); - for (DvObjectSolrDoc fileSolrDoc : fileSolrDocsBasedOnCachedPermissions) { - SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc); - docs.add(solrDoc); + if (i % 20 == 0) { + persistToSolr(docs); + docs = new ArrayList<>(); } } } + Long parent = file.getOwner().getId(); + List existingList = byParentId.get(parent); + if (existingList == null) { + List empty = new ArrayList<>(); + byParentId.put(parent, empty); + } else { + List updatedList = existingList; + updatedList.add(file.getId()); + byParentId.put(parent, updatedList); + } } - Long parent = file.getOwner().getId(); - List existingList = byParentId.get(parent); - if (existingList == null) { - List empty = new ArrayList<>(); - byParentId.put(parent, empty); - } else { - List updatedList = existingList; - updatedList.add(file.getId()); - byParentId.put(parent, updatedList); - } - } - try { + persistToSolr(docs); return " " + filesToReindexPermissionsFor.size() + " files indexed across " + docs.size() + " Solr documents "; } catch (SolrServerException | IOException ex) { @@ -517,29 +537,26 @@ public JsonObjectBuilder deleteAllFromSolrAndResetIndexTimes() throws SolrServer } /** - * - * * @return A list of dvobject ids that should have their permissions - * re-indexed because Solr was down when a permission was added. The permission - * should be added to Solr. The id of the permission contains the type of - * DvObject and the primary key of the dvObject. - * DvObjects of type DataFile are currently skipped because their index - * time isn't stored in the database, since they are indexed along - * with their parent dataset (this may change). + * re-indexed because Solr was down when a permission was added. The + * permission should be added to Solr. The id of the permission contains the + * type of DvObject and the primary key of the dvObject. DvObjects of type + * DataFile are currently skipped because their index time isn't stored in + * the database, since they are indexed along with their parent dataset + * (this may change). */ public List findPermissionsInDatabaseButStaleInOrMissingFromSolr() { List indexingRequired = new ArrayList<>(); long rootDvId = dataverseService.findRootDataverse().getId(); List missingDataversePermissionIds = dataverseService.findIdStalePermission(); List missingDatasetPermissionIds = datasetService.findIdStalePermission(); - for (Long id : missingDataversePermissionIds) { + for (Long id : missingDataversePermissionIds) { if (!id.equals(rootDvId)) { - indexingRequired.add(id); + indexingRequired.add(id); } } indexingRequired.addAll(missingDatasetPermissionIds); return indexingRequired; } - }