Skip to content

Commit

Permalink
IQSS/10697 - Improve batch permission indexing (#10698)
Browse files Browse the repository at this point in the history
* reindex batches of 20 files instead of all at once

* Also only keep 100 files in list at a time

* release note

* Just do collections/datasets as you go

Avoids keeping everything in memory, also helps in tracking progress as
you can see the permissionindextime getting updated per dataset.

* fix merge issues, add logging

* put comments back to how they were #10697

* reduce logging #10697

* rename release note and add PR number #10697

* fix logging - finest for per file, space in message

* adding a space in log message - per review

---------

Co-authored-by: Philip Durbin <philip_durbin@harvard.edu>
  • Loading branch information
qqmyers and pdurbin authored Oct 16, 2024
1 parent c44ad65 commit d039a10
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 67 deletions.
7 changes: 7 additions & 0 deletions doc/release-notes/10697-improve-permission-indexing.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
### Reindexing after a role assignment is less memory intensive

Adding/removing a user from a role on a collection, particularly the root collection, could lead to a significant increase in memory use resulting in Dataverse itself failing with an out-of-memory condition. Such changes now consume much less memory.

If you have experienced out-of-memory failures in Dataverse in the past that could have been caused by this problem, you may wish to run a [reindex in place](https://guides.dataverse.org/en/latest/admin/solr-search-index.html#reindex-in-place) to update any out-of-date information.

For more information, see #10697 and #10698.
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
public class SolrIndexServiceBean {

private static final Logger logger = Logger.getLogger(SolrIndexServiceBean.class.getCanonicalName());

@EJB
DvObjectServiceBean dvObjectService;
@EJB
Expand Down Expand Up @@ -149,7 +149,7 @@ private List<DvObjectSolrDoc> constructDatasetSolrDocs(Dataset dataset) {
return solrDocs;
}

// private List<DvObjectSolrDoc> constructDatafileSolrDocs(DataFile dataFile) {
// private List<DvObjectSolrDoc> constructDatafileSolrDocs(DataFile dataFile) {
private List<DvObjectSolrDoc> constructDatafileSolrDocs(DataFile dataFile, Map<Long, List<String>> permStringByDatasetVersion) {
List<DvObjectSolrDoc> datafileSolrDocs = new ArrayList<>();
Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataFile.getOwner());
Expand All @@ -166,14 +166,14 @@ private List<DvObjectSolrDoc> constructDatafileSolrDocs(DataFile dataFile, Map<L
cachedPerms = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId());
}
if (cachedPerms != null) {
logger.fine("reusing cached perms for file " + dataFile.getId());
logger.finest("reusing cached perms for file " + dataFile.getId());
perms = cachedPerms;
} else if (datasetVersionFileIsAttachedTo.isReleased()) {
logger.fine("no cached perms, file is public/discoverable/searchable for file " + dataFile.getId());
logger.finest("no cached perms, file is public/discoverable/searchable for file " + dataFile.getId());
perms.add(IndexServiceBean.getPublicGroupString());
} else {
// go to the well (slow)
logger.fine("no cached perms, file is not public, finding perms for file " + dataFile.getId());
logger.finest("no cached perms, file is not public, finding perms for file " + dataFile.getId());
perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo);
}
} else {
Expand Down Expand Up @@ -204,13 +204,14 @@ private List<DvObjectSolrDoc> constructDatafileSolrDocsFromDataset(Dataset datas
} else {
perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo);
}

for (FileMetadata fileMetadata : datasetVersionFileIsAttachedTo.getFileMetadatas()) {
Long fileId = fileMetadata.getDataFile().getId();
String solrIdStart = IndexServiceBean.solrDocIdentifierFile + fileId;
String solrIdEnd = getDatasetOrDataFileSolrEnding(datasetVersionFileIsAttachedTo.getVersionState());
String solrId = solrIdStart + solrIdEnd;
DvObjectSolrDoc dataFileSolrDoc = new DvObjectSolrDoc(fileId.toString(), solrId, datasetVersionFileIsAttachedTo.getId(), fileMetadata.getLabel(), perms);
logger.fine("adding fileid " + fileId);
logger.finest("adding fileid " + fileId);
datafileSolrDocs.add(dataFileSolrDoc);
}
}
Expand Down Expand Up @@ -361,20 +362,19 @@ private void persistToSolr(Collection<SolrInputDocument> docs) throws SolrServer

public IndexResponse indexPermissionsOnSelfAndChildren(long definitionPointId) {
DvObject definitionPoint = dvObjectService.findDvObject(definitionPointId);
if ( definitionPoint == null ) {
if (definitionPoint == null) {
logger.log(Level.WARNING, "Cannot find a DvOpbject with id of {0}", definitionPointId);
return null;
} else {
return indexPermissionsOnSelfAndChildren(definitionPoint);
}
}

/**
* We use the database to determine direct children since there is no
* inheritance
*/
public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) {
List<DvObject> dvObjectsToReindexPermissionsFor = new ArrayList<>();
List<DataFile> filesToReindexAsBatch = new ArrayList<>();
/**
* @todo Re-indexing the definition point itself seems to be necessary
Expand All @@ -383,27 +383,47 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint)

// We don't create a Solr "primary/content" doc for the root dataverse
// so don't create a Solr "permission" doc either.
int i = 0;
int numObjects = 0;
if (definitionPoint.isInstanceofDataverse()) {
Dataverse selfDataverse = (Dataverse) definitionPoint;
if (!selfDataverse.equals(dataverseService.findRootDataverse())) {
dvObjectsToReindexPermissionsFor.add(definitionPoint);
indexPermissionsForOneDvObject(definitionPoint);
numObjects++;
}
List<Dataset> directChildDatasetsOfDvDefPoint = datasetService.findByOwnerId(selfDataverse.getId());
for (Dataset dataset : directChildDatasetsOfDvDefPoint) {
dvObjectsToReindexPermissionsFor.add(dataset);
indexPermissionsForOneDvObject(dataset);
numObjects++;
for (DataFile datafile : filesToReIndexPermissionsFor(dataset)) {
filesToReindexAsBatch.add(datafile);
i++;
if (i % 100 == 0) {
reindexFilesInBatches(filesToReindexAsBatch);
filesToReindexAsBatch.clear();
}
if (i % 1000 == 0) {
logger.fine("Progress: " +i + " files permissions reindexed");
}
}
logger.fine("Progress : dataset " + dataset.getId() + " permissions reindexed");
}
} else if (definitionPoint.isInstanceofDataset()) {
dvObjectsToReindexPermissionsFor.add(definitionPoint);
indexPermissionsForOneDvObject(definitionPoint);
numObjects++;
// index files
Dataset dataset = (Dataset) definitionPoint;
for (DataFile datafile : filesToReIndexPermissionsFor(dataset)) {
filesToReindexAsBatch.add(datafile);
i++;
if (i % 100 == 0) {
reindexFilesInBatches(filesToReindexAsBatch);
filesToReindexAsBatch.clear();
}
}
} else {
dvObjectsToReindexPermissionsFor.add(definitionPoint);
indexPermissionsForOneDvObject(definitionPoint);
numObjects++;
}

/**
Expand All @@ -412,64 +432,64 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint)
* @todo Should update timestamps, probably, even thought these are
* files, see https://github.com/IQSS/dataverse/issues/2421
*/
String response = reindexFilesInBatches(filesToReindexAsBatch);

for (DvObject dvObject : dvObjectsToReindexPermissionsFor) {
/**
* @todo do something with this response
*/
IndexResponse indexResponse = indexPermissionsForOneDvObject(dvObject);
}

reindexFilesInBatches(filesToReindexAsBatch);
logger.fine("Reindexed permissions for " + i + " files and " + numObjects + " datasets/collections");
return new IndexResponse("Number of dvObject permissions indexed for " + definitionPoint
+ ": " + dvObjectsToReindexPermissionsFor.size()
);
+ ": " + numObjects);
}

private String reindexFilesInBatches(List<DataFile> filesToReindexPermissionsFor) {
List<SolrInputDocument> docs = new ArrayList<>();
Map<Long, List<Long>> byParentId = new HashMap<>();
Map<Long, List<String>> permStringByDatasetVersion = new HashMap<>();
for (DataFile file : filesToReindexPermissionsFor) {
Dataset dataset = (Dataset) file.getOwner();
Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);
for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersionsToBuildCardsFor(dataset)) {
boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState());
if (cardShouldExist) {
List<String> cachedPermission = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId());
if (cachedPermission == null) {
logger.fine("no cached permission! Looking it up...");
List<DvObjectSolrDoc> fileSolrDocs = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion);
for (DvObjectSolrDoc fileSolrDoc : fileSolrDocs) {
Long datasetVersionId = fileSolrDoc.getDatasetVersionId();
if (datasetVersionId != null) {
permStringByDatasetVersion.put(datasetVersionId, fileSolrDoc.getPermissions());
int i = 0;
try {
for (DataFile file : filesToReindexPermissionsFor) {
Dataset dataset = (Dataset) file.getOwner();
Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);
for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersionsToBuildCardsFor(dataset)) {
boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState());
if (cardShouldExist) {
List<String> cachedPermission = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId());
if (cachedPermission == null) {
logger.finest("no cached permission! Looking it up...");
List<DvObjectSolrDoc> fileSolrDocs = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion);
for (DvObjectSolrDoc fileSolrDoc : fileSolrDocs) {
Long datasetVersionId = fileSolrDoc.getDatasetVersionId();
if (datasetVersionId != null) {
permStringByDatasetVersion.put(datasetVersionId, fileSolrDoc.getPermissions());
SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc);
docs.add(solrDoc);
i++;
}
}
} else {
logger.finest("cached permission is " + cachedPermission);
List<DvObjectSolrDoc> fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion);
for (DvObjectSolrDoc fileSolrDoc : fileSolrDocsBasedOnCachedPermissions) {
SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc);
docs.add(solrDoc);
i++;
}
}
} else {
logger.fine("cached permission is " + cachedPermission);
List<DvObjectSolrDoc> fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion);
for (DvObjectSolrDoc fileSolrDoc : fileSolrDocsBasedOnCachedPermissions) {
SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc);
docs.add(solrDoc);
if (i % 20 == 0) {
persistToSolr(docs);
docs = new ArrayList<>();
}
}
}
Long parent = file.getOwner().getId();
List<Long> existingList = byParentId.get(parent);
if (existingList == null) {
List<Long> empty = new ArrayList<>();
byParentId.put(parent, empty);
} else {
List<Long> updatedList = existingList;
updatedList.add(file.getId());
byParentId.put(parent, updatedList);
}
}
Long parent = file.getOwner().getId();
List<Long> existingList = byParentId.get(parent);
if (existingList == null) {
List<Long> empty = new ArrayList<>();
byParentId.put(parent, empty);
} else {
List<Long> updatedList = existingList;
updatedList.add(file.getId());
byParentId.put(parent, updatedList);
}
}
try {

persistToSolr(docs);
return " " + filesToReindexPermissionsFor.size() + " files indexed across " + docs.size() + " Solr documents ";
} catch (SolrServerException | IOException ex) {
Expand Down Expand Up @@ -517,29 +537,26 @@ public JsonObjectBuilder deleteAllFromSolrAndResetIndexTimes() throws SolrServer
}

/**
*
*
* @return A list of dvobject ids that should have their permissions
* re-indexed because Solr was down when a permission was added. The permission
* should be added to Solr. The id of the permission contains the type of
* DvObject and the primary key of the dvObject.
* DvObjects of type DataFile are currently skipped because their index
* time isn't stored in the database, since they are indexed along
* with their parent dataset (this may change).
* re-indexed because Solr was down when a permission was added. The
* permission should be added to Solr. The id of the permission contains the
* type of DvObject and the primary key of the dvObject. DvObjects of type
* DataFile are currently skipped because their index time isn't stored in
* the database, since they are indexed along with their parent dataset
* (this may change).
*/
public List<Long> findPermissionsInDatabaseButStaleInOrMissingFromSolr() {
List<Long> indexingRequired = new ArrayList<>();
long rootDvId = dataverseService.findRootDataverse().getId();
List<Long> missingDataversePermissionIds = dataverseService.findIdStalePermission();
List<Long> missingDatasetPermissionIds = datasetService.findIdStalePermission();
for (Long id : missingDataversePermissionIds) {
for (Long id : missingDataversePermissionIds) {
if (!id.equals(rootDvId)) {
indexingRequired.add(id);
indexingRequired.add(id);
}
}
indexingRequired.addAll(missingDatasetPermissionIds);
return indexingRequired;
}


}

0 comments on commit d039a10

Please sign in to comment.