Skip to content

Commit

Permalink
Fix issues with chunked TaxonomyIndexArray (apache#13028)
Browse files Browse the repository at this point in the history
Fix construction from index with multiple of chunk size ordinals.

Fix mutable post-refresh children array.
  • Loading branch information
stefanvodita authored Jan 25, 2024
1 parent d0098f8 commit 7552c50
Show file tree
Hide file tree
Showing 2 changed files with 208 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
*/
package org.apache.lucene.facet.taxonomy.directory;

import com.carrotsearch.hppc.IntHashSet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
Expand Down Expand Up @@ -80,7 +81,8 @@ public int length() {

public TaxonomyIndexArrays(IndexReader reader) throws IOException {
int[][] parentArray = allocateChunkedArray(reader.maxDoc(), 0);
if (parentArray.length > 0) {
assert parentArray.length > 0;
if (parentArray[0].length > 0) {
initParents(parentArray, reader, 0);
parentArray[0][0] = TaxonomyReader.INVALID_ORDINAL;
}
Expand All @@ -95,10 +97,10 @@ public TaxonomyIndexArrays(IndexReader reader, TaxonomyIndexArrays copyFrom) thr
// NRT reader was obtained, even though nothing was changed. this is not very likely
// to happen.
int[][] parentArray = allocateChunkedArray(reader.maxDoc(), copyFrom.parents.values.length - 1);
if (parentArray.length > 0) {
copyChunkedArray(copyFrom.parents.values, parentArray);
initParents(parentArray, reader, copyFrom.parents.length());
}
assert parentArray.length > 0;

copyChunkedArray(copyFrom.parents.values, parentArray);
initParents(parentArray, reader, copyFrom.parents.length());
parents = new ChunkedIntArray(parentArray);
if (copyFrom.initializedChildren) {
initChildrenSiblings(copyFrom);
Expand Down Expand Up @@ -164,13 +166,24 @@ private void computeChildrenSiblings(int first) {
siblings.set(0, TaxonomyReader.INVALID_ORDINAL);
}

int firstChunkStart = first - (first & CHUNK_MASK);
IntHashSet reallocatedChildChunks = new IntHashSet();
for (int i = first; i < length; i++) {
int parent = parents.get(i);
// The existing youngest child of the parent is the next older sibling of i.
// note that parents[i] is always < i, so the right-hand-side of
// the following line is already set when we get here
siblings.set(i, children.get(parent));
// The new youngest child of the parent is i.
if (parent < firstChunkStart) {
int chunkIdx = parent >> CHUNK_SIZE_BITS;
if (reallocatedChildChunks.contains(chunkIdx) == false) {
reallocatedChildChunks.add(chunkIdx);
int[] oldChildren = children.values[chunkIdx];
children.values[chunkIdx] = new int[CHUNK_SIZE];
System.arraycopy(oldChildren, 0, children.values[chunkIdx], 0, oldChildren.length);
}
}
children.set(parent, i);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,36 @@
*/
package org.apache.lucene.facet.taxonomy.directory;

import java.io.IOException;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.util.LuceneTestCase;

public class TestTaxonomyIndexArrays extends LuceneTestCase {

private void checkIntArraysEquals(
TaxonomyIndexArrays.ChunkedIntArray expected, TaxonomyIndexArrays.ChunkedIntArray actual) {
for (int i = 0; i < expected.values.length - 1; i++) {
assertSame(expected.values[i], actual.values[i]);
}
int lastOldChunk = expected.values.length - 1;
for (int i = 0; i < expected.values[lastOldChunk].length; i++) {
assertEquals(expected.values[lastOldChunk][i], actual.values[lastOldChunk][i]);
}
}

private void checkInvariants(TaxonomyIndexArrays oldArray, TaxonomyIndexArrays newArray) {
TaxonomyIndexArrays.ChunkedIntArray oldParents = oldArray.parents();
TaxonomyIndexArrays.ChunkedIntArray newParents = newArray.parents();
for (int i = 0; i < oldParents.values.length - 1; i++) {
assertSame(oldParents.values[i], newParents.values[i]);
}
int lastOldChunk = oldParents.values.length - 1;
for (int i = 0; i < oldParents.values[lastOldChunk].length; i++) {
assertEquals(oldParents.values[lastOldChunk][i], newParents.values[lastOldChunk][i]);
}
checkIntArraysEquals(oldParents, newParents);
TaxonomyIndexArrays.ChunkedIntArray oldSiblings = oldArray.siblings();
TaxonomyIndexArrays.ChunkedIntArray newSiblings = newArray.siblings();
checkIntArraysEquals(oldSiblings, newSiblings);
}

public void testRandom() {
Expand Down Expand Up @@ -59,4 +74,172 @@ public void testMultiplesOfChunkSize() {
ordinal = newOrdinal;
}
}

public void testConstructFromEmptyIndex() throws IOException {
Directory dir = newDirectory();

// Produce empty index
new IndexWriter(dir, newIndexWriterConfig(null)).close();

IndexReader reader = DirectoryReader.open(dir);

TaxonomyIndexArrays tia = new TaxonomyIndexArrays(reader);
assertEquals(0, tia.parents().length());

tia = new TaxonomyIndexArrays(reader, tia);
assertEquals(0, tia.parents().length());

reader.close();
dir.close();
}

public void testConstructFromIndex() throws IOException {
Directory dir = newDirectory();
TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(dir);
TaxonomyIndexArrays oldTia, newTia;
TaxonomyIndexArrays.ChunkedIntArray oldParents, newParents;

// Test 1
// Add one category. The first TIA will have a non-zero length incomplete chunk.
taxoWriter.addCategory(new FacetLabel("a"));
taxoWriter.commit();
try (IndexReader reader = DirectoryReader.open(dir)) {
oldTia = new TaxonomyIndexArrays(reader);
}

oldParents = oldTia.parents();
assertEquals(2, oldParents.length());
assertEquals(TaxonomyReader.INVALID_ORDINAL, oldParents.get(0));
assertEquals(TaxonomyReader.ROOT_ORDINAL, oldParents.get(1));

// Test 2
// Add enough categories to fill the first chunk.
for (int i = 2; i < TaxonomyIndexArrays.CHUNK_SIZE; i++) {
taxoWriter.addCategory(new FacetLabel("a", Integer.toString(i)));
}
taxoWriter.commit();
try (IndexReader reader = DirectoryReader.open(dir)) {
oldTia = new TaxonomyIndexArrays(reader);
}

oldParents = oldTia.parents();
assertEquals(TaxonomyIndexArrays.CHUNK_SIZE, oldParents.length());
assertEquals(TaxonomyReader.INVALID_ORDINAL, oldParents.get(0));
assertEquals(TaxonomyReader.ROOT_ORDINAL, oldParents.get(1));
for (int i = 2; i < oldParents.length(); i++) {
assertEquals(1, oldParents.get(i));
}

// Test 3
// Both TIAs have the same parents and siblings arrays.
oldTia.children(); // Initializes children
try (IndexReader reader = DirectoryReader.open(dir)) {
newTia = new TaxonomyIndexArrays(reader, oldTia);
}
checkInvariants(oldTia, newTia);

// Test 4
// Add one more category, which will start a new chunk on the new TIA.
taxoWriter.addCategory(new FacetLabel("a", Integer.toString(TaxonomyIndexArrays.CHUNK_SIZE)));
taxoWriter.commit();
try (IndexReader reader = DirectoryReader.open(dir)) {
newTia = new TaxonomyIndexArrays(reader, oldTia);
}

newParents = newTia.parents();
assertEquals(1 + TaxonomyIndexArrays.CHUNK_SIZE, newParents.length());
assertEquals(TaxonomyReader.INVALID_ORDINAL, newParents.get(0));
assertEquals(TaxonomyReader.ROOT_ORDINAL, newParents.get(1));
for (int i = 2; i < newParents.length(); i++) {
assertEquals(1, newParents.get(i));
}

// Test 5
// Fill the second chunk of the new TIA.
for (int i = 1; i < TaxonomyIndexArrays.CHUNK_SIZE; i++) {
taxoWriter.addCategory(
new FacetLabel("a", Integer.toString(i + TaxonomyIndexArrays.CHUNK_SIZE)));
}
taxoWriter.commit();
try (IndexReader reader = DirectoryReader.open(dir)) {
newTia = new TaxonomyIndexArrays(reader, oldTia);
}

newParents = newTia.parents();
assertEquals(2 * TaxonomyIndexArrays.CHUNK_SIZE, newParents.length());
assertEquals(TaxonomyReader.INVALID_ORDINAL, newParents.get(0));
assertEquals(TaxonomyReader.ROOT_ORDINAL, newParents.get(1));
for (int i = 2; i < newParents.length(); i++) {
assertEquals(1, newParents.get(i));
}

taxoWriter.close();
dir.close();
}

public void testRefresh() throws IOException {
Directory dir = newDirectory();

// Write two chunks worth of ordinals whose parents are ordinals 1 or 2
TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(dir);
taxoWriter.addCategory(new FacetLabel("a")); // ordinal 1
taxoWriter.addCategory(new FacetLabel("b")); // ordinal 2
for (int i = 0; i < 2 * TaxonomyIndexArrays.CHUNK_SIZE; i++) {
if (i % 2 == 0) {
taxoWriter.addCategory(new FacetLabel("a", Integer.toString(i)));
} else {
taxoWriter.addCategory(new FacetLabel("b", Integer.toString(i)));
}
}
taxoWriter.commit();

// Initialize old taxonomy arrays
IndexReader reader = DirectoryReader.open(dir);
TaxonomyIndexArrays oldTia = new TaxonomyIndexArrays(reader);
reader.close();
oldTia.children(); // Init the children

// Write one more batch of ordinals whose parents are ordinals 1 or 2 again.
for (int i = 2 * TaxonomyIndexArrays.CHUNK_SIZE; i < 3 * TaxonomyIndexArrays.CHUNK_SIZE; i++) {
if (i % 2 == 0) {
taxoWriter.addCategory(new FacetLabel("a", Integer.toString(i)));
} else {
taxoWriter.addCategory(new FacetLabel("b", Integer.toString(i)));
}
}
taxoWriter.close();

// Initialize new taxonomy arrays
reader = DirectoryReader.open(dir);
TaxonomyIndexArrays newTia = new TaxonomyIndexArrays(reader, oldTia);
reader.close();

// Parents and siblings are unchanged in the old range, children will have changed
checkInvariants(oldTia, newTia);

TaxonomyIndexArrays.ChunkedIntArray oldChildren = oldTia.children();
TaxonomyIndexArrays.ChunkedIntArray newChildren = newTia.children();

// The first chunk had to be reallocated to rewrite the value for ordinals 1 and 2
assertNotSame(oldChildren.values[0], newChildren.values[0]);
// The second chunk could stay the same, since none of these ordinals have children
assertSame(oldChildren.values[1], newChildren.values[1]);
// The third chunk had to be reallocated to grow
assertNotSame(oldChildren.values[2], newChildren.values[2]);

// Check contents of the first chunk
for (int i = 0; i < TaxonomyIndexArrays.CHUNK_SIZE; i++) {
if (i == 1 || i == 2) {
assertNotEquals(oldChildren.values[0][i], newChildren.values[0][i]);
} else {
assertEquals(oldChildren.values[0][i], newChildren.values[0][i]);
}
}
// Check contents of the third chunk
for (int i = 0; i < oldChildren.values[2].length; i++) {
assertEquals(oldChildren.values[2][i], newChildren.values[2][i]);
}

dir.close();
}
}

0 comments on commit 7552c50

Please sign in to comment.