diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f37ded8fa8f0..854e6ff769e6 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -81,6 +81,13 @@ Other * GITHUB#12934: Cleaning up old references to Lucene/Solr. (Jakub Slowinski) +======================== Lucene 9.9.2 ======================= + +Bug Fixes +--------------------- + +* GITHUB#13027: Fix NPE when sampling for quantization in Lucene99HnswScalarQuantizedVectorsFormat (Ben Trent) + ======================== Lucene 9.9.1 ======================= Bug Fixes diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java index f731e52ba73c..d9995f9fdb4b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java @@ -547,9 +547,20 @@ static ScalarQuantizer mergeAndRecalculateQuantiles( // merged // segment view if (mergedQuantiles == null || shouldRecomputeQuantiles(mergedQuantiles, quantizationStates)) { + int numVectors = 0; FloatVectorValues vectorValues = KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); - mergedQuantiles = ScalarQuantizer.fromVectors(vectorValues, confidenceInterval); + // iterate vectorValues and increment numVectors + for (int doc = vectorValues.nextDoc(); + doc != DocIdSetIterator.NO_MORE_DOCS; + doc = vectorValues.nextDoc()) { + numVectors++; + } + mergedQuantiles = + ScalarQuantizer.fromVectors( + KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState), + confidenceInterval, + numVectors); } return mergedQuantiles; } @@ -639,7 +650,8 @@ void finish() throws IOException { new FloatVectorWrapper( floatVectors, fieldInfo.getVectorSimilarityFunction() == VectorSimilarityFunction.COSINE), - confidenceInterval); + 
confidenceInterval, + floatVectors.size()); minQuantile = quantizer.getLowerQuantile(); maxQuantile = quantizer.getUpperQuantile(); if (infoStream.isEnabled(QUANTIZED_VECTOR_COMPONENT)) { diff --git a/lucene/core/src/java/org/apache/lucene/util/ScalarQuantizer.java b/lucene/core/src/java/org/apache/lucene/util/ScalarQuantizer.java index 41ec4fe7ef8d..f5641e6134b7 100644 --- a/lucene/core/src/java/org/apache/lucene/util/ScalarQuantizer.java +++ b/lucene/core/src/java/org/apache/lucene/util/ScalarQuantizer.java @@ -192,6 +192,53 @@ public String toString() { private static final Random random = new Random(42); + static int[] reservoirSampleIndices(int numFloatVecs, int sampleSize) { + int[] vectorsToTake = IntStream.range(0, sampleSize).toArray(); + for (int i = sampleSize; i < numFloatVecs; i++) { + int j = random.nextInt(i + 1); + if (j < sampleSize) { + vectorsToTake[j] = i; + } + } + Arrays.sort(vectorsToTake); + return vectorsToTake; + } + + static float[] sampleVectors(FloatVectorValues floatVectorValues, int[] vectorsToTake) + throws IOException { + int dim = floatVectorValues.dimension(); + float[] values = new float[vectorsToTake.length * dim]; + int copyOffset = 0; + int index = 0; + for (int i : vectorsToTake) { + while (index <= i) { + // We cannot use `advance(docId)` as MergedVectorValues does not support it + floatVectorValues.nextDoc(); + index++; + } + assert floatVectorValues.docID() != NO_MORE_DOCS; + float[] floatVector = floatVectorValues.vectorValue(); + System.arraycopy(floatVector, 0, values, copyOffset, floatVector.length); + copyOffset += dim; + } + return values; + } + + /** + * See {@link #fromVectors(FloatVectorValues, float, int)} for details on how the quantiles are + * calculated. NOTE: If there are deleted vectors in the index, do not use this method, but + * instead use {@link #fromVectors(FloatVectorValues, float, int)}. This is because the + * totalVectorCount is used to account for deleted documents when sampling. 
+ */ + public static ScalarQuantizer fromVectors( + FloatVectorValues floatVectorValues, float confidenceInterval) throws IOException { + return fromVectors( + floatVectorValues, + confidenceInterval, + floatVectorValues.size(), + SCALAR_QUANTIZATION_SAMPLE_SIZE); + } + /** * This will read the float vector values and calculate the quantiles. If the number of float * vectors is less than {@link #SCALAR_QUANTIZATION_SAMPLE_SIZE} then all the values will be read @@ -201,13 +248,26 @@ public String toString() { * * @param floatVectorValues the float vector values from which to calculate the quantiles * @param confidenceInterval the confidence interval used to calculate the quantiles + * @param totalVectorCount the total number of live float vectors in the index. This is vital for + * accounting for deleted documents when calculating the quantiles. * @return A new {@link ScalarQuantizer} instance * @throws IOException if there is an error reading the float vector values */ public static ScalarQuantizer fromVectors( - FloatVectorValues floatVectorValues, float confidenceInterval) throws IOException { + FloatVectorValues floatVectorValues, float confidenceInterval, int totalVectorCount) + throws IOException { + return fromVectors( + floatVectorValues, confidenceInterval, totalVectorCount, SCALAR_QUANTIZATION_SAMPLE_SIZE); + } + + static ScalarQuantizer fromVectors( + FloatVectorValues floatVectorValues, + float confidenceInterval, + int totalVectorCount, + int quantizationSampleSize) + throws IOException { assert 0.9f <= confidenceInterval && confidenceInterval <= 1f; - if (floatVectorValues.size() == 0) { + if (totalVectorCount == 0) { return new ScalarQuantizer(0f, 0f, confidenceInterval); } if (confidenceInterval == 1f) { @@ -222,9 +282,9 @@ public static ScalarQuantizer fromVectors( return new ScalarQuantizer(min, max, confidenceInterval); } int dim = floatVectorValues.dimension(); - if (floatVectorValues.size() < SCALAR_QUANTIZATION_SAMPLE_SIZE) { + if 
(totalVectorCount <= quantizationSampleSize) { int copyOffset = 0; - float[] values = new float[floatVectorValues.size() * dim]; + float[] values = new float[totalVectorCount * dim]; while (floatVectorValues.nextDoc() != NO_MORE_DOCS) { float[] floatVector = floatVectorValues.vectorValue(); System.arraycopy(floatVector, 0, values, copyOffset, floatVector.length); @@ -233,30 +293,10 @@ public static ScalarQuantizer fromVectors( float[] upperAndLower = getUpperAndLowerQuantile(values, confidenceInterval); return new ScalarQuantizer(upperAndLower[0], upperAndLower[1], confidenceInterval); } - int numFloatVecs = floatVectorValues.size(); + int numFloatVecs = totalVectorCount; // Reservoir sample the vector ordinals we want to read - float[] values = new float[SCALAR_QUANTIZATION_SAMPLE_SIZE * dim]; - int[] vectorsToTake = IntStream.range(0, SCALAR_QUANTIZATION_SAMPLE_SIZE).toArray(); - for (int i = SCALAR_QUANTIZATION_SAMPLE_SIZE; i < numFloatVecs; i++) { - int j = random.nextInt(i + 1); - if (j < SCALAR_QUANTIZATION_SAMPLE_SIZE) { - vectorsToTake[j] = i; - } - } - Arrays.sort(vectorsToTake); - int copyOffset = 0; - int index = 0; - for (int i : vectorsToTake) { - while (index <= i) { - // We cannot use `advance(docId)` as MergedVectorValues does not support it - floatVectorValues.nextDoc(); - index++; - } - assert floatVectorValues.docID() != NO_MORE_DOCS; - float[] floatVector = floatVectorValues.vectorValue(); - System.arraycopy(floatVector, 0, values, copyOffset, floatVector.length); - copyOffset += dim; - } + int[] vectorsToTake = reservoirSampleIndices(numFloatVecs, quantizationSampleSize); + float[] values = sampleVectors(floatVectorValues, vectorsToTake); float[] upperAndLower = getUpperAndLowerQuantile(values, confidenceInterval); return new ScalarQuantizer(upperAndLower[0], upperAndLower[1], confidenceInterval); } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java 
b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java index 7ddcfb67825b..9d4404878b1c 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java @@ -70,7 +70,8 @@ public void testQuantizedVectorsWriteAndRead() throws Exception { ScalarQuantizer scalarQuantizer = ScalarQuantizer.fromVectors( new Lucene99ScalarQuantizedVectorsWriter.FloatVectorWrapper(vectors, normalize), - confidenceInterval); + confidenceInterval, + numVectors); float[] expectedCorrections = new float[numVectors]; byte[][] expectedVectors = new byte[numVectors][]; for (int i = 0; i < numVectors; i++) { diff --git a/lucene/core/src/test/org/apache/lucene/util/TestScalarQuantizedVectorSimilarity.java b/lucene/core/src/test/org/apache/lucene/util/TestScalarQuantizedVectorSimilarity.java index addcbf23188f..3fab38312c34 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestScalarQuantizedVectorSimilarity.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestScalarQuantizedVectorSimilarity.java @@ -21,6 +21,7 @@ import static org.apache.lucene.util.TestScalarQuantizer.randomFloats; import java.io.IOException; +import java.util.Set; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.tests.util.LuceneTestCase; @@ -36,7 +37,7 @@ public void testToEuclidean() throws IOException { float error = Math.max((100 - confidenceInterval) * 0.01f, 0.01f); FloatVectorValues floatVectorValues = fromFloats(floats); ScalarQuantizer scalarQuantizer = - ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval); + ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, floats.length); byte[][] quantized = new byte[floats.length][]; float[] offsets = quantizeVectors(scalarQuantizer, floats, quantized, 
VectorSimilarityFunction.EUCLIDEAN); @@ -64,9 +65,9 @@ public void testToCosine() throws IOException { for (float confidenceInterval : new float[] {0.9f, 0.95f, 0.99f, (1 - 1f / (dims + 1)), 1f}) { float error = Math.max((100 - confidenceInterval) * 0.01f, 0.01f); - FloatVectorValues floatVectorValues = fromFloatsNormalized(floats); + FloatVectorValues floatVectorValues = fromFloatsNormalized(floats, null); ScalarQuantizer scalarQuantizer = - ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval); + ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, floats.length); byte[][] quantized = new byte[floats.length][]; float[] offsets = quantizeVectorsNormalized( @@ -100,7 +101,7 @@ public void testToDotProduct() throws IOException { float error = Math.max((100 - confidenceInterval) * 0.01f, 0.01f); FloatVectorValues floatVectorValues = fromFloats(floats); ScalarQuantizer scalarQuantizer = - ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval); + ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, floats.length); byte[][] quantized = new byte[floats.length][]; float[] offsets = quantizeVectors(scalarQuantizer, floats, quantized, VectorSimilarityFunction.DOT_PRODUCT); @@ -130,7 +131,7 @@ public void testToMaxInnerProduct() throws IOException { float error = Math.max((100 - confidenceInterval) * 0.5f, 0.5f); FloatVectorValues floatVectorValues = fromFloats(floats); ScalarQuantizer scalarQuantizer = - ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval); + ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, floats.length); byte[][] quantized = new byte[floats.length][]; float[] offsets = quantizeVectors( @@ -204,8 +205,9 @@ private static float[] quantizeVectorsNormalized( return offsets; } - private static FloatVectorValues fromFloatsNormalized(float[][] floats) { - return new TestScalarQuantizer.TestSimpleFloatVectorValues(floats) { + private static FloatVectorValues fromFloatsNormalized( 
+ float[][] floats, Set<Integer> deletedVectors) { + return new TestScalarQuantizer.TestSimpleFloatVectorValues(floats, deletedVectors) { @Override public float[] vectorValue() throws IOException { if (curDoc == -1 || curDoc >= floats.length) { diff --git a/lucene/core/src/test/org/apache/lucene/util/TestScalarQuantizer.java b/lucene/core/src/test/org/apache/lucene/util/TestScalarQuantizer.java index 66c4b41527e1..bcdfbec8b653 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestScalarQuantizer.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestScalarQuantizer.java @@ -17,6 +17,8 @@ package org.apache.lucene.util; import java.io.IOException; +import java.util.HashSet; +import java.util.Set; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.tests.util.LuceneTestCase; @@ -30,7 +32,7 @@ public void testQuantizeAndDeQuantize() throws IOException { float[][] floats = randomFloats(numVecs, dims); FloatVectorValues floatVectorValues = fromFloats(floats); - ScalarQuantizer scalarQuantizer = ScalarQuantizer.fromVectors(floatVectorValues, 1); + ScalarQuantizer scalarQuantizer = ScalarQuantizer.fromVectors(floatVectorValues, 1, numVecs); float[] dequantized = new float[dims]; byte[] quantized = new byte[dims]; byte[] requantized = new byte[dims]; @@ -71,6 +73,87 @@ public void testEdgeCase() { assertEquals(1f, upperAndLower[1], 1e-7f); } + public void testSamplingEdgeCases() throws IOException { + int numVecs = 65; + int dims = 64; + float[][] floats = randomFloats(numVecs, dims); + FloatVectorValues floatVectorValues = fromFloats(floats); + int[] vectorsToTake = new int[] {0, floats.length - 1}; + float[] sampled = ScalarQuantizer.sampleVectors(floatVectorValues, vectorsToTake); + int i = 0; + for (; i < dims; i++) { + assertEquals(floats[vectorsToTake[0]][i], sampled[i], 0.0f); + } + for (; i < dims * 2; i++) { + assertEquals(floats[vectorsToTake[1]][i - dims], sampled[i], 0.0f); + } + 
} + + public void testVectorSampling() throws IOException { + int numVecs = random().nextInt(123) + 5; + int dims = 4; + float[][] floats = randomFloats(numVecs, dims); + FloatVectorValues floatVectorValues = fromFloats(floats); + int[] vectorsToTake = + ScalarQuantizer.reservoirSampleIndices(numVecs, random().nextInt(numVecs - 1) + 1); + int prev = vectorsToTake[0]; + // ensure sorted & unique + for (int i = 1; i < vectorsToTake.length; i++) { + assertTrue(vectorsToTake[i] > prev); + prev = vectorsToTake[i]; + } + float[] sampled = ScalarQuantizer.sampleVectors(floatVectorValues, vectorsToTake); + // ensure we got the right vectors + for (int i = 0; i < vectorsToTake.length; i++) { + for (int j = 0; j < dims; j++) { + assertEquals(floats[vectorsToTake[i]][j], sampled[i * dims + j], 0.0f); + } + } + } + + public void testScalarWithSampling() throws IOException { + int numVecs = random().nextInt(128) + 5; + int dims = 64; + float[][] floats = randomFloats(numVecs, dims); + // Should not throw + { + TestSimpleFloatVectorValues floatVectorValues = + fromFloatsWithRandomDeletions(floats, random().nextInt(numVecs - 1) + 1); + ScalarQuantizer.fromVectors( + floatVectorValues, + 0.99f, + floatVectorValues.numLiveVectors, + floatVectorValues.numLiveVectors - 1); + } + { + TestSimpleFloatVectorValues floatVectorValues = + fromFloatsWithRandomDeletions(floats, random().nextInt(numVecs - 1) + 1); + ScalarQuantizer.fromVectors( + floatVectorValues, + 0.99f, + floatVectorValues.numLiveVectors, + floatVectorValues.numLiveVectors + 1); + } + { + TestSimpleFloatVectorValues floatVectorValues = + fromFloatsWithRandomDeletions(floats, random().nextInt(numVecs - 1) + 1); + ScalarQuantizer.fromVectors( + floatVectorValues, + 0.99f, + floatVectorValues.numLiveVectors, + floatVectorValues.numLiveVectors); + } + { + TestSimpleFloatVectorValues floatVectorValues = + fromFloatsWithRandomDeletions(floats, random().nextInt(numVecs - 1) + 1); + ScalarQuantizer.fromVectors( + 
floatVectorValues, + 0.99f, + floatVectorValues.numLiveVectors, + random().nextInt(floatVectorValues.floats.length - 1) + 1); + } + } + static void shuffleArray(float[] ar) { for (int i = ar.length - 1; i > 0; i--) { int index = random().nextInt(i + 1); @@ -97,15 +180,29 @@ static float[][] randomFloats(int num, int dims) { } static FloatVectorValues fromFloats(float[][] floats) { - return new TestSimpleFloatVectorValues(floats); + return new TestSimpleFloatVectorValues(floats, null); + } + + static TestSimpleFloatVectorValues fromFloatsWithRandomDeletions( + float[][] floats, int numDeleted) { + Set<Integer> deletedVectors = new HashSet<>(); + for (int i = 0; i < numDeleted; i++) { + deletedVectors.add(random().nextInt(floats.length)); + } + return new TestSimpleFloatVectorValues(floats, deletedVectors); } static class TestSimpleFloatVectorValues extends FloatVectorValues { protected final float[][] floats; + protected final Set<Integer> deletedVectors; + protected final int numLiveVectors; protected int curDoc = -1; - TestSimpleFloatVectorValues(float[][] values) { + TestSimpleFloatVectorValues(float[][] values, Set<Integer> deletedVectors) { this.floats = values; + this.deletedVectors = deletedVectors; + this.numLiveVectors = + deletedVectors == null ? values.length : values.length - deletedVectors.size(); } @Override @@ -136,14 +233,18 @@ public int docID() { @Override public int nextDoc() throws IOException { - curDoc++; + while (++curDoc < floats.length) { + if (deletedVectors == null || !deletedVectors.contains(curDoc)) { + return curDoc; + } + } return docID(); } @Override public int advance(int target) throws IOException { - curDoc = target; - return docID(); + curDoc = target - 1; + return nextDoc(); } } }