Skip to content

Commit

Permalink
Fix NPE when sampling for quantization in Lucene99HnswScalarQuantized…
Browse files Browse the repository at this point in the history
…VectorsFormat (apache#13027)

When merging `Lucene99HnswScalarQuantizedVectorsFormat`, an NPE is possible when deleted documents are present.

`ScalarQuantizer#fromVectors` doesn't take deleted documents into account. This means `FloatVectorValues#size` may be larger than the actual number of live documents. Consequently, the sampling loop can iterate past the last live vector, and an NPE will be thrown.
  • Loading branch information
benwtrent committed Jan 23, 2024
1 parent b951c4c commit 9ccfc30
Show file tree
Hide file tree
Showing 6 changed files with 206 additions and 43 deletions.
7 changes: 7 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,13 @@ Other

* GITHUB#12934: Cleaning up old references to Lucene/Solr. (Jakub Slowinski)

======================== Lucene 9.9.2 =======================

Bug Fixes
---------------------

* GITHUB#13027: Fix NPE when sampling for quantization in Lucene99HnswScalarQuantizedVectorsFormat (Ben Trent)

======================== Lucene 9.9.1 =======================

Bug Fixes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -547,9 +547,20 @@ static ScalarQuantizer mergeAndRecalculateQuantiles(
// merged
// segment view
if (mergedQuantiles == null || shouldRecomputeQuantiles(mergedQuantiles, quantizationStates)) {
int numVectors = 0;
FloatVectorValues vectorValues =
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
mergedQuantiles = ScalarQuantizer.fromVectors(vectorValues, confidenceInterval);
// iterate vectorValues and increment numVectors
for (int doc = vectorValues.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = vectorValues.nextDoc()) {
numVectors++;
}
mergedQuantiles =
ScalarQuantizer.fromVectors(
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState),
confidenceInterval,
numVectors);
}
return mergedQuantiles;
}
Expand Down Expand Up @@ -639,7 +650,8 @@ void finish() throws IOException {
new FloatVectorWrapper(
floatVectors,
fieldInfo.getVectorSimilarityFunction() == VectorSimilarityFunction.COSINE),
confidenceInterval);
confidenceInterval,
floatVectors.size());
minQuantile = quantizer.getLowerQuantile();
maxQuantile = quantizer.getUpperQuantile();
if (infoStream.isEnabled(QUANTIZED_VECTOR_COMPONENT)) {
Expand Down
94 changes: 67 additions & 27 deletions lucene/core/src/java/org/apache/lucene/util/ScalarQuantizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,53 @@ public String toString() {

private static final Random random = new Random(42);

/**
 * Picks {@code sampleSize} distinct ordinals from {@code [0, numFloatVecs)} using reservoir
 * sampling (Algorithm R) and returns them sorted ascending, so callers can read the sampled
 * vectors in a single forward pass.
 *
 * <p>Randomness comes from the class's fixed-seed {@code random}, so the selection is
 * deterministic for a given {@code (numFloatVecs, sampleSize)} pair.
 */
static int[] reservoirSampleIndices(int numFloatVecs, int sampleSize) {
  // Seed the reservoir with the first sampleSize ordinals.
  int[] reservoir = new int[sampleSize];
  for (int ord = 0; ord < sampleSize; ord++) {
    reservoir[ord] = ord;
  }
  // Every later ordinal displaces a random reservoir slot with probability sampleSize/(ord+1).
  for (int ord = sampleSize; ord < numFloatVecs; ord++) {
    int slot = random.nextInt(ord + 1);
    if (slot < sampleSize) {
      reservoir[slot] = ord;
    }
  }
  Arrays.sort(reservoir);
  return reservoir;
}

/**
 * Reads the vectors at the given ordinals (which must be sorted ascending) from
 * {@code floatVectorValues} and concatenates them into one flat array of length
 * {@code vectorsToTake.length * dimension}.
 *
 * @param floatVectorValues vector source, consumed strictly forward via {@code nextDoc()}
 * @param vectorsToTake ascending vector ordinals to copy out
 * @return the sampled vectors, concatenated in ordinal order
 * @throws IOException if reading a vector fails
 */
static float[] sampleVectors(FloatVectorValues floatVectorValues, int[] vectorsToTake)
    throws IOException {
  int dim = floatVectorValues.dimension();
  float[] flat = new float[vectorsToTake.length * dim];
  int cursor = 0; // number of nextDoc() calls issued so far
  int dest = 0; // write offset into flat
  for (int k = 0; k < vectorsToTake.length; k++) {
    int target = vectorsToTake[k];
    // We cannot use `advance(docId)` as MergedVectorValues does not support it
    while (cursor <= target) {
      floatVectorValues.nextDoc();
      cursor++;
    }
    assert floatVectorValues.docID() != NO_MORE_DOCS;
    float[] vector = floatVectorValues.vectorValue();
    System.arraycopy(vector, 0, flat, dest, vector.length);
    dest += dim;
  }
  return flat;
}

/**
 * Calculates quantiles from all vectors reported by {@code floatVectorValues}, using
 * {@link FloatVectorValues#size()} as the total vector count and the default sample size.
 * See {@link #fromVectors(FloatVectorValues, float, int)} for details on how the quantiles
 * are calculated.
 *
 * <p>NOTE: do not use this overload when the index contains deleted vectors; call
 * {@link #fromVectors(FloatVectorValues, float, int)} with the live-vector count instead,
 * because the total vector count is what accounts for deleted documents when sampling.
 *
 * @param floatVectorValues the float vector values from which to calculate the quantiles
 * @param confidenceInterval the confidence interval used to calculate the quantiles
 * @return a new {@link ScalarQuantizer} instance
 * @throws IOException if there is an error reading the float vector values
 */
public static ScalarQuantizer fromVectors(
    FloatVectorValues floatVectorValues, float confidenceInterval) throws IOException {
  int totalVectorCount = floatVectorValues.size();
  return fromVectors(
      floatVectorValues, confidenceInterval, totalVectorCount, SCALAR_QUANTIZATION_SAMPLE_SIZE);
}

/**
* This will read the float vector values and calculate the quantiles. If the number of float
* vectors is less than {@link #SCALAR_QUANTIZATION_SAMPLE_SIZE} then all the values will be read
Expand All @@ -201,13 +248,26 @@ public String toString() {
*
* @param floatVectorValues the float vector values from which to calculate the quantiles
* @param confidenceInterval the confidence interval used to calculate the quantiles
* @param totalVectorCount the total number of live float vectors in the index. This is vital for
* accounting for deleted documents when calculating the quantiles.
* @return A new {@link ScalarQuantizer} instance
* @throws IOException if there is an error reading the float vector values
*/
public static ScalarQuantizer fromVectors(
FloatVectorValues floatVectorValues, float confidenceInterval) throws IOException {
FloatVectorValues floatVectorValues, float confidenceInterval, int totalVectorCount)
throws IOException {
return fromVectors(
floatVectorValues, confidenceInterval, totalVectorCount, SCALAR_QUANTIZATION_SAMPLE_SIZE);
}

static ScalarQuantizer fromVectors(
FloatVectorValues floatVectorValues,
float confidenceInterval,
int totalVectorCount,
int quantizationSampleSize)
throws IOException {
assert 0.9f <= confidenceInterval && confidenceInterval <= 1f;
if (floatVectorValues.size() == 0) {
if (totalVectorCount == 0) {
return new ScalarQuantizer(0f, 0f, confidenceInterval);
}
if (confidenceInterval == 1f) {
Expand All @@ -222,9 +282,9 @@ public static ScalarQuantizer fromVectors(
return new ScalarQuantizer(min, max, confidenceInterval);
}
int dim = floatVectorValues.dimension();
if (floatVectorValues.size() < SCALAR_QUANTIZATION_SAMPLE_SIZE) {
if (totalVectorCount <= quantizationSampleSize) {
int copyOffset = 0;
float[] values = new float[floatVectorValues.size() * dim];
float[] values = new float[totalVectorCount * dim];
while (floatVectorValues.nextDoc() != NO_MORE_DOCS) {
float[] floatVector = floatVectorValues.vectorValue();
System.arraycopy(floatVector, 0, values, copyOffset, floatVector.length);
Expand All @@ -233,30 +293,10 @@ public static ScalarQuantizer fromVectors(
float[] upperAndLower = getUpperAndLowerQuantile(values, confidenceInterval);
return new ScalarQuantizer(upperAndLower[0], upperAndLower[1], confidenceInterval);
}
int numFloatVecs = floatVectorValues.size();
int numFloatVecs = totalVectorCount;
// Reservoir sample the vector ordinals we want to read
float[] values = new float[SCALAR_QUANTIZATION_SAMPLE_SIZE * dim];
int[] vectorsToTake = IntStream.range(0, SCALAR_QUANTIZATION_SAMPLE_SIZE).toArray();
for (int i = SCALAR_QUANTIZATION_SAMPLE_SIZE; i < numFloatVecs; i++) {
int j = random.nextInt(i + 1);
if (j < SCALAR_QUANTIZATION_SAMPLE_SIZE) {
vectorsToTake[j] = i;
}
}
Arrays.sort(vectorsToTake);
int copyOffset = 0;
int index = 0;
for (int i : vectorsToTake) {
while (index <= i) {
// We cannot use `advance(docId)` as MergedVectorValues does not support it
floatVectorValues.nextDoc();
index++;
}
assert floatVectorValues.docID() != NO_MORE_DOCS;
float[] floatVector = floatVectorValues.vectorValue();
System.arraycopy(floatVector, 0, values, copyOffset, floatVector.length);
copyOffset += dim;
}
int[] vectorsToTake = reservoirSampleIndices(numFloatVecs, quantizationSampleSize);
float[] values = sampleVectors(floatVectorValues, vectorsToTake);
float[] upperAndLower = getUpperAndLowerQuantile(values, confidenceInterval);
return new ScalarQuantizer(upperAndLower[0], upperAndLower[1], confidenceInterval);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ public void testQuantizedVectorsWriteAndRead() throws Exception {
ScalarQuantizer scalarQuantizer =
ScalarQuantizer.fromVectors(
new Lucene99ScalarQuantizedVectorsWriter.FloatVectorWrapper(vectors, normalize),
confidenceInterval);
confidenceInterval,
numVectors);
float[] expectedCorrections = new float[numVectors];
byte[][] expectedVectors = new byte[numVectors][];
for (int i = 0; i < numVectors; i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import static org.apache.lucene.util.TestScalarQuantizer.randomFloats;

import java.io.IOException;
import java.util.Set;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.tests.util.LuceneTestCase;
Expand All @@ -36,7 +37,7 @@ public void testToEuclidean() throws IOException {
float error = Math.max((100 - confidenceInterval) * 0.01f, 0.01f);
FloatVectorValues floatVectorValues = fromFloats(floats);
ScalarQuantizer scalarQuantizer =
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval);
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, floats.length);
byte[][] quantized = new byte[floats.length][];
float[] offsets =
quantizeVectors(scalarQuantizer, floats, quantized, VectorSimilarityFunction.EUCLIDEAN);
Expand Down Expand Up @@ -64,9 +65,9 @@ public void testToCosine() throws IOException {

for (float confidenceInterval : new float[] {0.9f, 0.95f, 0.99f, (1 - 1f / (dims + 1)), 1f}) {
float error = Math.max((100 - confidenceInterval) * 0.01f, 0.01f);
FloatVectorValues floatVectorValues = fromFloatsNormalized(floats);
FloatVectorValues floatVectorValues = fromFloatsNormalized(floats, null);
ScalarQuantizer scalarQuantizer =
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval);
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, floats.length);
byte[][] quantized = new byte[floats.length][];
float[] offsets =
quantizeVectorsNormalized(
Expand Down Expand Up @@ -100,7 +101,7 @@ public void testToDotProduct() throws IOException {
float error = Math.max((100 - confidenceInterval) * 0.01f, 0.01f);
FloatVectorValues floatVectorValues = fromFloats(floats);
ScalarQuantizer scalarQuantizer =
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval);
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, floats.length);
byte[][] quantized = new byte[floats.length][];
float[] offsets =
quantizeVectors(scalarQuantizer, floats, quantized, VectorSimilarityFunction.DOT_PRODUCT);
Expand Down Expand Up @@ -130,7 +131,7 @@ public void testToMaxInnerProduct() throws IOException {
float error = Math.max((100 - confidenceInterval) * 0.5f, 0.5f);
FloatVectorValues floatVectorValues = fromFloats(floats);
ScalarQuantizer scalarQuantizer =
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval);
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, floats.length);
byte[][] quantized = new byte[floats.length][];
float[] offsets =
quantizeVectors(
Expand Down Expand Up @@ -204,8 +205,9 @@ private static float[] quantizeVectorsNormalized(
return offsets;
}

private static FloatVectorValues fromFloatsNormalized(float[][] floats) {
return new TestScalarQuantizer.TestSimpleFloatVectorValues(floats) {
private static FloatVectorValues fromFloatsNormalized(
float[][] floats, Set<Integer> deletedVectors) {
return new TestScalarQuantizer.TestSimpleFloatVectorValues(floats, deletedVectors) {
@Override
public float[] vectorValue() throws IOException {
if (curDoc == -1 || curDoc >= floats.length) {
Expand Down
Loading

0 comments on commit 9ccfc30

Please sign in to comment.