Skip to content

Commit

Permalink
Fix NPE when sampling for quantization in Lucene99HnswScalarQuantized…
Browse files Browse the repository at this point in the history
…VectorsFormat (apache#13027)

When merging `Lucene99HnswScalarQuantizedVectorsFormat`, an NPE is possible when deleted documents are present.

`ScalarQuantizer#fromVectors` doesn't take deleted documents into account. This means `FloatVectorValues#size` may be larger than the actual number of live documents. Consequently, the sampling loop can iterate past the last live vector, and an NPE will be thrown.
  • Loading branch information
benwtrent committed Jan 23, 2024
1 parent b951c4c commit 9ccfc30
Show file tree
Hide file tree
Showing 6 changed files with 206 additions and 43 deletions.
7 changes: 7 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,13 @@ Other

* GITHUB#12934: Cleaning up old references to Lucene/Solr. (Jakub Slowinski)

======================== Lucene 9.9.2 =======================

Bug Fixes
---------------------

* GITHUB#13027: Fix NPE when sampling for quantization in Lucene99HnswScalarQuantizedVectorsFormat (Ben Trent)

======================== Lucene 9.9.1 =======================

Bug Fixes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -547,9 +547,20 @@ static ScalarQuantizer mergeAndRecalculateQuantiles(
// merged
// segment view
if (mergedQuantiles == null || shouldRecomputeQuantiles(mergedQuantiles, quantizationStates)) {
int numVectors = 0;
FloatVectorValues vectorValues =
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
mergedQuantiles = ScalarQuantizer.fromVectors(vectorValues, confidenceInterval);
// iterate vectorValues and increment numVectors
for (int doc = vectorValues.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = vectorValues.nextDoc()) {
numVectors++;
}
mergedQuantiles =
ScalarQuantizer.fromVectors(
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState),
confidenceInterval,
numVectors);
}
return mergedQuantiles;
}
Expand Down Expand Up @@ -639,7 +650,8 @@ void finish() throws IOException {
new FloatVectorWrapper(
floatVectors,
fieldInfo.getVectorSimilarityFunction() == VectorSimilarityFunction.COSINE),
confidenceInterval);
confidenceInterval,
floatVectors.size());
minQuantile = quantizer.getLowerQuantile();
maxQuantile = quantizer.getUpperQuantile();
if (infoStream.isEnabled(QUANTIZED_VECTOR_COMPONENT)) {
Expand Down
94 changes: 67 additions & 27 deletions lucene/core/src/java/org/apache/lucene/util/ScalarQuantizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,53 @@ public String toString() {

private static final Random random = new Random(42);

/**
 * Picks {@code sampleSize} distinct ordinals from {@code [0, numFloatVecs)} using reservoir
 * sampling (Algorithm R) and returns them sorted ascending, so callers can read the sampled
 * vectors in a single forward pass.
 *
 * <p>Randomness comes from the class's fixed-seed {@code random}, so the selection is
 * deterministic for a given {@code (numFloatVecs, sampleSize)} pair.
 */
static int[] reservoirSampleIndices(int numFloatVecs, int sampleSize) {
  // Seed the reservoir with the first sampleSize ordinals.
  int[] reservoir = new int[sampleSize];
  for (int ord = 0; ord < sampleSize; ord++) {
    reservoir[ord] = ord;
  }
  // Every later ordinal displaces a random reservoir slot with probability sampleSize/(ord+1).
  for (int ord = sampleSize; ord < numFloatVecs; ord++) {
    int slot = random.nextInt(ord + 1);
    if (slot < sampleSize) {
      reservoir[slot] = ord;
    }
  }
  Arrays.sort(reservoir);
  return reservoir;
}

/**
 * Reads the vectors at the given ordinals (which must be sorted ascending) from
 * {@code floatVectorValues} and concatenates them into one flat array of length
 * {@code vectorsToTake.length * dimension}.
 *
 * @param floatVectorValues vector source, consumed strictly forward via {@code nextDoc()}
 * @param vectorsToTake ascending vector ordinals to copy out
 * @return the sampled vectors, concatenated in ordinal order
 * @throws IOException if reading a vector fails
 */
static float[] sampleVectors(FloatVectorValues floatVectorValues, int[] vectorsToTake)
    throws IOException {
  int dim = floatVectorValues.dimension();
  float[] flat = new float[vectorsToTake.length * dim];
  int cursor = 0; // number of nextDoc() calls issued so far
  int dest = 0; // write offset into flat
  for (int k = 0; k < vectorsToTake.length; k++) {
    int target = vectorsToTake[k];
    // We cannot use `advance(docId)` as MergedVectorValues does not support it
    while (cursor <= target) {
      floatVectorValues.nextDoc();
      cursor++;
    }
    assert floatVectorValues.docID() != NO_MORE_DOCS;
    float[] vector = floatVectorValues.vectorValue();
    System.arraycopy(vector, 0, flat, dest, vector.length);
    dest += dim;
  }
  return flat;
}

/**
 * Calculates quantiles from all vectors reported by {@code floatVectorValues}, using
 * {@link FloatVectorValues#size()} as the total vector count and the default sample size.
 * See {@link #fromVectors(FloatVectorValues, float, int)} for details on how the quantiles
 * are calculated.
 *
 * <p>NOTE: do not use this overload when the index contains deleted vectors; call
 * {@link #fromVectors(FloatVectorValues, float, int)} with the live-vector count instead,
 * because the total vector count is what accounts for deleted documents when sampling.
 *
 * @param floatVectorValues the float vector values from which to calculate the quantiles
 * @param confidenceInterval the confidence interval used to calculate the quantiles
 * @return a new {@link ScalarQuantizer} instance
 * @throws IOException if there is an error reading the float vector values
 */
public static ScalarQuantizer fromVectors(
    FloatVectorValues floatVectorValues, float confidenceInterval) throws IOException {
  int totalVectorCount = floatVectorValues.size();
  return fromVectors(
      floatVectorValues, confidenceInterval, totalVectorCount, SCALAR_QUANTIZATION_SAMPLE_SIZE);
}

/**
* This will read the float vector values and calculate the quantiles. If the number of float
* vectors is less than {@link #SCALAR_QUANTIZATION_SAMPLE_SIZE} then all the values will be read
Expand All @@ -201,13 +248,26 @@ public String toString() {
*
* @param floatVectorValues the float vector values from which to calculate the quantiles
* @param confidenceInterval the confidence interval used to calculate the quantiles
* @param totalVectorCount the total number of live float vectors in the index. This is vital for
* accounting for deleted documents when calculating the quantiles.
* @return A new {@link ScalarQuantizer} instance
* @throws IOException if there is an error reading the float vector values
*/
public static ScalarQuantizer fromVectors(
FloatVectorValues floatVectorValues, float confidenceInterval) throws IOException {
FloatVectorValues floatVectorValues, float confidenceInterval, int totalVectorCount)
throws IOException {
return fromVectors(
floatVectorValues, confidenceInterval, totalVectorCount, SCALAR_QUANTIZATION_SAMPLE_SIZE);
}

static ScalarQuantizer fromVectors(
FloatVectorValues floatVectorValues,
float confidenceInterval,
int totalVectorCount,
int quantizationSampleSize)
throws IOException {
assert 0.9f <= confidenceInterval && confidenceInterval <= 1f;
if (floatVectorValues.size() == 0) {
if (totalVectorCount == 0) {
return new ScalarQuantizer(0f, 0f, confidenceInterval);
}
if (confidenceInterval == 1f) {
Expand All @@ -222,9 +282,9 @@ public static ScalarQuantizer fromVectors(
return new ScalarQuantizer(min, max, confidenceInterval);
}
int dim = floatVectorValues.dimension();
if (floatVectorValues.size() < SCALAR_QUANTIZATION_SAMPLE_SIZE) {
if (totalVectorCount <= quantizationSampleSize) {
int copyOffset = 0;
float[] values = new float[floatVectorValues.size() * dim];
float[] values = new float[totalVectorCount * dim];
while (floatVectorValues.nextDoc() != NO_MORE_DOCS) {
float[] floatVector = floatVectorValues.vectorValue();
System.arraycopy(floatVector, 0, values, copyOffset, floatVector.length);
Expand All @@ -233,30 +293,10 @@ public static ScalarQuantizer fromVectors(
float[] upperAndLower = getUpperAndLowerQuantile(values, confidenceInterval);
return new ScalarQuantizer(upperAndLower[0], upperAndLower[1], confidenceInterval);
}
int numFloatVecs = floatVectorValues.size();
int numFloatVecs = totalVectorCount;
// Reservoir sample the vector ordinals we want to read
float[] values = new float[SCALAR_QUANTIZATION_SAMPLE_SIZE * dim];
int[] vectorsToTake = IntStream.range(0, SCALAR_QUANTIZATION_SAMPLE_SIZE).toArray();
for (int i = SCALAR_QUANTIZATION_SAMPLE_SIZE; i < numFloatVecs; i++) {
int j = random.nextInt(i + 1);
if (j < SCALAR_QUANTIZATION_SAMPLE_SIZE) {
vectorsToTake[j] = i;
}
}
Arrays.sort(vectorsToTake);
int copyOffset = 0;
int index = 0;
for (int i : vectorsToTake) {
while (index <= i) {
// We cannot use `advance(docId)` as MergedVectorValues does not support it
floatVectorValues.nextDoc();
index++;
}
assert floatVectorValues.docID() != NO_MORE_DOCS;
float[] floatVector = floatVectorValues.vectorValue();
System.arraycopy(floatVector, 0, values, copyOffset, floatVector.length);
copyOffset += dim;
}
int[] vectorsToTake = reservoirSampleIndices(numFloatVecs, quantizationSampleSize);
float[] values = sampleVectors(floatVectorValues, vectorsToTake);
float[] upperAndLower = getUpperAndLowerQuantile(values, confidenceInterval);
return new ScalarQuantizer(upperAndLower[0], upperAndLower[1], confidenceInterval);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ public void testQuantizedVectorsWriteAndRead() throws Exception {
ScalarQuantizer scalarQuantizer =
ScalarQuantizer.fromVectors(
new Lucene99ScalarQuantizedVectorsWriter.FloatVectorWrapper(vectors, normalize),
confidenceInterval);
confidenceInterval,
numVectors);
float[] expectedCorrections = new float[numVectors];
byte[][] expectedVectors = new byte[numVectors][];
for (int i = 0; i < numVectors; i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import static org.apache.lucene.util.TestScalarQuantizer.randomFloats;

import java.io.IOException;
import java.util.Set;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.tests.util.LuceneTestCase;
Expand All @@ -36,7 +37,7 @@ public void testToEuclidean() throws IOException {
float error = Math.max((100 - confidenceInterval) * 0.01f, 0.01f);
FloatVectorValues floatVectorValues = fromFloats(floats);
ScalarQuantizer scalarQuantizer =
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval);
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, floats.length);
byte[][] quantized = new byte[floats.length][];
float[] offsets =
quantizeVectors(scalarQuantizer, floats, quantized, VectorSimilarityFunction.EUCLIDEAN);
Expand Down Expand Up @@ -64,9 +65,9 @@ public void testToCosine() throws IOException {

for (float confidenceInterval : new float[] {0.9f, 0.95f, 0.99f, (1 - 1f / (dims + 1)), 1f}) {
float error = Math.max((100 - confidenceInterval) * 0.01f, 0.01f);
FloatVectorValues floatVectorValues = fromFloatsNormalized(floats);
FloatVectorValues floatVectorValues = fromFloatsNormalized(floats, null);
ScalarQuantizer scalarQuantizer =
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval);
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, floats.length);
byte[][] quantized = new byte[floats.length][];
float[] offsets =
quantizeVectorsNormalized(
Expand Down Expand Up @@ -100,7 +101,7 @@ public void testToDotProduct() throws IOException {
float error = Math.max((100 - confidenceInterval) * 0.01f, 0.01f);
FloatVectorValues floatVectorValues = fromFloats(floats);
ScalarQuantizer scalarQuantizer =
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval);
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, floats.length);
byte[][] quantized = new byte[floats.length][];
float[] offsets =
quantizeVectors(scalarQuantizer, floats, quantized, VectorSimilarityFunction.DOT_PRODUCT);
Expand Down Expand Up @@ -130,7 +131,7 @@ public void testToMaxInnerProduct() throws IOException {
float error = Math.max((100 - confidenceInterval) * 0.5f, 0.5f);
FloatVectorValues floatVectorValues = fromFloats(floats);
ScalarQuantizer scalarQuantizer =
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval);
ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, floats.length);
byte[][] quantized = new byte[floats.length][];
float[] offsets =
quantizeVectors(
Expand Down Expand Up @@ -204,8 +205,9 @@ private static float[] quantizeVectorsNormalized(
return offsets;
}

private static FloatVectorValues fromFloatsNormalized(float[][] floats) {
return new TestScalarQuantizer.TestSimpleFloatVectorValues(floats) {
private static FloatVectorValues fromFloatsNormalized(
float[][] floats, Set<Integer> deletedVectors) {
return new TestScalarQuantizer.TestSimpleFloatVectorValues(floats, deletedVectors) {
@Override
public float[] vectorValue() throws IOException {
if (curDoc == -1 || curDoc >= floats.length) {
Expand Down
Loading

0 comments on commit 9ccfc30

Please sign in to comment.