forked from facebookresearch/faiss
-
Notifications
You must be signed in to change notification settings - Fork 12
/
HNSW.h
342 lines (256 loc) · 8.99 KB
/
HNSW.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <vector>
#include <unordered_set>
#include <queue>
#include <omp.h>
#include <LightGBM/boosting.h>
#include <LightGBM/prediction_early_stop.h>
#include "Index.h"
#include "FaissAssert.h"
#include "utils.h"
namespace faiss {
/** Implementation of the Hierarchical Navigable Small World
* datastructure.
*
* Efficient and robust approximate nearest neighbor search using
* Hierarchical Navigable Small World graphs
*
* Yu. A. Malkov, D. A. Yashunin, arXiv 2017
*
* This implmentation is heavily influenced by the NMSlib
* implementation by Yury Malkov and Leonid Boystov
* (https://github.com/searchivarius/nmslib)
*
* The HNSW object stores only the neighbor link structure, see
* IndexHNSW.h for the full index object.
*/
struct VisitedTable;
struct DistanceComputer; // from AuxIndexStructures
struct HNSW {
/// internal storage of vectors (32 bits: this is expensive)
typedef int storage_idx_t;
/// Faiss results are 64-bit
typedef Index::idx_t idx_t;
typedef std::pair<float, storage_idx_t> Node;
/** Heap structure that allows fast
*/
struct MinimaxHeap {
int n;
int k;
int nvalid;
std::vector<storage_idx_t> ids;
std::vector<float> dis;
typedef faiss::CMax<float, storage_idx_t> HC;
explicit MinimaxHeap(int n): n(n), k(0), nvalid(0), ids(n), dis(n) {}
void push(storage_idx_t i, float v);
float max() const;
int size() const;
void clear();
int pop_min(float *vmin_out = nullptr);
int count_below(float thresh);
};
/// to sort pairs of (id, distance) from nearest to fathest or the reverse
struct NodeDistCloser {
float d;
int id;
NodeDistCloser(float d, int id): d(d), id(id) {}
bool operator < (const NodeDistCloser &obj1) const { return d < obj1.d; }
};
struct NodeDistFarther {
float d;
int id;
NodeDistFarther(float d, int id): d(d), id(id) {}
bool operator < (const NodeDistFarther &obj1) const { return d > obj1.d; }
};
/// assignment probability to each layer (sum=1)
std::vector<double> assign_probas;
/// number of neighbors stored per layer (cumulative), should not
/// be changed after first add
std::vector<int> cum_nneighbor_per_level;
/// level of each vector (base level = 1), size = ntotal
std::vector<int> levels;
/// offsets[i] is the offset in the neighbors array where vector i is stored
/// size ntotal + 1
std::vector<size_t> offsets;
/// neighbors[offsets[i]:offsets[i+1]] is the list of neighbors of vector i
/// for all levels. this is where all storage goes.
std::vector<storage_idx_t> neighbors;
/// entry point in the search structure (one of the points with maximum level
storage_idx_t entry_point;
faiss::RandomGenerator rng;
/// maximum level
int max_level;
/// expansion factor at construction time
int efConstruction;
/// expansion factor at search time
int efSearch;
/// during search: do we check whether the next best distance is good enough?
bool check_relative_distance = true;
/// number of entry points in levels > 0.
int upper_beam;
/// use bounded queue during exploration
bool search_bounded_queue = true;
// Ground truth nearest neighbor indexes.
std::vector<std::vector<long>> gtvector;
// When to make prediction.
std::vector<long> pred_thresh;
// Prediction models for decision tree approach.
std::vector<LightGBM::Boosting *> boosters;
// Early stop condition for decision tree.
// We didn't employ early stopping, so this is just a dummy.
LightGBM::PredictionEarlyStopConfig tree_config;
LightGBM::PredictionEarlyStopInstance tree_early_stop =
LightGBM::CreatePredictionEarlyStopInstance(std::string("none"),
tree_config);
// methods that initialize the tree sizes
/// initialize the assign_probas and cum_nneighbor_per_level to
/// have 2*M links on level 0 and M links on levels > 0
void set_default_probas(int M, float levelMult);
/// set nb of neighbors for this level (before adding anything)
void set_nb_neighbors(int level_no, int n);
// methods that access the tree sizes
/// nb of neighbors for this level
int nb_neighbors(int layer_no) const;
/// cumumlative nb up to (and excluding) this level
int cum_nb_neighbors(int layer_no) const;
/// range of entries in the neighbors table of vertex no at layer_no
void neighbor_range(idx_t no, int layer_no,
size_t * begin, size_t * end) const;
/// only mandatory parameter: nb of neighbors
explicit HNSW(int M = 32);
/// pick a random level for a new point
int random_level();
/// add n random levels to table (for debugging...)
void fill_with_random_links(size_t n);
void add_links_starting_from(DistanceComputer& ptdis,
storage_idx_t pt_id,
storage_idx_t nearest,
float d_nearest,
int level,
omp_lock_t *locks,
VisitedTable &vt);
/** add point pt_id on all levels <= pt_level and build the link
* structure for them. */
void add_with_locks(DistanceComputer& ptdis, int pt_level, int pt_id,
std::vector<omp_lock_t>& locks,
VisitedTable& vt);
int search_from_candidates(DistanceComputer& qdis, int k,
idx_t *I, float *D,
MinimaxHeap& candidates,
VisitedTable &vt,
int level, int nres_in = 0) const;
std::priority_queue<Node> search_from_candidate_unbounded(
const Node& node,
DistanceComputer& qdis,
int ef,
VisitedTable *vt
) const;
// For search_mode = 1.
// Generate training data for prediction-based approach.
void search_from_candidate_unbounded_train(
const Node& node,
DistanceComputer& qdis,
idx_t *I, float *D, int k,
idx_t gt_idx,
VisitedTable *vt
) const;
// For search_mode = 2.
// Prediction-based adaptive learned early termination.
void search_from_candidate_unbounded_pred(
const Node& node,
DistanceComputer& qdis,
idx_t *I, float *D, int k,
const float *x, size_t d, long pred_max,
VisitedTable *vt
) const;
// For search_mode = 3.
// ndis-based fixed configuration to find the minimum number
// of distance evaluations to reach certain accuracy targets. This is
// needed for generating training data and grid search on different
// intermediate search result features.
void search_from_candidate_unbounded_ndis(
const Node& node,
DistanceComputer& qdis,
idx_t *I, float *D, int k,
VisitedTable *vt
) const;
/// search interface
void search(DistanceComputer& qdis, int k,
idx_t *I, float *D,
VisitedTable& vt) const;
// Customized search() for search_mode = 1, 2, 3.
void search_custom(DistanceComputer& qdis, int k,
idx_t *I, float *D,
int search_mode, idx_t gt_idx,
const float *x, size_t d, long pred_max,
VisitedTable& vt) const;
void reset();
void clear_neighbor_tables(int level);
void print_neighbor_stats(int level) const;
int prepare_level_tab(size_t n, bool preset_levels = false);
static void shrink_neighbor_list(
DistanceComputer& qdis,
std::priority_queue<NodeDistFarther>& input,
std::vector<NodeDistFarther>& output,
int max_size);
// Load ground truth nearest neighbor database index
// for finding ground truth minimum termination condition.
void load_gt(long label);
// Load the thresholds about when to make predictions.
// This is related to the choice of intermediate search result features.
void load_thresh(long thresh);
// Load the prediction model.
void load_model(char *file);
};
/**************************************************************
* Auxiliary structures
**************************************************************/
/// set implementation optimized for fast access.
struct VisitedTable {
std::vector<uint8_t> visited;
int visno;
explicit VisitedTable(int size)
: visited(size), visno(1) {}
/// set flog #no to true
void set(int no) {
visited[no] = visno;
}
/// get flag #no
bool get(int no) const {
return visited[no] == visno;
}
/// reset all flags to false
void advance() {
visno++;
if (visno == 250) {
// 250 rather than 255 because sometimes we use visno and visno+1
memset(visited.data(), 0, sizeof(visited[0]) * visited.size());
visno = 1;
}
}
};
struct HNSWStats {
size_t n1, n2, n3;
size_t ndis;
size_t nreorder;
bool view;
HNSWStats() {
reset();
}
void reset() {
n1 = n2 = n3 = 0;
ndis = 0;
nreorder = 0;
view = false;
}
};
// global var that collects them all
extern HNSWStats hnsw_stats;
} // namespace faiss