forked from gigablast/open-source-search-engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
IndexTable.h
309 lines (252 loc) · 10.7 KB
/
IndexTable.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
// Matt Wells, copyright Aug 2001
// . calls for intersecting a bunch of IndexLists to generate docIds
// . IndexLists are data-less lists (keys only)
// . each key in an IndexList is a termId/score/adultBit/docId tuple
// . we try to use as small a sublist of each IndexList as possible to avoid
// wasting network bandwidth
// . TODO: split into 2+ classes
// TODO: implement site clustering??????? in getNumResults()
// TODO: if we have in cache we can hash right into the table, but
// we must do that before blocking on something in case it disappears
// from the cache
// TODO: it is possible to get a better scoring result, even if we found
// 10 docIds in the heads of ALL the IndexLists. Because it may have
// a really high score in 3 of the IndexLists, but a low score in the
// fourth, but it's sum may be the highest of all docIds.
// TODO: the search "cell phone cable hp jornada 680", w/o quotes, should
// be quote forced anyway. Some pages will match all but "cable hp"
// so we should break that up into it's 2 terms, cable and hp
// 6912 (30%) of queries are 1 word queries 138
// 7825 (34%) of queries are 2 word queries 156
// 4512 (20%) of queries are 3 word queries 90
// 1771 ( 8%) of queries are 4 word queries 32
// 869 ( 4%) of queries are 5 word queries 17
// 391 ( 2%) of queries are 6 word queries 8
// 290 ( 1%) of queries are 7 word queries 6
// 183 ( 1%) of queries are 8 word queries (4 per second)
#ifndef _INDEXTABLE_H_
#define _INDEXTABLE_H_
#include "Query.h" // MAX_QUERY_TERMS, qvec_t
#include "Indexdb.h" // makeStartKey(), getTruncationLimit()
#include "IndexList.h" // for m_lists[]
#include "Titledb.h" // g_titledb.getTotalNumDocs()
#include "IndexReadInfo.h" // MAX_TIERS
// max # search results that can be viewed
#define MAX_RESULTS 1000
class IndexTable {
public:
// . returns false on error and sets errno
// . we now support multiple plus signs before the query term
// . start/endTermNums apply to phrase termIds only
// . allows us to set multiple bits when a phrase termId is matched
// in case the singleton was truncated, but doc has the phrase
// . if you want Default AND behaviour set requireAllTerms to true
// it is much faster, too
void init (Query *q,bool isDebug,void *logstate,bool requireAllTerms,
class TopTree *topTree );
// has init already been called?
bool isInitialized ( ) { return m_initialized; };
// sets m_positiveBits, etc.
//void prepareToAddLists ( );
// . returns false on error and sets errno
// . we assume there are "m_numTerms" lists passed in (see set() above)
void addLists_r ( IndexList lists[MAX_TIERS][MAX_QUERY_TERMS] ,
int32_t numTiers ,
int32_t numListsPerTier ,
Query *q ,
int32_t docsWanted ,
int32_t *totalListSizes ,
bool useDateLists ,
bool sortByDate ,
float sortByDateWeight );
// . these are set from calling addLists() above
// . we log all matching topDocIds if isDebug is true
int64_t *getTopDocIds ( int32_t tier ) { return m_topDocIds[tier]; };
unsigned char *getTopBitScores ( int32_t tier )
{ return m_topBitScores[tier]; };
char *getTopExplicits ( int32_t tier ) { return m_topExplicits[tier]; };
int32_t *getTopScores ( int32_t tier ) { return m_topScores[tier]; };
//uint32_t *getTopBitScores () { return m_finalTopBitScores; };
// make sure to call getTopDocIds() before calling this
int32_t getNumTopDocIds ( int32_t tier ) { return m_numTopDocIds[tier]; };
// . get how many results we have in the topDocIds list
// . if "thatIncludeAllTerms" is true, results must have all terms
// from all indexLists that we haven't read ALL of yet
int32_t getNumExactExplicitMatches ( int32_t tier ) {
return m_numExactExplicitMatches[tier];};
int32_t getNumExactImplicitMatches ( int32_t tier ) {
return m_numExactImplicitMatches[tier];};
// some generic stuff
IndexTable();
~IndexTable();
void reset();
// . call to set the m_final* member vars from the m_top* member vars
// . ALWAYS call this BEFORE calling
// getTopDocIds(), getNumTopDocIds() or getNumExactMatches()
void filterTopDocIds ( ) ;
// how long to add the last batch of lists
int64_t m_addListsTime;
uint32_t m_totalDocIds;
int32_t m_numPanics;
int32_t m_numCollisions;
int32_t m_numPtrs; // in the beginning at least
int32_t m_numLoops;
// how long to get top docIds
int64_t m_setTopDocIdsTime;
int64_t m_estimatedTotalHits;
int32_t m_numSlots;
// Msg39 needs to call these
void freeMem ( ) ;
bool alloc (IndexList lists[MAX_TIERS][MAX_QUERY_TERMS],
int32_t numTiers ,
int32_t numListsPerTier ,
int32_t docsWanted ,
bool sortByDate );
bool doRecall() { return m_doRecall; };
int32_t getNumDocsInTier ( int32_t i ) { return m_numDocsInTier[i]; };
// . sets m_scoreWeights[] based on termFreqs (IDF)
void setScoreWeights ( Query *q );
void setScoreWeights ( Query *q , bool phrase );
int32_t *getScoreWeights ( ) { return m_scoreWeights; };
private:
void addLists2_r ( IndexList lists[MAX_TIERS][MAX_QUERY_TERMS] ,
int32_t numTiers ,
int32_t numListsPerTier ,
Query *q ,
int32_t docsWanted ,
int32_t *imap ,
bool lastRound ,
int32_t numBaseLists ,
bool useDateLists ,
bool sortByDate ,
float sortByDateWeight,
int32_t *minHardCountPtr );
void hashTopDocIds2 ( uint32_t *maxDocId ,
char **docIdPtrs ,
int32_t *scores ,
qvec_t *explicitBits ,
int16_t *hardCounts ,
uint32_t mask ,
int32_t numSlots ) ;
// . used for getting which topDocId to kick out of the top list
int32_t getWeakestTopDocId ( char **topp ,
int32_t *tops ,
unsigned char *topb ,
int32_t numTop ,
unsigned char *minBitScore2 ,
int32_t *score ,
char **docIdPtr ) ;
// . get the termBits for the termId represented by this list
// . only phrases may set multiple bits
qvec_t getTermImplicitBitMask_r ( int32_t i );
// . set the m_bitScores[] array
// . "count" is the # of query term (single or phrase) bit combinations
void setBitScores ( int32_t count );
// are lists swapped?
bool m_swapped [ MAX_TIERS ] [ MAX_QUERY_TERMS ] ;
// these describe the lists associated with each m_termId
int32_t m_scoreWeights [ MAX_QUERY_TERMS ];
// for each tier we have a list of the top docids
int64_t m_topDocIds [ MAX_TIERS ] [ MAX_RESULTS ];
char *m_topDocIdPtrs [ MAX_TIERS ] [ MAX_RESULTS ];
int32_t m_topScores [ MAX_TIERS ] [ MAX_RESULTS ];
//int16_t m_topHardCounts [ MAX_TIERS ] [ MAX_RESULTS ];
unsigned char m_topBitScores [ MAX_TIERS ] [ MAX_RESULTS ];
char m_topExplicits [ MAX_TIERS ] [ MAX_RESULTS ];
int32_t m_numTopDocIds [ MAX_TIERS ] ;
int32_t m_numExactExplicitMatches [ MAX_TIERS ];
int32_t m_numExactImplicitMatches [ MAX_TIERS ];
int32_t m_numTiers;
// when filterTopDocIds() is called it uniquifies and combines
// m_topDocIds[*][] into m_finalDocIds
/* int64_t m_finalTopDocIds [ MAX_RESULTS ];
int32_t m_finalTopScores [ MAX_RESULTS ];
//uint32_t m_finalTopBitScores [ MAX_RESULTS ];
int32_t m_finalNumExactExplicitMatches ;
int32_t m_finalNumExactImplicitMatches ;
int32_t m_finalNumTopDocIds ;*/
// a reference to the query
Query *m_q;
// has init() been called?
bool m_initialized;
// are we in debug mode?
bool m_isDebug;
// for debug msgs
int32_t m_logstate;
// . did we already call m_q->setBitScores() for this query?
// . don't call it more than once
bool m_alreadySet;
bool m_doRecalc;
bool m_requireAllTerms;
char **m_topDocIdPtrs2;
int32_t *m_topScores2;
qvec_t *m_topExplicits2;
int16_t *m_topHardCounts2;
int32_t m_maxTopDocIds2;
int32_t m_numTopDocIds2;
int32_t m_nexti;
int32_t m_oldnexti;
bool m_doRecall;
// allocated memory
char *m_buf;
int32_t m_bufSize;
char *m_bufMiddle;
// for large hashtable for sortByDate
char *m_bigBuf;
int32_t m_bigBufSize;
// the imap stuff
int32_t m_imap [ MAX_QUERY_TERMS ];
int32_t m_sizes [ MAX_QUERY_TERMS ];
int32_t m_blocksize [ MAX_QUERY_TERMS ];
int32_t m_nb;
class TopTree *m_topTree;
// these are for removing component lists replaced by their compounds
int32_t *m_componentCodes;
//char *m_ignore;
//bool m_scoresSet;
int32_t m_numDocsInTier [ MAX_TIERS ] ;
};
// . get the LOWEST scoring docId from our list of top docIds
// . set "minBitScore22" and "score" for that lowest docId
// . inline this for speed
// . BUT lower docIds are considered higher scoring than higher docIds
inline int32_t IndexTable::getWeakestTopDocId ( char **topp ,
int32_t *tops ,
unsigned char *topb ,
int32_t numTop ,
unsigned char *minBitScore2 ,
int32_t *score ,
char **docIdPtr ) {
int64_t tmp = 0LL;
int32_t minScore = 0x7fffffff;
unsigned char minBitScore = 0xff;
char *minDocIdPtr = (char *)&tmp;
int32_t mini = 0;
for ( int32_t i = 0 ; i < numTop ; i++ ) {
if ( topb [i] > minBitScore ) continue;
if ( topb [i] < minBitScore ) goto gotIt;
if ( tops [i] > minScore ) continue;
if ( tops [i] < minScore ) goto gotIt;
if ( *(uint32_t *)(topp[i]+1 ) <
*(uint32_t *)(minDocIdPtr+1) ) continue;
if ( *(uint32_t *)(topp[i]+1 ) >
*(uint32_t *)(minDocIdPtr+1) ) goto gotIt;
if ( (*(unsigned char *)(topp[i] ) & 0xfc) <
(*(unsigned char *)(minDocIdPtr) & 0xfc) ) continue;
// ties should not be happening for docid, unless
// it tied with initial setting of minDocIdPtr, in that
// case we should add it!
gotIt:
minScore = tops [i];
minBitScore = topb [i];
minDocIdPtr = topp [i];
mini = i;
}
// set the callers ptrs
*minBitScore2 = minBitScore;
*score = minScore;
*docIdPtr = minDocIdPtr;
// return the lowest scoring docId's position
return mini;
}
#endif