-
Notifications
You must be signed in to change notification settings - Fork 443
/
Datedb.h
195 lines (155 loc) · 5.9 KB
/
Datedb.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
// Matt Wells, Copyright May 2005
// . format of a 16-byte datedb key
// . tttttttt tttttttt tttttttt tttttttt t = termId (48bits)
// . tttttttt tttttttt DDDDDDDD DDDDDDDD D = ~date
// DDDDDDDD DDDDDDDD ssssssss dddddddd s = ~score
// . dddddddd dddddddd dddddddd dddddd0Z d = docId (38 bits)
// . format of a 10-byte datedb half key (the 16-byte key above without its
//   6 termId bytes)
// . DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD D = ~date
// . ssssssss dddddddd dddddddd dddddddd
// . dddddddd dddddd0Z s = ~score d = docId (38 bits)
//
// SPECIAL EVENTDB KEYS. for indexing events.
//
// . format of a 16-byte "eventdb" key with termId of 0
// . for sorting/constraining events with multiple start dates
// . each start date has a "termId 0" key. "D" date is when
// the event starts. score is the eventId. this key is
// added by the Events::hashIntervals(eventId) function.
//
// . 00000000 00000000 00000000 00000000 t = termId (48bits)
// . 00000000 00000000 DDDDDDDD DDDDDDDD D = ~date (in secs after epoch)
// DDDDDDDD DDDDDDDD IIIIIIII dddddddd I = eventId
// . dddddddd dddddddd dddddddd dddddd0Z d = docId (38 bits)
// . format of a 16-byte "eventdb" key from words/phrases
// . each word/phrase of each event has one and only one key of this format.
// . this key is added by the Events::hash() function.
//
// . tttttttt tttttttt tttttttt tttttttt t = termId (48bits)
// . tttttttt tttttttt 00000000 00000000
// iiiiiiii IIIIIIII ssssssss dddddddd s = ~score, [I-i] = eventId RANGE
// . dddddddd dddddddd dddddddd dddddd0Z d = docId (38 bits)
#ifndef _DATEDB_H_
#define _DATEDB_H_
#include "Rdb.h"
#include "Conf.h"
#include "Indexdb.h"
// we define these here, NUMDOCIDBITS is in ../titledb/Titledb.h
#define NUMTERMIDBITS 48
// mask the lower 48 bits
#define TERMID_MASK (0x0000ffffffffffffLL)
#include "Titledb.h" // DOCID_MASK
// Msg5.cpp and Datedb.cpp use this
//#define MIN_TRUNC (PAGE_SIZE/6 * 4 + 6)
// keep it at LEAST 12 million to avoid disasters
#define MIN_TRUNC 12000000
// . Datedb owns one Rdb (m_rdb) and provides helpers to build and decode the
//   16-byte (key128_t) date keys whose bit layout is drawn at the top of
//   this file
// . the decoders below read the key the same way the layout comment draws it:
//   termId in bytes 10-15, ~date straddling n1/n0, ~score in byte 5 and the
//   docId in the low 40 bits shifted up by 2
// . NOTE(review): the byte-offset based decoders (getScore, getTermId) assume
//   a little-endian host -- confirm before porting
class Datedb {

 public:

	// resets rdb
	void reset();

	// sets up our m_rdb from g_conf (global conf class)
	bool init ( );

	// init the rebuild/secondary rdb, used by PageRepair.cpp
	bool init2 ( int32_t treeMem );

	bool verify ( char *coll );

	bool addColl ( char *coll, bool doVerify = true );

	bool addIndexList ( class IndexList *list ) ;

	// . make a 16-byte key from all these components
	// . since it is 16 bytes, the big bit will be set
	key128_t makeKey ( int64_t       termId   ,
			   uint32_t      date     ,
			   unsigned char score    ,
			   uint64_t      docId    ,
			   bool          isDelKey );

	// . key for (termId,date1) built with score 255, docId 0 and
	//   isDelKey set to true
	key128_t makeStartKey ( int64_t termId , uint32_t date1 ) {
		return makeKey ( termId , date1, 255 , 0LL , true );
	}

	// . key for (termId,date2) built with score 0, docId DOCID_MASK and
	//   isDelKey set to false
	key128_t makeEndKey ( int64_t termId , uint32_t date2 ) {
		return makeKey ( termId , date2, 0 , DOCID_MASK , false );
	}

	// . works on 16 byte full key or 10 byte half key
	// . the docId sits just above the two low bits of the first 8 bytes
	// . copy those 8 bytes out instead of dereferencing a casted pointer:
	//   "key" is a void* that may point at an unaligned half key, and
	//   reading it through a uint64_t* is undefined behavior
	int64_t getDocId ( void *key ) {
		uint64_t low = 0;
		gbmemcpy ( &low , (char *)key , sizeof(low) );
		return (low >> 2) & DOCID_MASK;
	}

	// . the score is stored complemented in the byte at offset 5
	unsigned char getScore ( void *key ) {
		unsigned char stored = ((unsigned char *)key)[5];
		return ~stored;
	}

	// use the very top int32_t only
	/*
	uint32_t getGroupIdFromKey ( key128_t *key ) {
		if ( g_conf.m_fullSplit )
			return g_titledb.getGroupId ( getDocId((char *)key) );
		//#ifdef SPLIT_INDEXDB
		if ( g_conf.m_indexdbSplit > 1 ) {
			uint32_t groupId =
				(((uint32_t*)key)[3]) &
				g_hostdb.m_groupMask;
			groupId >>= g_indexdb.m_groupIdShift;
			uint32_t offset = (key->n0 >> 2) &
				DOCID_OFFSET_MASK;
			return g_indexdb.m_groupIdTable [ groupId+
				(offset*g_indexdb.m_numGroups) ];
		}
		//#else
		else
			return (((uint32_t *)key)[3]) &
				g_hostdb.m_groupMask;
		//#endif
	};
	*/

	//#ifdef SPLIT_INDEXDB
	// for terms like gbdom:xyz.com that only reside in one group and
	// are not split by docid into multiple groups. reduces disk seeks
	// while spidering, cuz we use such terms for deduping and for
	// doing quotas.
	// ---> IS THIS RIGHT???? MDW
	//
	// . as written this stores through a NULL pointer before the return,
	//   so any call faults instead of yielding a group id
	// . NOTE(review): looks like a deliberate trap to catch stale
	//   callers -- confirm before removing or "fixing" it
	uint32_t getNoSplitGroupId ( key128_t *k ) {
		char *xx=NULL;*xx=0;
		return 0;
		// wtf is this? still being used?
		//return (((uint32_t *)k)[3]) & g_hostdb.m_groupMask;
		//uint32_t bgid = getBaseGroupId(k);
		//return g_indexdb.getSplitGroupId(bgid,0);
		//return bgid;
	}

	//uint32_t getBaseGroupId ( key128_t *k ) {
	//	return (((uint32_t *)k)[3]) & g_hostdb.m_groupMask;
	//}
	//#endif

	// . extract the termId from a key
	// . copies the 6 bytes at offset 10 of the key into the first 6 bytes
	//   of a zeroed int64_t
	int64_t getTermId ( key128_t *k ) {
		int64_t termId = 0LL;
		gbmemcpy ( &termId , ((char *)k) + 10 , 6 );
		return termId ;
	}

	// . the date field straddles the two 64-bit halves of the key: its
	//   upper 16 bits are the low 16 bits of n1 and its lower 16 bits are
	//   the top 16 bits of n0
	// . it is stored complemented, so flip it back before returning
	int32_t getDate ( key128_t *k ) {
		uint32_t hi16 = (uint32_t)(k->n1 & 0x000000000000ffffULL);
		uint32_t lo16 = (uint32_t)((k->n0 & 0xffff000000000000ULL) >> 48);
		uint32_t date = (hi16 << 16) | lo16;
		return ~date;
	}

	// . returns bits 8-15 of the value getDate() decodes from the key
	// . uses shift/mask rather than indexing the bytes of a local so the
	//   result does not depend on host byte order
	// . NOTE(review): presumably one end of the eventId RANGE drawn in
	//   the eventdb key layout at the top of the file -- confirm which
	int32_t getEventIdStart ( void *k ) {
		uint32_t d = getDate ( (key128_t *)k );
		return (int32_t)((d >> 8) & 0xff);
	}

	// . returns bits 0-7 of the value getDate() decodes from the key
	// . NOTE(review): presumably the other end of the eventId RANGE --
	//   confirm against the code that builds these keys
	int32_t getEventIdEnd ( void *k ) {
		uint32_t d = getDate ( (key128_t *)k );
		return (int32_t)(d & 0xff);
	}

	//RdbCache *getCache ( ) { return &m_rdb.m_cache; };

	// accessor for the underlying Rdb
	Rdb *getRdb ( ) { return &m_rdb; }

	Rdb m_rdb;

	//DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
	//DiskPageCache m_pc;
};
// declaration only: the global Datedb instance is defined outside this
// header (presumably in Datedb.cpp -- confirm)
extern class Datedb g_datedb;
//extern class Datedb g_datedb2;
#endif
// . the search-within operator "|"
// - termlists are sorted by score so that when merging 2 termlists
// we can stop when we get the first 10 docIds that have both terms and
// we are certain that they are the top 10 highest scoring
// - but search within says to disregard the scores of the first list,
// so we can still be sure we got the top 10, i guess
// - sort by date: like search-within, but every document has a date so the
// termlist is huge! we can pass a sub-date termlist, say today's
// date, and merge that one. if we get no hits then try the last 3 days'
// date termlist. we can't have one huge date termlist anyway because we
// need truncation to make the networking work.