// Categories.h
// Forked from gigablast/open-source-search-engine.
//
// Gigablast, Copyright March 2005
// Author: Javier Olivares <jolivares@gigablast.com>
//
// Stores Categories in a Hierarchy
// Based on DMOZ
//
#ifndef _CATEGORY_H_
#define _CATEGORY_H_
#include "Mem.h"
#include "HashTable.h"
// 100 MB; presumably the size used for Categories::m_rdfBuffer -- TODO confirm
#define RDFBUFFER_SIZE (1024*1024*100)
// 32 KB; this is the size of the Categories::m_rdfSmallBuffer array
#define RDFSMALLBUFFER_SIZE (32*1024)
// names of the dmoz RDF dump files (inputs, judging by the names)
#define RDFSTRUCTURE_FILE "structure.rdf.u8"
#define RDFCONTENT_FILE "content.rdf.u8"
// names of the gbdmoz data files
// NOTE(review): the code that writes/reads these is not in this header
#define STRUCTURE_OUTPUT_FILE "gbdmoz.structure.dat"
#define CONTENT_OUTPUT_FILE "gbdmoz.content.dat"
#define URL_OUTPUT_FILE "gbdmoz.urls.dat"
#define URLTEXT_OUTPUT_FILE "gbdmoz.urls.txt"
// parsing / storage limits
// . Categories::m_tagRecfer holds MAX_TAG_LEN+1 chars
// . the other limits are not referenced in this header; see their users
#define MAX_CATID_LEN 63
#define MAX_TAG_LEN 127
#define MAX_URL_CATIDS 64
#define MAX_URLTXT_SIZE 500000
#define MAX_CATIDS 96
#define MAX_CATNAME_LEN 1024
// hash table sizes
// NOTE(review): presumably slot counts for the category and url hash
// tables -- confirm against the code that allocates them
#define HASHTABLE_SIZE (1024*1024)
#define URLHASHTABLE_SIZE (10*1024*1024)
// only referenced in this header by the commented-out m_subCats array
#define MAX_SUB_CATS 1024
// sub category type codes
// NOTE(review): presumably the values stored in SubCategory::m_type --
// confirm against generateSubCats()
#define SUBCAT_LETTERBAR 10
#define SUBCAT_NARROW2 30
#define SUBCAT_SYMBOLIC2 31
#define SUBCAT_NARROW1 50
#define SUBCAT_SYMBOLIC1 51
#define SUBCAT_NARROW 70
#define SUBCAT_SYMBOLIC 71
#define SUBCAT_RELATED 90
#define SUBCAT_ALTLANG 110
// . one node of the category hierarchy
// . Categories::m_cats points to an array of these
// . NOTE(review): presumably loaded straight from disk, so do not reorder
//   or resize the fields without checking the file format
struct Category {
// id of this category
int32_t m_catid;
// id of the parent category
int32_t m_parentid;
//int16_t m_numSymParents;
//int32_t m_symParentsOffset;
// offset and length of this category's name
// NOTE(review): presumably relative to Categories::m_nameBuffer -- confirm
int32_t m_nameOffset;
int16_t m_nameLen;
// NOTE(review): presumably byte offsets of this category's record in the
// structure and content data files -- confirm
uint32_t m_structureOffset;
uint32_t m_contentOffset;
// url count for this category, returned by
// Categories::getNumUrlsFromIndex()
int32_t m_numUrls;
};
// . one entry of the hash buffer, Categories::m_catHash
// . pairs a 32-bit hash with a category index
// . NOTE(review): presumably the hash of a category path mapped to an
//   index into Categories::m_cats -- confirm in getIndexFromPath()
struct CategoryHash {
uint32_t m_hash;
int32_t m_catIndex;
};
// . variable-length sub category record
// . the fixed fields are followed in memory by the prefix string and then
//   the name string, both stored in m_buf
// . Categories::generateSubCats() stores a list of these into a SafeBuf
struct SubCategory {
	// length of the prefix string at the start of m_buf
	int32_t m_prefixLen;
	// length of the name string that follows the prefix
	int32_t m_nameLen;
	// NOTE(review): presumably one of the SUBCAT_* codes -- confirm
	char    m_type;

	// . size of this whole record in bytes
	// . fixed part plus both strings plus 2 extra bytes
	// . NOTE(review): the 2 extra bytes are presumably one terminator
	//   per string -- confirm against the code that fills m_buf
	int32_t getRecSize () {
		return (int32_t)( sizeof(*this) +
				  m_prefixLen   +
				  m_nameLen     +
				  2 );
	}

	// the prefix string sits at the very start of the trailing buffer
	char *getPrefix() {
		return m_buf;
	}

	// the name starts one byte past the end of the prefix
	char *getName () {
		char *afterPrefix = getPrefix() + m_prefixLen;
		return afterPrefix + 1;
	}

	// trailing variable-length data; must stay the last data member
	char m_buf[0];
};
// . holds a category hierarchy (the file header says it is based on DMOZ)
//   and answers lookups on it
// . categories are addressed two ways throughout this class:
//   "catid"    is the category's id (Category::m_catid)
//   "catIndex" is a position in the m_cats array
// . getIndexFromId() converts the former to the latter
class Categories {
public:
	Categories();
	~Categories();

	// read "count" bytes from file descriptor "fileid" into "buf"
	// NOTE(review): return value convention is not visible in this
	// header -- confirm in the implementation
	int32_t fileRead ( int fileid, void *buf, size_t count );

	void reset();

	// load the hierarchy from a file
	int32_t loadCategories ( char *filename );

	// . this is called by loadCategories()
	// . it will load/save the table from/to disk, too
	// . NOTE(review): the original comment said this constructs
	//   "m_adultTable", but no such member exists here; presumably it
	//   fills m_badTable -- confirm
	bool makeBadHashTable ( ) ;
	bool addUrlsToBadHashTable ( int32_t catid ) ;

	// get the index of a cat from its id
	// -1 if not found
	int32_t getIndexFromId   ( int32_t catid );
	int32_t getIndexFromPath ( char *str, int32_t strLen );
	int32_t getIdFromPath    ( char *str, int32_t strLen );

	// determine if a category should be printed RTL
	bool isIdRTLStart    ( int32_t catid );
	bool isIndexRTLStart ( int32_t catIndex );
	bool isIdRTL         ( int32_t catid );
	bool isIndexRTL      ( int32_t catIndex );

	// see if the category is Adult
	bool isIdAdultStart    ( int32_t catid );
	bool isIndexAdultStart ( int32_t catIndex );
	bool isIdAdult         ( int32_t catid );
	bool isIndexAdult      ( int32_t catIndex );

	// is it in a bad cat, like adult, gambling, online pharmacies
	bool isIdBadStart    ( int32_t catid );
	bool isIndexBadStart ( int32_t catIndex );
	bool isIdBad         ( int32_t catid );
	bool isIndexBad      ( int32_t catIndex );

	// is this url directly in a dmoz adult category?
	bool isInBadCat ( Url *u ) ;
	bool isInBadCat ( uint32_t urlHash );

	// print info of cats
	void printCats ( int32_t start, int32_t end );

	// print the path of this category into "sb"
	void printPathFromId ( SafeBuf *sb ,
			       int32_t  catid,
			       bool     raw   = false,
			       bool     isRTL = false );
	void printPathFromIndex ( SafeBuf *sb ,
				  int32_t  catIndex,
				  bool     raw   = false,
				  bool     isRTL = false );

	// print the path bread crumb links for this category into "sb"
	void printPathCrumbFromId ( SafeBuf *sb ,
				    int32_t  catid,
				    bool     isRTL = false );
	// NOTE(review): the parameter is named "catid" but, going by the
	// function name and by printPathFromIndex(), it is presumably a
	// category index -- confirm in the implementation
	void printPathCrumbFromIndex ( SafeBuf *sb ,
				       int32_t  catid,
				       bool     isRTL = false );

	bool printUrlsInTopic ( class SafeBuf *sb , int32_t catid ) ;

	// . get the title and summary for a specific url
	//   and catid
	// . title, summ and anchor are optional output buffers, each with
	//   a length out-parameter and a maximum length
	// . NOTE(review): the meaning of "justAddToTable" is not visible
	//   in this header -- see the implementation
	bool getTitleAndSummary ( char          *url,
				  int32_t        urlLen,
				  int32_t        catid,
				  char          *title          = NULL,
				  int32_t       *titleLen       = NULL,
				  int32_t        maxTitleLen    = 0,
				  char          *summ           = NULL,
				  int32_t       *summLen        = NULL,
				  int32_t        maxSummLen     = 0,
				  char          *anchor         = NULL,
				  unsigned char *anchorLen      = NULL,
				  int32_t        maxAnchorLen   = 0 ,
				  int32_t        niceness       = 0 ,
				  bool           justAddToTable = false );

	// normalize a url string
	int32_t fixUrl ( char *url, int32_t urlLen );

	// . generate sub categories for a given catid
	// . store list of SubCategories into "subCatBuf" return # stored
	// . hits disk without using threads... so kinda sucks...
	int32_t generateSubCats ( int32_t catid, SafeBuf *subCatBuf );

	// . number of urls recorded for the category at "catIndex"
	// . returns 0 if no categories are loaded
	// . also returns 0 if catIndex is outside [0,m_numCats), so the -1
	//   "not found" result of getIndexFromId() can be passed straight
	//   in without reading outside the m_cats array
	int32_t getNumUrlsFromIndex ( int32_t catIndex ) {
		if ( ! m_cats ) return 0;
		if ( catIndex < 0 || catIndex >= m_numCats ) return 0;
		return m_cats[catIndex].m_numUrls;
	}

	// creates a directory search request url
	//void createDirectorySearchUrl ( Url *url,
	int32_t createDirSearchRequest ( char    *requestBuf,
					 int32_t  requestBufSize,
					 int32_t  catid,
					 char    *hostname,
					 int32_t  hostnameLen,
					 char    *coll,
					 int32_t  collLen,
					 char    *cgi            ,//= NULL,
					 int32_t  cgiLen         ,//= 0,
					 bool     cgiFromRequest ,//= false ,
					 class HttpRequest *r );

	bool    initLangTables(void);
	bool    loadLangTables(void);
	uint8_t findLanguage(char *addr);

	// Categories
	Category *m_cats;
	int32_t   m_numCats;

	// name buffer
	char    *m_nameBuffer;
	int32_t  m_nameBufferSize;

	// symbolic parent buffer
	//int32_t *m_symParents;
	//int32_t m_numSymParents;

	// hash buffer
	CategoryHash *m_catHash;

	// full buffer
	char    *m_buffer;
	int32_t  m_bufferSize;

protected:

	// for parsing the original dmoz files
	char*   incRdfPtr       ( int32_t skip = 1 );
	int32_t rdfParse        ( char *tagName );
	int32_t rdfNextTag      ( );
	int32_t fillNextString  ( char *str, int32_t max );
	int32_t fillNextTagBody ( char *str, int32_t max );

	// rdf stream
	char    *m_rdfPtr;
	char    *m_rdfEnd;
	// an int where a std::ifstream used to be
	// NOTE(review): presumably a file descriptor -- confirm
	//std::ifstream m_rdfStream;
	int      m_rdfStream;
	char    *m_rdfBuffer;
	int32_t  m_rdfBufferSize;
	int32_t  m_currOffset;

	// static rdf buffer
	char m_rdfSmallBuffer[RDFSMALLBUFFER_SIZE];

	// tag buffer, room for MAX_TAG_LEN chars plus one more
	// NOTE(review): the name looks like a search-and-replace casualty
	// of "m_tagBuffer"; left alone here because code outside this
	// header refers to it by this name
	char    m_tagRecfer[MAX_TAG_LEN+1];
	int32_t m_tagLen;

	HashTable m_badTable;

	// sub category buffer
	//SubCategory m_subCats[MAX_SUB_CATS];
	//int32_t m_numSubCats;
};
// global instances, defined outside this header
// NOTE(review): presumably g_categories points at whichever of the two
// instances is currently in use -- confirm where they are defined
extern class Categories g_categories1;
extern class Categories g_categories2;
extern class Categories *g_categories;
#endif