-
Notifications
You must be signed in to change notification settings - Fork 1
/
IndexManager.h
500 lines (444 loc) · 28.2 KB
/
IndexManager.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
/**
Project: libtpc
File name: IndexManager.h
@author valerio
@version 1.0 7/25/17.
*/
#ifndef LIBTPC_TPCINDEXREADER_H
#define LIBTPC_TPCINDEXREADER_H
#include <algorithm>
#include <cfloat>
#include <map>
#include <memory>
#include <set>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#include <lucene++/LuceneHeaders.h>
#include "CASManager.h"
#include "DataStructures.h"
namespace tpc {
namespace index {
// filesystem location and file names
static const std::string INDEX_ROOT_LOCATION = "/usr/local/textpresso/luceneindex/";
static const std::string CORPUS_COUNTER_FILENAME = "cc.cfg";

// names of the index types; INDEX_TYPES below collects all four of them
// NOTE(review): the *_cs names presumably denote the case sensitive indexes - confirm against the implementation
static const std::string DOCUMENT_INDEXNAME = "fulltext";
static const std::string SENTENCE_INDEXNAME = "sentence";
static const std::string DOCUMENT_INDEXNAME_CS = "fulltext_cs";
static const std::string SENTENCE_INDEXNAME_CS = "sentence_cs";

// numeric limits; how they are applied is decided in the implementation file
static const int MAX_HITS = 1000000;
static const int FIELD_CACHE_MIN_HITS = 30000;
static const int MAX_NUM_SENTENCES_IN_QUERY = 200;
static const int MAX_NUM_DOCIDS_IN_QUERY = 200;

static const std::set<std::string> INDEX_TYPES = {
    DOCUMENT_INDEXNAME,
    SENTENCE_INDEXNAME,
    DOCUMENT_INDEXNAME_CS,
    SENTENCE_INDEXNAME_CS
};
static const std::string SUBINDEX_NAME("subindex");

// field names used as default values for the include_* parameters of
// IndexManager::get_document_details and IndexManager::get_documents_details
static const std::set<std::string> DOCUMENTS_FIELDS_DETAILED = {
    "accession_compressed",
    "title_compressed",
    "author_compressed",
    "journal_compressed",
    "year",
    "abstract_compressed",
    "filepath",
    "corpus",
    "doc_id",
    "fulltext_compressed",
    "type_compressed",
    "fulltext_cat_compressed"
};
static const std::set<std::string> SENTENCE_FIELDS_DETAILED = {
    "sentence_id",
    "begin",
    "end",
    "sentence_compressed",
    "sentence_cat_compressed"
};
/*!
* @struct TmpConf
* @brief data structure that represents information about the temporary configuration files of an index
*
* Objects of this type are returned by IndexManager::write_tmp_conf_files and passed to
* IndexManager::process_single_file.
*
* @var <b>new_index_flag</b> location of the file which, if it exists, indicates that a new index must be created
* @var <b>index_descriptor</b> the location of the descriptor file for the index
* @var <b>tmp_dir</b> the temporary directory where the configuration is stored
*/
struct TmpConf {
// location of the "new index" flag file
std::string new_index_flag;
// location of the index descriptor file
std::string index_descriptor;
// temporary directory holding the configuration
std::string tmp_dir;
};
/*!
 * runtime error carrying a textual message
 *
 * The message is stored by std::runtime_error and returned by what().
 */
class tpc_exception : public std::runtime_error {
public:
    /*!
     * build the exception from a message
     * @param message a null-terminated string describing the error; it is copied by std::runtime_error and must
     * not be a null pointer
     */
    explicit tpc_exception(char const* const message) : std::runtime_error(message) { }

    /*!
     * @return the message passed to the constructor
     *
     * The call is forwarded to std::runtime_error, which owns the message. Forwarding to std::exception::what()
     * instead would return the generic implementation-defined text of the root class on standard libraries where
     * std::exception does not store a message.
     */
    char const* what() const noexcept override { return std::runtime_error::what(); }
};
/*!
* add, retrieve or remove documents from Textpresso index
*/
class IndexManager {
public:
IndexManager() = default;
/*!
 * build an index manager bound to the index stored at the given location
 *
 * The constructor only records its arguments; the reader map, the corpus counters and the external index
 * manager start out empty.
 * @param index_path the path to the index
 * @param read_only whether the index should be opened in read-only mode
 * @param external whether the index is external or standalone
 */
explicit IndexManager(const std::string& index_path, bool read_only = true, bool external = false)
    : readers_map(),
      index_dir(index_path),
      readonly(read_only),
      external(external),  // the parameter shadows the member: member(parameter)
      corpus_doc_counter(),
      externalIndexManager()
{
}
/*!
 * close every reader held by this manager (see close())
 *
 * Destructors are implicitly noexcept, so an exception escaping from close() would terminate the program.
 * Closing is therefore best effort here: callers that need to observe a failure must call close() themselves
 * before the object is destroyed.
 */
~IndexManager() {
    try {
        close();
    } catch (...) {
        // deliberately ignored: a destructor has no way to report the failure
    }
}
IndexManager(const IndexManager& other) {
readers_map = other.readers_map;
index_dir = other.index_dir;
readonly = other.readonly;
external = other.external;
corpus_doc_counter = other.corpus_doc_counter;
externalIndexManager = other.externalIndexManager;
};
/*!
 * copy assignment: member-wise copy of the state of another manager
 *
 * The readers previously held by this object are not closed by the assignment; only the handles are replaced.
 * @param other the manager to copy from
 * @return a reference to this object (flowing off the end of a value-returning function is undefined
 * behaviour, so the return statement is mandatory)
 */
IndexManager& operator=(const IndexManager& other) {
    if (this != &other) {
        readers_map = other.readers_map;
        index_dir = other.index_dir;
        readonly = other.readonly;
        external = other.external;
        corpus_doc_counter = other.corpus_doc_counter;
        externalIndexManager = other.externalIndexManager;
    }
    return *this;
}
/*!
 * move constructor: take over the state of another manager
 *
 * Members are listed in their declaration order, which is the order in which they are initialized.
 * @param other the manager to move from; its containers, path and external manager are moved, its flags copied
 */
IndexManager(IndexManager&& other) noexcept
    : readers_map(std::move(other.readers_map)),
      index_dir(std::move(other.index_dir)),
      readonly(other.readonly),
      external(other.external),
      corpus_doc_counter(std::move(other.corpus_doc_counter)),
      externalIndexManager(std::move(other.externalIndexManager))
{
}
/*!
 * move assignment: take over the state of another manager
 *
 * The readers previously held by this object are not closed by the assignment; only the handles are replaced.
 * @param other the manager to move from; its containers, path and external manager are moved, its flags copied
 * @return a reference to this object (flowing off the end of a value-returning function is undefined
 * behaviour, so the return statement is mandatory)
 */
IndexManager& operator=(IndexManager&& other) noexcept {
    if (this != &other) {
        readers_map = std::move(other.readers_map);
        index_dir = std::move(other.index_dir);
        readonly = other.readonly;
        external = other.external;
        corpus_doc_counter = std::move(other.corpus_doc_counter);
        externalIndexManager = std::move(other.externalIndexManager);
    }
    return *this;
}
/*!
 * call close() on every reader stored in readers_map
 *
 * Entries holding a null reader are skipped instead of being dereferenced. The entries themselves stay in the
 * map after the call. This method is also invoked by the destructor.
 */
void close() {
    for (auto &entry : readers_map) {
        if (entry.second) {
            entry.second->close();
        }
    }
}
/*!
* return the list of indexed corpora
* @return a vector of strings, representing the list of available corpora in the index
*/
static std::vector<std::string> get_available_corpora();
/*!
* return the list of additional corpora
* @return a vector of strings, representing the list of additional corpora in the index
*/
std::vector<std::string> get_additional_corpora();
/*!
* return the number of articles indexed under a specific corpus
* @param corpus the value of the corpus
* @param external whether to retrieve the number of articles per corpus from the external index
* @return the number of articles indexed under the specified corpus
*/
int get_num_articles_in_corpus(const std::string& corpus, bool external = false);
/*!
* @brief search the Textpresso index for documents matching the provided Lucene query and return summary
* information with a list of results sorted by their match score
*
* The results returned by this method contain only the internal identifier of the indexed documents and
* other summary information regarding the documents matching the provided query. To get detailed
* information for the returned internal ids, use the function IndexManager::get_documents_details passing
* the hit_documents vector in the DataStructures::SearchResults object returned by this function, or
* IndexManager::get_document_details to retrieve details for a single document
*
* Note that while the documents are sorted by score, their matched sentences, in case of sentence searches,
* are not sorted in order to obtain better performances
* @param query a query object
* @param matches_only perform a partial search that returns a Lucene internal object representing the
* collection of matches. This object can be passed to a subsequent call to this method to continue the
* search and get the complete results. This is useful to get an initial estimate of the size of the
* complete search
* @param doc_ids limit the search to a set of document ids. This is useful for sentence queries to retrieve
* the sentence ids for a set of documents obtained by a previous search without ids
* @param partialResults the results of a previous partial search. The search will be completed with the
* sentence or document scores starting from the provided matching documents
* @return the list of the documents matching the query sorted by their scores and encapsulated in a
* SearchResults object
*/
SearchResults search_documents(const Query &query, bool matches_only = false,
const std::set<std::string> &doc_ids = {},
const SearchResults& partialResults = SearchResults());
/*!
* @brief get detailed information about a document specified by a DocumentSummary object
*
* @param doc_summary the DocumentSummary object that identifies the document
* @param include_sentences_details whether to retrieve the details of the matching sentences specified in the
* DocumentSummary object
* @param include_doc_fields the list of fields to retrieve for the document. Retrieve all fields if not
* specified
* @param include_match_sentences_fields the list of fields to retrieve for the matching sentences specified in
* the DocumentSummary object
* @param exclude_doc_fields the list of fields to exclude for the document
* @param exclude_match_sentences_fields the list of fields to exclude for the matching sentences specified in
* the DocumentSummary object
* @param include_all_sentences whether to retrieve the details of all sentences in the document
* @param include_all_sentences_fields fields to be included for all sentences
* @param exclude_all_sentences_fields fields to be excluded for all sentences
* @param remove_tags whether to remove any tags (e.g., pdf tags) from the text, including the fulltext,
* matching sentences and all sentences
* @param remove_newlines whether to remove newlines and extra whitespaces from the text, including the
* fulltext, matching sentences and all sentences
* @return the detailed information of the document
*/
DocumentDetails get_document_details(const DocumentSummary &doc_summary,
bool include_sentences_details = true,
std::set<std::string> include_doc_fields = DOCUMENTS_FIELDS_DETAILED,
std::set<std::string> include_match_sentences_fields = SENTENCE_FIELDS_DETAILED,
const std::set<std::string> &exclude_doc_fields = {},
const std::set<std::string> &exclude_match_sentences_fields = {},
bool include_all_sentences = false,
std::set<std::string> include_all_sentences_fields = SENTENCE_FIELDS_DETAILED,
const std::set<std::string> &exclude_all_sentences_fields = {},
bool remove_tags = false, bool remove_newlines = false);
/*!
* @brief get detailed information for a set of documents specified by a list of DocumentSummary objects
*
* @param doc_summaries a list of DocumentSummary object that identifies the documents to be searched and,
* optionally, the list of sentences in the matching_sentences field of the document for which to retrieve
* detailed information
* @param sort_by_year whether to sort the results by year
* @param include_sentences_details whether to retrieve the details of the matching sentences specified in the
* DocumentSummary object
* @param include_doc_fields the list of fields to retrieve for the document. Retrieve all fields if not
* specified
* @param include_match_sentences_fields the list of fields to retrieve for the matching sentences specified in
* the DocumentSummary object
* @param exclude_doc_fields the list of fields to exclude for the document
* @param exclude_match_sentences_fields the list of fields to exclude for the matching sentences specified in
* the DocumentSummary object
* @param include_all_sentences whether to retrieve the details of all sentences in the document
* @param include_all_sentences_fields fields to be included for all sentences
* @param exclude_all_sentences_fields fields to be excluded for all sentences
* @param remove_tags whether to remove any tags (e.g., pdf tags) from the text, including the fulltext,
* matching sentences and all sentences
* @param remove_newlines whether to remove newlines and extra whitespaces from the text, including the
* fulltext, matching sentences and all sentences
* @return the detailed information of the documents
*/
std::vector<DocumentDetails> get_documents_details(const std::vector<DocumentSummary> &doc_summaries,
bool sort_by_year,
bool include_sentences_details = true,
std::set<std::string> include_doc_fields = DOCUMENTS_FIELDS_DETAILED,
std::set<std::string> include_match_sentences_fields = SENTENCE_FIELDS_DETAILED,
const std::set<std::string> &exclude_doc_fields = {},
const std::set<std::string> &exclude_match_sentences_fields = {},
bool include_all_sentences = false,
std::set<std::string> include_all_sentences_fields = SENTENCE_FIELDS_DETAILED,
const std::set<std::string> &exclude_all_sentences_fields = {},
bool remove_tags = false, bool remove_newlines = false);
std::set<std::string> get_words_belonging_to_category_from_document_fulltext(const std::string& fulltext,
const std::string& fulltext_cat,
const std::string& category);
// comparators for reverse (descending) sorting of document and sentence objects

/*!
 * descending order by score
 * @return true if the score of a is greater than the score of b
 */
static bool document_score_gt(const Document &a, const Document &b)
{
    const bool a_scores_higher = a.score > b.score;
    return a_scores_higher;
}
/*!
 * descending order by year, with ties on the year broken by descending score
 * @return true if a has a greater year than b or, when the years do not differ, a greater score
 */
static bool document_year_score_gt(const Document &a, const Document &b)
{
    return (a.year != b.year) ? (a.year > b.year) : (a.score > b.score);
}
/*!
 * descending order by score for sentence summaries
 * @return true if the score of a is greater than the score of b
 */
static bool sentence_greater_than(const SentenceSummary &a, const SentenceSummary &b)
{
    const bool a_scores_higher = a.score > b.score;
    return a_scores_higher;
}
/*!
* create a textpresso index from a set of cas files
* @param input_cas_dir the directory containing the cas files to be added to the index
* @param file_list optional set of strings, empty by default. NOTE(review): presumably restricts indexing to the
* listed files, with an empty set meaning every file found in input_cas_dir - confirm against the implementation
* @param max_num_papers_per_subindex max number of papers per subindex (defaults to 50000)
*/
void create_index_from_existing_cas_dir(const std::string &input_cas_dir,
const std::set<std::string>& file_list = {},
int max_num_papers_per_subindex = 50000);
/*!
* add a file to a textpresso index
* @param file_path the path to a compressed cas file
* @param max_num_papers_per_subindex max number of papers per subindex (defaults to 50000)
*/
void add_file_to_index(const std::string& file_path, int max_num_papers_per_subindex = 50000);
/*!
* remove a specific file from the index
* @param identifier the id of the file to remove, currently represented by the filepath field stored in
* lucene
*/
void remove_file_from_index(const std::string& identifier);
/*!
* update the document counters for the index and save them to file
*/
void calculate_and_save_corpus_counter();
/*!
* create an external database for sentences containing their document ids
*/
void save_all_doc_ids_for_sentences_to_db();
/*!
* create an external database for documents containing their year field
*/
void save_all_years_for_documents_to_db();
/*!
* whether the index has an external index attached
* @return true if the index has an external index attached, false otherwise
*/
bool has_external_index();
/*!
* add an external index to the main one
* @param external_idx_path the path to the external index
*/
void set_external_index(std::string external_idx_path);
/*!
* remove the external index
*/
void remove_external_index();
/*!
* get the list of additional corpora available from the external index
* @return the list of additional corpora of the external index
*/
std::vector<std::string> get_external_corpora();
private:
/*!
* create a collection of sub-readers with multiple Lucene indexes
* @param type the type of query to be performed by the subreaders
* @param case_sensitive whether to get case sensitive subreaders
* @return a collection of readers created from the Lucene indexes
*/
Lucene::Collection<Lucene::IndexReaderPtr> get_subreaders(QueryType type, bool case_sensitive = false);
/*!
* collect and return document basic information for a collection of matches obtained from a document search
* @param matches_collection the collection of documents matching the search query
* @param sort_by_year defaults to false. NOTE(review): presumably whether the returned documents are ordered by
* year instead of by score only - confirm against the implementation
* @return the list of Document objects with information related to the matching documents, encapsulated in a
* SearchResults object
*/
SearchResults read_documents_summaries(const Lucene::Collection<Lucene::ScoreDocPtr> &matches_collection,
bool sort_by_year = false);
/*!
* collect and return document information for a collection of matches obtained from a sentence search
* @param matches_collection the collection of sentences matching the search query
* @param sort_by_year defaults to false. NOTE(review): presumably whether the returned documents are ordered by
* year instead of by score only - confirm against the implementation
* @return the list of Document objects with information related to the matching sentences and their respective
* documents, encapsulated in a SearchResults object
*/
SearchResults read_sentences_summaries(const Lucene::Collection<Lucene::ScoreDocPtr> &matches_collection,
bool sort_by_year = false);
/*!
* get detailed information for the sentences of a document specified by a DocumentSummary object and update the
* respective information in the provided DocumentDetails object. Nothing is returned: the result is written
* into doc_details
* @param doc_summary a DocumentSummary object that identifies a document
* @param doc_details the DocumentDetails object to be updated with the new detailed information about the
* sentences
* @param sent_parser a Lucene query parser
* @param searcher a Lucene searcher
* @param fsel a Lucene field selector
* @param fields the set of fields to be retrieved for the sentences
* @param use_lucene_internal_ids NOTE(review): presumably whether the sentences are looked up through Lucene
* internal ids instead of a parsed query - confirm against the implementation
* @param sent_reader a Lucene multi reader. NOTE(review): presumably the reader over the sentence indexes used
* for the lookup - confirm against the implementation
*/
void update_match_sentences_details_for_document(const DocumentSummary &doc_summary,
DocumentDetails &doc_details,
Lucene::QueryParserPtr sent_parser,
Lucene::SearcherPtr searcher,
Lucene::FieldSelectorPtr fsel,
const std::set<Lucene::String> &fields,
bool use_lucene_internal_ids,
Lucene::MultiReaderPtr sent_reader);
/*!
* get detailed information for the complete sentences list for a document specified by a DocumentDetails
* object and update the respective information in that same object. Nothing is returned: the result is written
* into doc_details
* @param doc_details the DocumentDetails object that identifies the document related to the sentences to
* be retrieved and that needs to be updated with the new detailed information about the sentences. Note that
* the object must contain a non null identifier
* @param fsel a Lucene field selector
* @param fields the set of fields to be retrieved for the sentences
*/
void update_all_sentences_details_for_document(DocumentDetails &doc_details,
Lucene::FieldSelectorPtr fsel,
const std::set<Lucene::String> &fields);
static std::set<Lucene::String> compose_field_set(const std::set<std::string> &include_fields,
const std::set<std::string> &exclude_fields,
const std::set<std::string> &required_fields = {});
void update_document_details(DocumentDetails &doc_details, Lucene::String field,
Lucene::DocumentPtr doc_ptr);
std::vector<DocumentDetails> read_documents_details(const std::vector<DocumentSummary> &doc_summaries,
Lucene::QueryParserPtr doc_parser,
Lucene::SearcherPtr searcher,
Lucene::FieldSelectorPtr fsel,
const std::set<Lucene::String> &fields,
bool use_lucene_internal_ids,
Lucene::MultiReaderPtr doc_reader);
template <typename Function> void transform_document_text_fields(Function f,
std::vector<DocumentDetails> &documents)
{
for (auto &document : documents) {
if (!document.abstract.empty()) {
document.abstract = f(document.abstract);
}
if (!document.fulltext.empty()) {
document.fulltext = f(document.fulltext);
}
for (auto &sentence : document.sentences_details) {
if (!sentence.sentence_text.empty()) {
sentence.sentence_text = f(sentence.sentence_text);
}
}
document.sentences_details.erase(std::remove_if(document.sentences_details.begin(),
document.sentences_details.end(), [](const SentenceDetails &s) {return s.sentence_text.empty() || s.sentence_text == " ";}),
document.sentences_details.end());
for (auto &sentence : document.all_sentences_details) {
if (!sentence.sentence_text.empty()) {
sentence.sentence_text = f(sentence.sentence_text);
}
}
document.all_sentences_details.erase(std::remove_if(document.all_sentences_details.begin(),
document.all_sentences_details.end(), [](const SentenceDetails &s)
{
return s.sentence_text.empty() || s.sentence_text == " ";
}), document.all_sentences_details.end());
}
}
/*!
* write the temporary conf files for a subindex with the UIMA files needed
* @param index_path the output directory of the subindex
* @return a TmpConf object representing the information about the newly created files
*/
static TmpConf write_tmp_conf_files(const std::string &index_path);
/*!
* create the directory structure for a subindex
* @param index_path the path of the subindex to create
*/
static void create_subindex_dir_structure(const std::string &index_path);
/*!
* add a cas file to the index. The cas file is processed through UIMA engine to extract sentences and other
* features to be added to the index
* @param file_path the path of the cas file to be added to the index
* @param index_descriptor the index descriptor location
* @param tempDir the temp dir location
* @param update_db whether to update the db with the new entry
* @return an int whose meaning is not visible from this header. NOTE(review): presumably a status code - confirm
* against the implementation
*/
int add_cas_file_to_index(const char *file_path, std::string index_descriptor, std::string tempDir,
bool update_db);
/*!
* process a single file to be added to the index, calling the appropriate UIMA annotator
* @param filepath the path of the file
* @param first_paper whether the file is the first one to add to the subindex
* @param tmp_conf the temporary configuration file names
* @param update_db whether to update the entries in the db
* @return true if the file was valid and it has been processed correctly, false otherwise
*/
bool process_single_file(const std::string &filepath, bool &first_paper, const TmpConf &tmp_conf,
bool update_db = false);
std::string remove_document_from_index(std::string identifier, bool case_sensitive);
void remove_sentences_for_document(const std::string& doc_id, bool case_sensitive);
void add_doc_and_sentences_to_bdb(std::string identifier);
void save_corpus_counter();
void update_corpus_counter();
/*!
* load information about the number of documents indexed per corpus from file
*/
void load_corpus_counter();
int get_num_docs_in_corpus_from_index(const std::string& corpus);
// Lucene index readers held by this manager, keyed by a string; close() calls close() on each of them
std::map<std::string, Lucene::IndexReaderPtr> readers_map;
// location of the index, taken from the index_path constructor argument
std::string index_dir;
// read-only flag, taken from the read_only constructor argument
bool readonly;
// external vs standalone flag, taken from the external constructor argument
bool external;
// integer counters keyed by a string. NOTE(review): presumably corpus name -> number of indexed documents, kept
// in sync by load_corpus_counter / update_corpus_counter / save_corpus_counter - confirm
std::map<std::string, int> corpus_doc_counter;
// shared handle to another IndexManager. NOTE(review): presumably the manager of the attached external index,
// set by set_external_index and reset by remove_external_index - confirm
std::shared_ptr<IndexManager> externalIndexManager;
};
}
}
#endif //LIBTPC_TPCINDEXREADER_H