-
Notifications
You must be signed in to change notification settings - Fork 4
/
index_build_utils.hpp
125 lines (111 loc) · 3.92 KB
/
index_build_utils.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#pragma once
#include "index_types.hpp"
#include "util.hpp"
#include "succinct/mapper.hpp"
#include "clustered_index_types.hpp"
namespace ds2i {
/// Running tally of processed sequences and postings.
///
/// Every call to done_sequence() bumps the sequence counter by one and the
/// posting counter by the supplied amount; each time the sequence counter
/// reaches a multiple of 1,000,000 the current totals are written through
/// logger().
struct progress_logger {
    size_t sequences = 0;
    size_t postings = 0;

    // Kept user-provided (not defaulted) so the type's construction
    // semantics stay exactly as before; the counters start at zero via the
    // member initializers above.
    progress_logger() {}

    /// Writes the current totals as a single line through logger().
    void log()
    {
        logger() << "Processed " << sequences << " sequences, "
                 << postings << " postings" << std::endl;
    }

    /// Records one finished sequence that contributed `n` postings.
    void done_sequence(size_t n)
    {
        ++sequences;
        postings += n;

        // Report only on exact multiples of one million sequences.
        const bool at_report_boundary = (sequences % 1000000 == 0);
        if (at_report_boundary) {
            log();
        }
    }
};
/// Generic overload: looks up the sizes of the document and frequency
/// sequences of `coll` in its succinct size tree.
///
/// The tree is obtained with succinct::mapper::size_tree_of() and dump() is
/// called on it before it is inspected. Only the root's direct children are
/// examined.
///
/// @param coll        index whose size tree is queried
/// @param docs_size   receives the size of the child named
///                    "m_docs_sequences"; left untouched if no such child
/// @param freqs_size  receives the size of the child named
///                    "m_freqs_sequences"; left untouched if no such child
template <typename SUMU_index>
void get_size_stats(SUMU_index& coll,
                    uint64_t& docs_size, uint64_t& freqs_size)
{
    auto tree = succinct::mapper::size_tree_of(coll);
    tree->dump();

    for (auto const& child : tree->children) {
        auto const& label = child->name;
        if (label == "m_docs_sequences") {
            docs_size = child->size;
            continue;
        }
        if (label == "m_freqs_sequences") {
            freqs_size = child->size;
        }
    }
}
/// Overload for clustered_freq_index: the size of the reference sequences
/// is counted as part of the document size.
///
/// The tree is obtained with succinct::mapper::size_tree_of() and dump() is
/// called on it before it is inspected. Only the root's direct children are
/// examined.
///
/// @param coll        clustered index whose size tree is queried
/// @param docs_size   set to the size of "m_docs_sequences" (if present;
///                    otherwise the incoming value is kept) and then
///                    increased by the size of every "m_refs_sequences"
///                    child
/// @param freqs_size  receives the size of the child named
///                    "m_freqs_sequences"; left untouched if no such child
template <typename FreqsSequence>
void get_size_stats(clustered_freq_index<FreqsSequence>& coll,
                    uint64_t& docs_size, uint64_t& freqs_size)
{
    auto size_tree = succinct::mapper::size_tree_of(coll);
    size_tree->dump();

    // The reference-sequence size is gathered in a local and added after
    // the loop. Adding it to docs_size while iterating would make the
    // result depend on child order: a "m_docs_sequences" child visited
    // after "m_refs_sequences" would overwrite the refs contribution.
    uint64_t refs_size = 0;
    for (auto const& node : size_tree->children) {
        if (node->name == "m_docs_sequences") {
            docs_size = node->size;
        } else if (node->name == "m_freqs_sequences") {
            freqs_size = node->size;
        } else if (node->name == "m_refs_sequences") {
            refs_size += node->size;
        }
    }
    docs_size += refs_size;
}
/// Overload for freq_index: reads the document and frequency sequence sizes
/// from the index's succinct size tree.
///
/// dump() is called on the tree first; afterwards the root's direct
/// children are scanned by name.
///
/// @param coll        index whose size tree is queried
/// @param docs_size   assigned from the child named "m_docs_sequences";
///                    not written when that child is absent
/// @param freqs_size  assigned from the child named "m_freqs_sequences";
///                    not written when that child is absent
template <typename DocsSequence, typename FreqsSequence>
void get_size_stats(freq_index<DocsSequence, FreqsSequence>& coll,
                    uint64_t& docs_size, uint64_t& freqs_size)
{
    auto root = succinct::mapper::size_tree_of(coll);
    root->dump();

    for (auto const& entry : root->children) {
        // A child matching the docs name is never tested against the freqs
        // name, mirroring an if / else-if chain.
        const bool is_docs = (entry->name == "m_docs_sequences");
        if (is_docs) {
            docs_size = entry->size;
            continue;
        }

        const bool is_freqs = (entry->name == "m_freqs_sequences");
        if (is_freqs) {
            freqs_size = entry->size;
        }
    }
}
/// Overload for block_freq_index: the size tree only provides one combined
/// figure (the child named "m_lists"), so the frequency share is obtained
/// by summing stats_freqs_size() over every element of `coll`, and the
/// document share is whatever remains.
///
/// @param coll        block index to inspect
/// @param docs_size   set to (size of "m_lists") - freqs_size; the "m_lists"
///                    size counts as 0 when no such child exists
/// @param freqs_size  set to the sum of coll[i].stats_freqs_size() for all
///                    i in [0, coll.size())
template <typename BlockCodec, bool Profile>
void get_size_stats(block_freq_index<BlockCodec, Profile>& coll,
                    uint64_t& docs_size, uint64_t& freqs_size)
{
    auto tree = succinct::mapper::size_tree_of(coll);
    tree->dump();

    uint64_t lists_size = 0;
    for (auto const& child : tree->children) {
        if (child->name == "m_lists") {
            lists_size = child->size;
        }
    }

    freqs_size = 0;
    size_t list_idx = 0;
    while (list_idx < coll.size()) {
        freqs_size += coll[list_idx].stats_freqs_size();
        ++list_idx;
    }

    // Unsigned subtraction: no check is made that lists_size >= freqs_size.
    docs_size = lists_size - freqs_size;
}
/// Reports the space usage of `coll`.
///
/// The document and frequency sizes are obtained from get_size_stats()
/// (both start at 0 in case an overload leaves one unassigned), converted
/// to bits per posting, written as two lines through logger(), and finally
/// emitted as one stats_line() record.
///
/// @param coll      collection to measure
/// @param type      value emitted under the "type" key of the stats record
/// @param postings  divisor used for the bits-per-element figures
template <typename Collection>
void dump_stats(Collection& coll,
                std::string const& type,
                uint64_t postings)
{
    uint64_t docs_size = 0;
    uint64_t freqs_size = 0;
    get_size_stats(coll, docs_size, freqs_size);

    // NOTE(review): postings == 0 is not guarded; both ratios then become a
    // floating-point division by zero.
    auto bits_per_posting = [postings](uint64_t bytes) -> double {
        return bytes * 8.0 / postings;
    };
    double bits_per_doc = bits_per_posting(docs_size);
    double bits_per_freq = bits_per_posting(freqs_size);

    logger() << "Documents: " << docs_size << " bytes, "
             << bits_per_doc << " bits per element" << std::endl;
    logger() << "Frequencies: " << freqs_size << " bytes, "
             << bits_per_freq << " bits per element" << std::endl;

    stats_line()
        ("type", type)
        ("size", docs_size + freqs_size)
        ("docs_size", docs_size)
        ("freqs_size", freqs_size)
        ("bits_per_doc", bits_per_doc)
        ("bits_per_freq", bits_per_freq)
        ;
}
}