-
Notifications
You must be signed in to change notification settings - Fork 1
/
rlcsa.h
397 lines (303 loc) · 14.8 KB
/
rlcsa.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
#ifndef RLCSA_H
#define RLCSA_H
#include <fstream>
#include <vector>
#include "bits/deltavector.h"
#include "bits/rlevector.h"
#include "bits/nibblevector.h"
#include "bits/succinctvector.h"
#include "sasamples.h"
#include "alphabet.h"
#include "lcpsamples.h"
#include "misc/parameters.h"
#include "sampler.h"
#include "suffixarray.h"
namespace CSA
{
const std::string PSI_EXTENSION = ".psi";
const std::string ARRAY_EXTENSION = ".rlcsa.array";
const std::string SA_SAMPLES_EXTENSION = ".rlcsa.sa_samples";
const std::string PARAMETERS_EXTENSION = ".rlcsa.parameters";
const std::string DOCUMENT_EXTENSION = ".rlcsa.docs";
const std::string LCP_SAMPLES_EXTENSION = ".lcp_samples";
const std::string PLCP_EXTENSION = ".plcp";
const parameter_type RLCSA_BLOCK_SIZE = parameter_type("RLCSA_BLOCK_SIZE", 32);
const parameter_type SAMPLE_RATE = parameter_type("SAMPLE_RATE", 128);
const parameter_type SUPPORT_LOCATE = parameter_type("SUPPORT_LOCATE", 1);
const parameter_type SUPPORT_DISPLAY = parameter_type("SUPPORT_DISPLAY", 1);
const parameter_type WEIGHTED_SAMPLES = parameter_type("WEIGHTED_SAMPLES", 0);
#ifdef USE_NIBBLE_VECTORS
typedef NibbleVector PsiVector;
#else
typedef RLEVector PsiVector;
#endif
#ifdef SUCCINCT_LCP_VECTOR
typedef SuccinctVector PLCPVector;
#else
typedef RLEVector PLCPVector;
#endif
class RLCSA
{
friend class RLCSABuilder;
public:
//--------------------------------------------------------------------------
// CONSTRUCTION
//--------------------------------------------------------------------------
static const usint ENDPOINT_BLOCK_SIZE = 16;
explicit RLCSA(const std::string& base_name, bool print = false);
/*
Build RLCSA for multiple sequences, treating each \0 as an end marker.
There must be nonzero characters between the \0s, and the last character must also be \0.
FIXME Crashes if bytes >= 4 GB.
*/
RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, usint threads, bool delete_data);
/*
Same as before, but this time we build the suffix array for the ranks array.
*/
RLCSA(uchar* data, usint* ranks, usint bytes, usint block_size, usint sa_sample_rate, usint threads, bool delete_data);
/*
Build an RLCSA for a single sequence, and use the samples given by sampler, if not 0.
The sequence does not contain an end marker.
FIXME This cannot be merged. Maybe should enforce it somehow.
*/
RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, usint threads, Sampler* sampler, bool delete_data);
// Destroys contents of index and increment.
RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size, usint threads = 1);
~RLCSA();
void writeTo(const std::string& base_name) const;
inline bool isOk() const { return this->ok; }
//--------------------------------------------------------------------------
// QUERIES
//--------------------------------------------------------------------------
// These queries use SA ranges.
// Returns the closed range containing the matches.
pair_type count(const std::string& pattern) const;
// Used when merging CSAs.
void reportPositions(uchar* data, usint length, usint* positions) const;
// Returns SA[range]. User must free the buffer. Latter version uses buffer provided by the user.
// Direct locate means locating one position at a time.
// Steps means that the returned values are the number of Psi steps taken, not SA values.
usint* locate(pair_type range, bool direct = false, bool steps = false) const;
usint* locate(pair_type range, usint* data, bool direct = false, bool steps = false) const;
// Returns SA[index].
usint locate(usint index, bool steps = false) const;
// Given SA[index], returns index.
usint inverseLocate(usint location) const;
// Returns T^{sequence}[range]. User must free the buffer.
// Third version uses buffer provided by the user.
uchar* display(usint sequence, bool include_end_marker = false) const;
uchar* display(usint sequence, pair_type range) const;
uchar* display(usint sequence, pair_type range, uchar* data) const;
// Displays the intersection of T[position - context, position + len + context - 1]
// and T^{getSequenceForPosition(position)}.
// This is intended for displaying an occurrence of a pattern of length 'len'
// with 'context' extra characters on both sides.
// The actual length of the returned string is written into result_length.
uchar* display(usint position, usint len, usint context, usint& result_length) const;
// Returns the actual length of the prefix. User must provide the buffer.
usint displayPrefix(usint sequence, usint len, uchar* data) const;
// Returns at most max_len characters starting from T[SA[index]].
// User must provide the buffer. Returns the number of characters in buffer.
usint displayFromPosition(usint index, usint max_len, uchar* data) const;
// Get the range of SA values for the sequence identified by
// a sequence number or a SA value.
pair_type getSequenceRange(usint number) const;
pair_type getSequenceRangeForPosition(usint value) const;
// Get the sequence number for given SA value(s).
// The returned array is the same as the parameter.
usint getSequenceForPosition(usint value) const;
usint* getSequenceForPosition(usint* value, usint length) const;
// Changes SA value to (sequence, offset).
pair_type getRelativePosition(usint value) const;
// Changes (sequence, offset) to SA value (not the same as SA index; it's a
// position in the unified coordinate space over all texts).
usint getAbsolutePosition(pair_type position) const;
// Returns the BWT of the collection including end of sequence markers.
uchar* readBWT() const;
uchar* readBWT(pair_type range) const;
// Returns the number of equal letter runs in the BWT. Runs consisting of end markers are ignored.
usint countRuns() const;
// Return the suffix array for a particular sequence. User must free the suffix array.
SuffixArray* getSuffixArrayForSequence(usint number) const;
//--------------------------------------------------------------------------
// SUPPORT FOR EXTERNAL MODULES: POSITIONS
//--------------------------------------------------------------------------
// The return values of these functions are BWT indexes/ranges.
inline usint psi(usint sa_index) const
{
if(sa_index >= this->data_size)
{
return this->data_size + this->number_of_sequences;
}
usint c = this->getCharacter(sa_index);
return this->psiUnsafe(sa_index, c);
}
// This version returns a run.
inline pair_type psi(usint sa_index, usint max_length) const
{
if(sa_index >= this->data_size)
{
return pair_type(this->data_size + this->number_of_sequences, 0);
}
usint c = this->getCharacter(sa_index);
PsiVector::Iterator iter(*(this->array[c]));
return iter.selectRun(sa_index - this->alphabet->cumulative(c), max_length);
}
inline usint LF(usint sa_index, usint c) const
{
if(c >= CHARS)
{
return this->data_size + this->number_of_sequences;
}
if(this->array[c] == 0)
{
if(c < this->alphabet->getFirstChar()) { return this->number_of_sequences - 1; }
return this->alphabet->cumulative(c) + this->number_of_sequences - 1;
}
this->convertToBWTIndex(sa_index);
PsiVector::Iterator iter(*(this->array[c]));
return this->LF(sa_index, c, iter);
}
inline void convertToSAIndex(usint& bwt_index) const { bwt_index -= this->number_of_sequences; }
inline void convertToBWTIndex(usint& sa_index) const { sa_index += this->number_of_sequences; }
inline bool hasImplicitSample(usint bwt_index) const
{
return (bwt_index < this->number_of_sequences);
}
inline usint getImplicitSample(usint bwt_index) const
{
DeltaVector::Iterator iter(*(this->end_points));
return iter.select(bwt_index) + 1;
}
// This is an unsafe function that returns a character value.
inline usint getCharacter(usint sa_index) const
{
return this->alphabet->charAt(sa_index);
}
//--------------------------------------------------------------------------
// SUPPORT FOR EXTERNAL MODULES: RANGES
//--------------------------------------------------------------------------
pair_type getSARange() const;
pair_type getBWTRange() const;
pair_type getCharRange(usint c) const;
void convertToBWTRange(pair_type& sa_range) const;
void convertToSARange(pair_type& bwt_range) const;
void convertToSARange(std::vector<pair_type>& bwt_ranges) const;
// This is an unsafe function that does not check its parameters.
pair_type LF(pair_type bwt_range, usint c) const;
// User must free the returned vector.
std::vector<usint>* locateRange(pair_type range) const;
std::vector<usint>* locateRanges(std::vector<pair_type>& ranges) const;
//--------------------------------------------------------------------------
// REPORTING
//--------------------------------------------------------------------------
inline bool supportsLocate() const { return this->support_locate; }
inline bool supportsDisplay() const { return this->support_display; }
inline usint getSize() const { return this->data_size; }
inline usint getTextSize() const { return this->end_points->getSize(); }
inline usint getNumberOfSequences() const { return this->number_of_sequences; }
inline usint getBlockSize() const { return this->array[this->alphabet->getFirstChar()]->getBlockSize(); }
// Returns the size of the data structure.
usint reportSize(bool print = false) const;
void printInfo() const;
//--------------------------------------------------------------------------
// LCP EXPERIMENTS
//--------------------------------------------------------------------------
// Optimized version:
// - Interleaves main loop with computing irreducible values.
// - Encodes maximal runs from a true local maximum to a true local minimum.
PLCPVector* buildPLCP(usint block_size) const; //, usint threads = 1) const;
// Returns the number of samples. sampled_values will be a pointer to the samples.
usint sampleLCP(usint sample_rate, pair_type*& sampled_values, bool report = false) const;
usint lcp(usint sa_index, const LCPSamples& lcp_samples, bool steps = false) const;
// Calculate LCP[index] directly.
usint lcpDirect(usint sa_index) const;
// Writes PLCP[start] to PLCP[stop - 1].
inline void encodePLCPRun(PLCPVector::Encoder& plcp, usint start, usint stop, usint first_val) const
{
plcp.addRun(2 * start + first_val, stop - start);
// std::cerr << "(" << start << ", " << stop << ", " << first_val << ")" << std::endl;
}
//--------------------------------------------------------------------------
// INTERNAL VARIABLES
//--------------------------------------------------------------------------
protected:
bool ok;
usint data_size;
PsiVector* array[CHARS];
Alphabet* alphabet;
SASamples* sa_samples;
bool support_locate, support_display;
// A sequence starts at the next multiple of sample_rate after the end of previous sequence.
usint sample_rate;
usint number_of_sequences;
DeltaVector* end_points;
//--------------------------------------------------------------------------
// INTERNAL VERSIONS OF QUERIES
//--------------------------------------------------------------------------
// Locates the given SA range, one item at a time, storing results in the
// array given by the data pointer. Finds actual locations if steps is
// false, or the number of steps needed to determine each location if steps
// is true.
void directLocate(pair_type range, usint* data, bool steps) const;
// Locates the given BWT position, returning either the actual location or
// the steps needed to find it, depending on the value of steps.
usint directLocate(usint index, bool steps) const;
void locateUnsafe(pair_type range, usint* data, bool steps) const;
bool processRun(pair_type run, usint* data, usint* offsets, bool* finished, PsiVector::Iterator** iters, bool steps) const;
void displayUnsafe(pair_type range, uchar* data, bool get_ranks = false, usint* ranks = 0) const;
void locateRange(pair_type range, std::vector<usint>& vec) const;
// Given a sequence position, return the corresponding BWT position.
usint directInverseLocate(usint location) const;
//--------------------------------------------------------------------------
// INTERNAL VERSIONS OF BASIC OPERATIONS
//--------------------------------------------------------------------------
inline usint psi(usint index, PsiVector::Iterator** iters) const
{
usint c = this->getCharacter(index);
return iters[c]->select(index - this->alphabet->cumulative(c));
}
inline pair_type psi(usint index, usint max_length, PsiVector::Iterator** iters) const
{
usint c = this->getCharacter(index);
return iters[c]->selectRun(index - this->alphabet->cumulative(c), max_length);
}
// Returns psi(index), assuming the suffix of rank index begins with c.
inline usint psiUnsafe(usint index, usint c) const
{
PsiVector::Iterator iter(*(this->array[c]));
return this->psiUnsafe(index, c, iter);
}
// As above, but with a given iterator.
inline usint psiUnsafe(usint index, usint c, PsiVector::Iterator& iter) const
{
return iter.select(index - this->alphabet->cumulative(c));
}
// As above, but returns the next value of psi.
inline usint psiUnsafeNext(usint c, PsiVector::Iterator& iter) const
{
return iter.selectNext();
}
inline usint LF(usint bwt_index, usint c, PsiVector::Iterator& iter) const
{
return this->alphabet->cumulative(c) + this->number_of_sequences + iter.rank(bwt_index) - 1;
}
//--------------------------------------------------------------------------
// INTERNAL STUFF
//--------------------------------------------------------------------------
// Creates an array of iterators for every vector in this->array.
PsiVector::Iterator** getIterators() const;
void deleteIterators(PsiVector::Iterator** iters) const;
void mergeEndPoints(RLCSA& index, RLCSA& increment);
void mergeSamples(RLCSA& index, RLCSA& increment, usint* positions);
void buildCharIndexes(usint* distribution);
void buildRLCSA(uchar* data, usint* ranks, usint bytes, usint block_size, usint threads, Sampler* sampler, bool multiple_sequences, bool delete_data);
// Removes structures not necessary for merging.
void strip();
// These are not allowed.
RLCSA();
RLCSA(const RLCSA&);
RLCSA& operator = (const RLCSA&);
};
} // namespace CSA
#endif // RLCSA_H