-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathall-maximal-sets-lexicographic.h
193 lines (163 loc) · 7.01 KB
/
all-maximal-sets-lexicographic.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
// Copyright 2010 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// ---
// This class implements an algorithm that computes all maximal sets
// from a given input list of sets.
//
// The input list must have the following properties for the algorithm
// to behave correctly and/or efficiently:
//
// 1) Sets in the file are assumed to appear in increasing
// lexicographic order.
//
// 2) Items within a set must always appear from least to most
// frequent in a consistent order. That is if item $x$ appears less
// frequently than item $y$ within the dataset, then $x$ should always
// appear before $y$ within any set containing both items. Furthermore
// if two items $x$ and $y$ have the same frequency, then one must be
// chosen to consistently appear before the other should they both
// appear in a given set.
//
// 3) A set must not contain duplicate items.
// ---
// Author: Roberto Bayardo
//
#ifndef _ALL_MAXIMAL_SETS_LEXICOGRAPHIC_H_
#define _ALL_MAXIMAL_SETS_LEXICOGRAPHIC_H_
#include <limits>
#include <vector>
#include <utility>
#include "basic-types.h"
namespace google_extremal_sets {
class DataSourceIterator;
class SetProperties;
class AllMaximalSetsLexicographic {
public:
AllMaximalSetsLexicographic()
: max_items_in_ram_(std::numeric_limits<uint32_t>::max()),
output_mode_(ID) {
}
// Finds all maximal sets in the "data" stream. Does not assume
// ownership of the data stream. Returns false if the computation
// could not complete successfully because of a data stream error. A
// call to data->GetErrorMessage() will return a human-readable
// description of the problem.
//
// The caller must provide an estimate of the max_item_id which will
// be used to preallocate buffers. The correct output will be
// produced even if the estimates are inaccurate (provided there is
// sufficient memory for the buffers to be allocated to the
// specified size.)
//
// This method may output status & progress messages to stderr.
bool FindAllMaximalSets(DataSourceIterator* data, uint32_t max_item_id);
// To specify a bound on the number of 4-byte item ids that will be
// stored in main memory during algorithm execution. Should the
// dataset contain more items than the limit, the algorithm will
// switch to an "out of core" mode and perform multiple passes over
// the data in order to compute the output. Default is to impose
// no RAM limit.
void SetMaxItemsInRam(int max) {
max_items_in_ram_ = max;
}
// Set the output mode. Default is "ID".
void SetOutputMode(OutputModeEnum mode) {
output_mode_ = mode;
}
// Returns the number of maximal sets found by the last call to
// FindAllMaximalSets.
long MaximalSetsCount() const { return maximal_sets_count_; }
// Returns the number of itemssets encountered in the input stream.
long long InputSetsCount() const { return input_sets_count_; }
// Returns the number of seeks within the candidate list performed
// by FindAllMaximalSets.
long long CandidateSeekCount() const { return canidate_seek_count_; }
// A list of itemsets used to store candidates within the candidate
// map.
typedef std::vector<SetProperties*> CandidateList;
private:
// First method called by FindAllMaximalSets for rudimentary variable
// initialization.
void Init();
// Prepare datastructures for scanning the data beginning at the
// provided offset. Returns false if IO error encountered.
bool PrepareForDataScan(DataSourceIterator* data, off_t seek_offset);
// Scans the input data from beginning to end, and reads in a chunk
// of data to process, up to the max_items_in_ram_ limit. Returns
// false on IO error. seek_offset will contain the point at which
// scanning stopped if the max_items_in_ram_ limit was reached.
// Otherwise it is set to 0.
bool ReadNextChunk(DataSourceIterator* data, off_t* seek_offset);
// Iterates over the current chunk backwards and delete itemsets
// that are tivially subsumed based on prefix comparison.
void DeleteTriviallySubsumedCandidates();
// Compresses out the blanks left by deleting trivially subsumed
// itemsets, identifies blocks of candidates that start with the
// same item id, and builds the index that maps each item to the
// first candidate that starts with that item.
void BuildIndex();
// Delete any candidate subsumed by the given input_set.
void DeleteSubsumedCandidates(unsigned int candidate_index);
void DeleteSubsumedCandidates(const ItemSet& itemset);
// Call FoundMaximalSet for all sets that remain as candidates, and
// release them from memory. The candidate_ set will be empty
// upon return.
void DumpMaximalSets();
// Invoked for each maximal set found.
void FoundMaximalSet(const SetProperties& maximal_set);
// Deletes all candidates from the specified range that are subsumed
// by the current_set_.
void DeleteSubsumedFromRange(
CandidateList::iterator begin_range_it,
CandidateList::iterator end_range_it,
const uint32_t* current_set_it,
unsigned int depth);
// Invoked by Recurse to delete & advance over any candidates that
// are equal to the current prefix (and are hence subsumed).
void DeleteSubsumedSets(
CandidateList::iterator* begin_range_it,
CandidateList::iterator end_range_it,
unsigned int depth);
CandidateList::iterator GetNewBeginRangeIt(
CandidateList::iterator begin_range_it,
CandidateList::iterator end_range_it,
unsigned int current_item,
unsigned int depth);
CandidateList::iterator GetNewEndRangeIt(
CandidateList::iterator begin_range_it,
CandidateList::iterator end_range_it,
unsigned int current_item,
unsigned int depth);
// Stats variables.
long maximal_sets_count_;
long input_sets_count_;
long long canidate_seek_count_;
// Maps each item to a list of "candidate itemsets", each of which
// contains the item as its first entry. Itemsets in each candidate
// list appear in increasing order of cardinality, then increasing
// lexocographic order. Some entries may be NULL.
CandidateList candidates_;
// Index into candidates_. Maps each item id to the position within
// candidates_ containing the first set in the lexicographic
// ordering to follow the singleton set { item_id }.
std::vector<CandidateList::size_type> index_;
// Temporary/global variables
SetProperties* current_set_;
// Configuration options.
uint32_t items_in_ram_, max_items_in_ram_;
OutputModeEnum output_mode_;
};
} // namespace google_extremal_sets
#endif // _ALL_MAXIMAL_SETS_LEXICOGRAPHIC_H_