forked from DataStories-UniPi/miniDB
-
Notifications
You must be signed in to change notification settings - Fork 0
/
btree.py
328 lines (274 loc) · 13.4 KB
/
btree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
'''
https://en.wikipedia.org/wiki/B%2B_tree
'''
class Node:
'''
Node abstraction. Represents a single bucket
'''
def __init__(self, b, values=[], ptrs=[],\
left_sibling=None, right_sibling=None, parent=None, is_leaf=False):
self.b = b # branching factor
self.values = values # Values (the data from the pk column)
self.ptrs = ptrs # ptrs (the indexes of each datapoint or the index of another bucket)
self.left_sibling = left_sibling # the index of a buckets left sibling
self.right_sibling = right_sibling # the index of a buckets right sibling
self.parent = parent # the index of a buckets parent
self.is_leaf = is_leaf # a boolean value signaling whether the node is a leaf or not
def find(self, value, return_ops=False):
'''
Returns the index of the next node to search for a value if the node is not a leaf (a ptrs of the available ones).
If it is a leaf (we have found the appropriate node), it returns nothing.
value: the value that we are searching for
return_ops: set to True if you want to use the number of operations (for benchmarking)
'''
ops = 0 # number of operations (<>= etc). Used for benchmarking
if self.is_leaf: #
return
# for each value in the node, if the user supplied value is smaller, return the btrees value index
# else (no value in the node is larger) return the last ptr
for index, existing_val in enumerate(self.values):
ops+=1
if value<existing_val:
if return_ops:
return self.ptrs[index], ops
else:
return self.ptrs[index]
if return_ops:
return self.ptrs[-1], ops
else:
return self.ptrs[-1]
def insert(self, value, ptr, ptr1=None):
'''
Insert the value and its ptr/s to the appropriate place (node wise).
User can input two ptrs to insert to a non leaf node.
value: the value that we are inserting to the node
ptr: the ptr of the inserted value (its index for example)
ptr1: the 2nd ptr (in case the user wants to insert into a nonleaf node for ex)
'''
# for each value in the node, if the user supplied value is smaller, insert the value and its ptr into that position
# if a second ptr is provided, insert it right next to the 1st ptr
# else (no value in the node is larger) append value and ptr/s to the back of the list.
for index, existing_val in enumerate(self.values):
if value<existing_val:
self.values.insert(index, value)
self.ptrs.insert(index+1, ptr)
if ptr1:
self.ptrs.insert(index+1, ptr1)
return
self.values.append(value)
self.ptrs.append(ptr)
if ptr1:
self.ptrs.append(ptr1)
def show(self):
'''
print the node's value and important info
'''
print('Values', self.values)
print('ptrs', self.ptrs)
print('Parent', self.parent)
print('LS', self.left_sibling)
print('RS', self.right_sibling)
class Btree:
def __init__(self, b):
'''
The tree abstraction.
'''
self.b = b # branching factor
self.nodes = [] # list of nodes. Every new node is appended here
self.root = None # the index of the root node
def insert(self, value, ptr, rptr=None):
'''
Insert the value and its ptr/s to the appropriate node (node-level insertion is covered by the node object).
User can input two ptrs to insert to a non leaf node.
'''
# if the tree is empty, add the first node and set the root index to 0 (the only node's index)
if self.root is None:
self.nodes.append(Node(self.b, is_leaf=True))
self.root = 0
# find the index of the node that the value and its ptr/s should be inserted to (_search)
index = self._search(value)
# insert to it
self.nodes[index].insert(value,ptr)
# if the node has more elements than b-1, split the node
if len(self.nodes[index].values)==self.b:
self.split(index)
def _search(self, value, return_ops=False):
'''
Returns the index of the node that the given value exist or should exist in.
value: the value that we are searching for
return_ops: set to True if you want to use the number of operations (for benchmarking)
'''
ops=0 # number of operations (<>= etc). Used for benchmarking
#start with the root node
node = self.nodes[self.root]
# while the node that we are searching in is not a leaf
# keep searching
while not node.is_leaf:
idx, ops1 = node.find(value, return_ops=True)
node = self.nodes[idx]
ops += ops1
# finally return the index of the appropriate node (and the ops if you want to)
if return_ops:
return self.nodes.index(node), ops
else:
return self.nodes.index(node)
def split(self, node_id):
'''
Split the node with index=node_id
'''
# fetch the node to be split
node = self.nodes[node_id]
# the value that will be propagated to the parent is the middle one.
new_parent_value = node.values[len(node.values)//2]
if node.is_leaf:
# if the node is a leaf, the parent value should be a part of the new node (right)
# Important: in a b+tree, every value should appear in a leaf
right_values = node.values[len(node.values)//2:]
right_ptrs = node.ptrs[len(node.ptrs)//2:]
# create the new node with the right half of the old nodes values and ptrs (including the middle ones)
right = Node(self.b, right_values, right_ptrs,\
left_sibling=node_id, right_sibling=node.right_sibling, parent=node.parent, is_leaf=node.is_leaf)
# since the new node (right) will be the next one to be appended to the nodes list
# its index will be equal to the length of the nodes list.
# Thus we set the old nodes (now left) right sibling to the right nodes future index (len of nodes)
if node.right_sibling is not None:
self.nodes[node.right_sibling].left_sibling = len(self.nodes)
node.right_sibling = len(self.nodes)
else:
# if the node is not a leaf, the parent value shoudl NOT be part of the new node
right_values = node.values[len(node.values)//2+1:]
right_ptrs = node.ptrs[len(node.ptrs)//2:]
# if nonleafs should be connected change the following two lines and add siblings
right = Node(self.b, right_values, right_ptrs,\
parent=node.parent, is_leaf=node.is_leaf)
# make sure that a non leaf node doesnt have a parent
node.right_sibling = None
# the right node's kids should have him as a parent (if not all nodes will have left as parent)
for ptr in right_ptrs:
self.nodes[ptr].parent = len(self.nodes)
# old node (left) keeps only the first half of the values/ptrs
node.values = node.values[:len(node.values)//2]
node.ptrs = node.ptrs[:len(node.ptrs)//2]
# append the new node (right) to the nodes list
self.nodes.append(right)
# If the new nodes have no parents (a new level needs to be added
if node.parent is None:
# its the root that is split
# new root contains the parent value and ptrs to the two recently split nodes
parent = Node(self.b, [new_parent_value], [node_id, len(self.nodes)-1]\
,parent=node.parent, is_leaf=False)
# set root, and parent of split celss to the index of the new root node (len of nodes-1)
self.nodes.append(parent)
self.root = len(self.nodes)-1
node.parent = len(self.nodes)-1
right.parent = len(self.nodes)-1
else:
# insert the parent value to the parent
self.nodes[node.parent].insert(new_parent_value, len(self.nodes)-1)
# check whether the parent needs to be split
if len(self.nodes[node.parent].values)==self.b:
self.split(node.parent)
def show(self):
'''
Show important info for each node (sort the by level - root first, then left to right)
'''
nds = []
nds.append(self.root)
for ptr in nds:
if self.nodes[ptr].is_leaf:
continue
nds.extend(self.nodes[ptr].ptrs)
for ptr in nds:
print(f'## {ptr} ##')
self.nodes[ptr].show()
print('----')
def plot(self):
## arrange the nodes top to bottom left to right
nds = []
nds.append(self.root)
for ptr in nds:
if self.nodes[ptr].is_leaf:
continue
nds.extend(self.nodes[ptr].ptrs)
# add each node and each link
g = 'digraph G{\nforcelabels=true;\n'
for i in nds:
node = self.nodes[i]
g+=f'{i} [label="{node.values}"]\n'
if node.is_leaf:
continue
# if node.left_sibling is not None:
# g+=f'"{node.values}"->"{self.nodes[node.left_sibling].values}" [color="blue" constraint=false];\n'
# if node.right_sibling is not None:
# g+=f'"{node.values}"->"{self.nodes[node.right_sibling].values}" [color="green" constraint=false];\n'
#
# g+=f'"{node.values}"->"{self.nodes[node.parent].values}" [color="red" constraint=false];\n'
else:
for child in node.ptrs:
g+=f'{child} [label="{self.nodes[child].values}"]\n'
g+=f'{i}->{child};\n'
g +="}"
try:
from graphviz import Source
src = Source(g)
src.render('bplustree', view=True)
except ImportError:
print('"graphviz" package not found. Writing to graph.gv.')
with open('graph.gv','w') as f:
f.write(g)
def find(self, operator, value):
'''
Return ptrs of elements where btree_value"operator"value.
Important, the user supplied "value" is the right value of the operation. That is why the operation are reversed below.
The left value of the op is the btree value.
'''
results = []
# find the index of the node that the element should exist in
leaf_idx, ops = self._search(value, True)
target_node = self.nodes[leaf_idx]
if operator == '==':
# if the element exist, append to list, else pass and return
try:
results.append(target_node.ptrs[target_node.values.index(value)])
# print('Found')
except:
# print('Not found')
pass
# for all other ops, the code is the same, only the operations themselves and the sibling indexes change
# for > and >= (btree value is >/>= of user supplied value), we return all the right siblings (all values are larger than current cell)
# for < and <= (btree value is </<= of user supplied value), we return all the left siblings (all values are smaller than current cell)
if operator == '>':
for idx, node_value in enumerate(target_node.values):
ops+=1
if node_value > value:
results.append(target_node.ptrs[idx])
while target_node.right_sibling is not None:
target_node = self.nodes[target_node.right_sibling]
results.extend(target_node.ptrs)
if operator == '>=':
for idx, node_value in enumerate(target_node.values):
ops+=1
if node_value >= value:
results.append(target_node.ptrs[idx])
while target_node.right_sibling is not None:
target_node = self.nodes[target_node.right_sibling]
results.extend(target_node.ptrs)
if operator == '<':
for idx, node_value in enumerate(target_node.values):
ops+=1
if node_value < value:
results.append(target_node.ptrs[idx])
while target_node.left_sibling is not None:
target_node = self.nodes[target_node.left_sibling]
results.extend(target_node.ptrs)
if operator == '<=':
for idx, node_value in enumerate(target_node.values):
ops+=1
if node_value <= value:
results.append(target_node.ptrs[idx])
while target_node.left_sibling is not None:
target_node = self.nodes[target_node.left_sibling]
results.extend(target_node.ptrs)
# print the number of operations (usefull for benchamrking)
print(f'With BTree -> {ops} comparison operations')
return results