-
Notifications
You must be signed in to change notification settings - Fork 1
/
my_tree_tools.py
512 lines (394 loc) · 17.6 KB
/
my_tree_tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
__author__ = 'yannis'
import numpy
import cPickle as pickle
class tree_node:
def __init__(self,name, parent = None):
self.name = name
self.parent = parent
self.children = set()
if isinstance(self.parent,tree_node):
self.level = self.parent.level + 1
else:
self.level = 0
class tree:
def __init__(self,root_object = None,node_lookup = None, max_depth = 0):
self.node_lookup = dict()
if root_object is None:
self.root = tree_node('ROOT')
elif isinstance(root_object,str):
self.root = tree_node(root_object)
elif isinstance(root_object,tree_node):
self.root = root_object
if node_lookup is None:
self.node_lookup[self.root.name] = self.root
else:
self.node_lookup = node_lookup
self.max_depth = max_depth
def format_input_node_object_to_tree_node(self,node_object,None_option = None):
if isinstance(node_object,str):
return self.node_lookup[node_object]
elif isinstance(node_object,tree_node):
return node_object
elif node_object is None:
return None_option
def get_total_number_of_nodes(self):
return len(self.node_lookup)
def has_node_with_name(self,name):
return self.node_lookup.has_key(name)
def get_node_by_name(self,name):
return self.node_lookup[name]
def get_node_level(self,node_object):
u = self.format_input_node_object_to_tree_node(node_object)
return u.level
def add_node(self,name,parent_object = None):
parent = self.format_input_node_object_to_tree_node(parent_object)
new_node = tree_node(name,parent)
if parent is not None:
parent.children.add(new_node)
self.node_lookup[new_node.name] = new_node
if self.max_depth<new_node.level:
self.max_depth = new_node.level
return new_node
def update_parent_of_node(self,node_object,parent_object=None):
node = self.format_input_node_object_to_tree_node(node_object)
new_parent = self.format_input_node_object_to_tree_node(parent_object)
old_parent = node.parent
if old_parent is not None:
old_parent.children.remove(node)
node.parent = new_parent
new_parent.children.add(node)
# maybe update
def get_parent(self,node_object):
if isinstance(node_object,tree_node):
return node_object.parent
elif isinstance(node_object,str):
focal_node = self.node_lookup[node_object]
return focal_node.parent
def get_parent_name(self,node_object):
parent = self.get_parent(node_object)
return parent.name
def get_children(self,node_object):
focal_node = self.format_input_node_object_to_tree_node(node_object)
return focal_node.children
def get_children_names(self,node_object):
children = self.get_children(node_object)
return [child.name for child in children]
def get_path_to_root(self,node_object):
u = self.format_input_node_object_to_tree_node(node_object)
path = list()
# by default includes self
path.append(u)
current_parent = u.parent
while current_parent is not None:
path.append(current_parent)
current_parent = current_parent.parent
return path
def get_path_to_root_as_node_names(self,u):
path_to_root = self.get_path_to_root(u)
return [check_point.name for check_point in path_to_root]
def get_path_from_to(self,node_object_u,node_object_v):
u = self.format_input_node_object_to_tree_node(node_object_u)
v = self.format_input_node_object_to_tree_node(node_object_v)
path_u = self.get_path_to_root(u)
path_v = self.get_path_to_root(v)
path_u_names = [n.name for n in path_u]
path_v_names = [n.name for n in path_v]
i=0
for y in path_u_names:
if y in path_v_names:
lu = i+1
lv = path_v_names.index(y)+1
path_to_common_ancestor_u = path_u[0:lu]
path_to_common_ancestor_v = path_v[0:lv]
return path_to_common_ancestor_u[0:-1] + path_to_common_ancestor_v[::-1]
i+=1
def get_path_from_to_as_node_names(self,node_object_u,node_object_v):
u = self.format_input_node_object_to_tree_node(node_object_u)
v = self.format_input_node_object_to_tree_node(node_object_v)
path_u = self.get_path_to_root_as_node_names(u)
path_v = self.get_path_to_root_as_node_names(v)
i=0
for y in path_u:
if y in path_v:
lu = i+1
lv = path_v.index(y)+1
path_to_common_ancestor_u = path_u[0:lu]
path_to_common_ancestor_v = path_v[0:lv]
return path_to_common_ancestor_u[0:-1] + path_to_common_ancestor_v[::-1]
i+=1
def get_node_to_node_distance(self,node_object_u,node_object_v):
return self.get_leaf_to_leaf_distance(node_object_u,node_object_v)
def get_node_to_node_distance_exp_weighted(self,node_object_u,node_object_v,alpha=1.,beta=1.):
return self.get_leaf_to_leaf_distance_exp_weighted(node_object_u,node_object_v,alpha,beta)
def get_leaf_to_leaf_distance(self,node_object_u,node_object_v):
u = self.format_input_node_object_to_tree_node(node_object_u)
v = self.format_input_node_object_to_tree_node(node_object_v)
common_ancestor_level = self.get_first_common_ancestor_level(u,v)
return u.level + v.level - 2*common_ancestor_level
def get_leaf_to_leaf_distance_exp_weighted(self,node_object_u,node_object_v,alpha=1.,beta=1.):
u = self.format_input_node_object_to_tree_node(node_object_u)
v = self.format_input_node_object_to_tree_node(node_object_v)
path_u_v = self.get_path_from_to(u,v)
depth_path = [node.level for node in path_u_v]
weighted_distances = [numpy.exp(-(alpha/beta) *d) for d in depth_path[1:-1]]
return sum(weighted_distances)
def get_first_common_ancestor(self,node_object_u,node_object_v):
u = self.format_input_node_object_to_tree_node(node_object_u)
v = self.format_input_node_object_to_tree_node(node_object_v)
path_u = self.get_path_to_root_as_node_names(u)
path_v = self.get_path_to_root_as_node_names(v)
for ancestor_name in path_u:
if ancestor_name in path_v:
ancestor_node = self.node_lookup[ancestor_name]
return ancestor_node
def get_first_common_ancestor_name(self,node_object_u,node_object_v):
ancestor_node = self.get_first_common_ancestor(node_object_u,node_object_v)
return ancestor_node.name
def get_first_common_ancestor_level(self,node_object_u,node_object_v):
ancestor_node = self.get_first_common_ancestor(node_object_u,node_object_v)
return ancestor_node.level
def get_leaves(self):
return [anode for anode in self.node_lookup.values() if len(anode.children)==0]
def get_leaf_names(self):
leaves = self.get_leaves()
return [leaf.name for leaf in leaves]
def get_number_of_leaves(self):
leaves = self.get_leaves()
return len(leaves)
def get_number_of_children(self,node_object):
focal_node = self.format_input_node_object_to_tree_node(node_object)
return focal_node.children
def get_siblings(self,node_object):
focal_node = self.format_input_node_object_to_tree_node(node_object)
if focal_node.parent is self.root:
return []
else:
siblings = [child for child in focal_node.parent.children if child is not focal_node]
return siblings
def get_sibling_names(self,node_object):
siblings = self.get_siblings(node_object)
return [sibling.name for sibling in siblings]
def get_orphans(self):
orphans = [anode for anode in self.node_lookup.values() if anode.parent is None]
return orphans
def get_orphan_names(self):
orphans = self.get_orphans()
orphan_names = [an_orphan.name for an_orphan in orphans]
return orphan_names
def calculate_node_level(self,node_object):
focal_node = self.format_input_node_object_to_tree_node(node_object)
if focal_node.parent is not None:
level = 1 + self.calculate_node_level(focal_node.parent)
else:
level = 0
focal_node.level = level
return level
def recalculate_all_node_levels(self):
leaves = self.get_leaves()
is_updated = dict()
for leaf in leaves:
is_updated[leaf.name] = False
for leaf in leaves:
if not is_updated[leaf.name]:
leaf.level = self.calculate_node_level(leaf)
is_updated[leaf.name] = True
siblings = self.get_siblings(leaf)
for asibling in siblings:
asibling.level = leaf.level
is_updated[asibling.name] = True
def get_all_number_of_children_values(self, at_level = None, ignore_leaves = True):
return numpy.array([len(anode.children) for anode in self.node_lookup.values() if ((at_level is not None and anode.level==at_level) or at_level is None) and ((ignore_leaves and len(anode.children)!=0) or not ignore_leaves)])
def get_branching_factor_distribution(self,at_level = None,ignore_leaves = True):
children_values = self.get_all_number_of_children_values(at_level,ignore_leaves)
return numpy.histogram(children_values,bins=(numpy.max(children_values) - numpy.min(children_values + 1)))
def get_average_branching_factor(self,at_level = None,ignore_leaves = True):
children_values = self.get_all_number_of_children_values(at_level,ignore_leaves)
return numpy.mean(children_values)
def get_tree_depth(self):
depth = 0
for anode in self.node_lookup.values():
if anode.level>depth:
depth = anode.level
return depth
def get_number_of_levels_below_node(self,node_object):
focal_node = self.format_input_node_object_to_tree_node(node_object)
if len(focal_node.children)==0:
return 0
else:
descendants = self.depth_first_search(focal_node)
max_level = max([node.level for node in descendants])
return max_level - focal_node.level
def depth_first_search(self,start_node_object):
def DFS(jump_node,container):
container.append(jump_node)
if len(jump_node.children)>0:
for c in jump_node.children:
DFS(c,container)
return container
start_node = self.format_input_node_object_to_tree_node(start_node_object)
nodes_traversed = DFS(start_node,[])
return nodes_traversed
def are_ancestor_descendant_pair(self,node_object1,node_object2):
node1 = self.format_input_node_object_to_tree_node(node_object1)
node2 = self.format_input_node_object_to_tree_node(node_object2)
if node1.level>node2.level:
new_node = node1
old_node = node2
elif node1.level<node2.level:
new_node = node2
old_node = node1
else:
return False
parent_node = new_node.parent
while parent_node is not None:
if parent_node == old_node:
return True
parent_node = parent_node.parent
return False
# IMPLEMENTED
# =============
# - Open tree from old version of tree class - extended constructor - DONE
# - Open tree from old version of tree class - external function - DONE
# - Added wrapper node_to_node around leaf_to_leaf so that it is more generalised - DONE
# TO IMPLEMENT
# =============
# - Store basic tree characteristics and update them only if there is any insertion/deletion
# - Examine possible code mismatches? - DONE elsewhere
# - USPTO specific: save class depth
# - get subtree given node
# - export to : file (depth first)
# - export to nested list
# - flatten tree given root node
# - read tree from a nested list
# - automatic node name generator
######## AD HOC FUNCTIONS
#
def read_old_tree_version_to_current_version(filename):
old_t = pickle.load(open(filename,'rb'))
return tree(old_t.root,old_t.node_lookup,old_t.max_depth)
#--- Debbie's tree output
def parse_Debbie_csv_based_tree_format_to_my_tree_class(filename):
t = tree()
# CLASS INDEX = 0
# CURRENT_CODE = 1
# PARENT_CODE_OF_CURRENT_CODE = 2
# DESCRIPTION
with open(filename,'r') as fp:
for aline in fp:
aline = aline.strip()
elems = aline.split(',')
current_class = elems[0]
current_node = current_class + '/' + elems[1]
parent_of_current = current_class + '/' + elems[2]
description = elems[3]
# check if current class exists in the tree if not, add it to root
if not t.has_node_with_name(current_class):
class_node = t.add_node(current_class,t.root)
else:
class_node = t.get_node_by_name(current_class)
# check if the description contains ****** if so, skip
if '******' in description:
continue
# check if current node is 000000 if so, skip
if current_node == (current_class + '/' + '000000') or current_node == (current_class + '/' + ''):
continue
# check if parent code is <empty> or 000000 if so, set parent to current class
if parent_of_current == (current_class + '/' + '000000') or parent_of_current==(current_class + '/' + ''):
parent_node = class_node
# otherwise, check if parent code exists
elif t.has_node_with_name(parent_of_current):
# if so, grab the object
parent_node = t.get_node_by_name(parent_of_current)
# otherwise, create the node with current class as parent.
else:
parent_node = t.add_node(parent_of_current,class_node)
# check if the current node has already been added to the network
if t.has_node_with_name(current_node):
# if so, update the parent with current
t.update_parent_of_node(current_node,parent_node)
else:
# if not, add as normal
t.add_node(current_node,parent_node)
t.recalculate_all_node_levels()
return t
#--- Daniel's tree output
def parse_Daniel_semicolon_based_tree_format_to_my_tree_class(filename):
t = tree()
with open(filename,'r') as fp:
for aline in fp:
aline = aline.strip()
node_names = aline.split(':')
n=0
for node_name in node_names:
if n==0:
class_name = node_name
if not t.has_node_with_name(node_name):
current_node = t.add_node(node_name)
else:
current_node = t.get_node_by_name(node_name)
else:
code_name = class_name + '/' + node_name
if not t.has_node_with_name(code_name):
current_node = t.add_node(code_name,prev_node)
else:
current_node = t.get_node_by_name(code_name)
prev_node = current_node
n+=1
return t
#---- Infomap
def convert_infomap_hierarchy_dot_tree_to_tree_string(filename):
tree_list = convert_infomap_hierarchy_dot_tree_to_tree_list(filename)
tree_string = convert_list_to_string_representation(tree_list)
return tree_string
def convert_infomap_hierarchy_dot_tree_to_tree_list(filename):
X,node_labels = read_infomap_hierarchy_dot_tree_file_contents(filename)
return parse_infomap_hierarchy_dot_tree_column(X,node_labels)
def read_infomap_hierarchy_dot_tree_file_contents(filename):
node_labels = []
max_depth = 0
with open(filename,'r') as fp:
Y = []
line_count = 0
for aline in fp:
line_count +=1
if line_count == 1:continue
aline = aline.strip()
fields = aline.split(' ')
node_name = fields[-1]
node_labels.append(node_name[1:-1])
elems = fields[0].split(':')
y = map(int,elems)
if len(y) > max_depth:
max_depth = len(y)
Y.append(y)
for y in Y:
lendiff = max_depth - len(y)
if lendiff >0:
for i in range(0,lendiff):
y.append(1)
X = numpy.array(Y)
return X,node_labels
def parse_infomap_hierarchy_dot_tree_column(X,node_labels):
current_level = []
comm_indices = X[:,0]
max_index = comm_indices[-1]
pivot2 = 0
for i in range(1,max_index+1):
pivot1 = pivot2
curr_elem = comm_indices[pivot2]
while curr_elem == i and pivot2<len(comm_indices):
pivot2 +=1
if pivot2<len(comm_indices): curr_elem = comm_indices[pivot2]
if pivot2 - pivot1 > 1:
current_level.append(parse_infomap_hierarchy_dot_tree_column(X[pivot1:pivot2,1:],node_labels[pivot1:pivot2]))
else:
current_level.append(node_labels[i-1])
if len(current_level) ==1:current_level = current_level[0]
return current_level
######## AUX
def convert_list_to_string_representation(tree_list):
tree_string = str(tree_list)
tree_string = tree_string.replace('[','(')
tree_string = tree_string.replace(']',')')
tree_string += ';'
return tree_string