-
Notifications
You must be signed in to change notification settings - Fork 0
/
big_cluster.py
executable file
·121 lines (105 loc) · 5.92 KB
/
big_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
from unionfind_bigcluster import UFNode, UnionFind
import itertools
import time
class BigCluster:
    '''
    Clusters binary-string nodes by unioning every pair of present nodes whose
    Hamming distance is below min_spacing.

    Attributes:
    edge_endpoints: { edge_cost: {(endpoint, endpoint)} } - duplicate edges with the endpoints
        reversed are allowed; the second one is a no-op because both endpoints are already unioned
    min_spacing: only edges with cost 0 .. min_spacing-1 are generated and processed
    cluster_field: the UnionFind (from unionfind_bigcluster) holding the nodes; this class uses its
        nodes, add_node_str, find, union and component_sizes members
    '''

    def __init__(
        self,
        filename: str = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'clustering_big.txt'),
        min_spacing: int = 3,
    ):
        '''
        filename: path of the node file; by default "clustering_big.txt" in the same directory as this script
        min_spacing: the minimum spacing the final clustering must have
        '''
        self.edge_endpoints = {}
        self.min_spacing = min_spacing
        self.cluster_field = UnionFind()
        self.load_node_data(filename)

    def load_node_data(self, filename: str):
        '''
        Read the node file into self.cluster_field.
        The first line is a header whose first two whitespace-separated fields are printed as
        "<count> <bits>-bit nodes" (IndexError if it has fewer than two fields).
        Every following non-blank line is one node; spaces between its digits are removed.
        '''
        with open(filename) as fh:
            metadata = fh.readline().split()
            print('Loading ' + metadata[0] + ' ' + metadata[1] + '-bit nodes')
            for raw_node_str in fh:
                node_str = raw_node_str.strip().replace(' ', '')
                # skip blank lines (e.g. a trailing newline) so no empty-string node is registered
                if node_str:
                    self.cluster_field.add_node_str(node_str)
        print(f'Loaded {len(self.cluster_field.nodes)} nodes')

    def add_relevant_edges(self):
        '''
        Populate self.edge_endpoints with the (tail, head)s of strings not more than
        min_spacing-1 hamming distance apart
        '''
        for node_str in self.cluster_field.nodes:
            possible_neighbors = self._calculate_possible_neighbors_iter(node_str)
            present_neighbors = self._find_present_neighbors(possible_neighbors)
            for edge_cost, neighbors in enumerate(present_neighbors):
                # setdefault creates the bucket for this cost even when it ends up empty
                endpoints = self.edge_endpoints.setdefault(edge_cost, set())
                endpoints.update((node_str, neighbor_str) for neighbor_str in neighbors)

    def _calculate_possible_neighbors_iter(self, node_str: str) -> list[set]:
        '''
        node_str: a string of binary digits representing a node
        returns: A list of sets of the possible neighbors of node_str, indexed by edge cost.
            possible_neighbors[0] is {node_str} itself (zero digits changed),
            possible_neighbors[1] contains the strings with exactly 1 digit changed, and so on
            up to min_spacing-1. Empty list when min_spacing <= 0.
        raises: ValueError (from _swap_digit) if a digit that gets flipped is not '0' or '1'
        '''
        positions = range(len(node_str))
        possible_neighbors = []
        for edge_cost in range(self.min_spacing):
            neighbors = set()
            # every way of choosing edge_cost positions to flip gives one neighbor at that distance
            for flip_indices in itertools.combinations(positions, edge_cost):
                flipped = list(node_str)
                for index in flip_indices:
                    flipped[index] = _swap_digit(flipped[index])
                neighbors.add(''.join(flipped))
            possible_neighbors.append(neighbors)
        return possible_neighbors

    def _find_present_neighbors(self, possible_neighbors: list[set]) -> list[set]:
        '''
        possible_neighbors: [{possible neighbors of distance 0}, {of distance 1}, ...]
        return: in the same format, the neighbors that are actually present in self.cluster_field.nodes
        '''
        known_nodes = self.cluster_field.nodes
        return [
            {candidate for candidate in candidates if candidate in known_nodes}
            for candidates in possible_neighbors
        ]

    def get_k_min_spacing(self):
        '''
        Carry out clustering for all relevant edges, stopping once all relevant edges have been processed.
        Return the number of components remaining in self.cluster_field, which is the sought-after
        'k such that there is a k-clustering with spacing at least min_spacing'
        (taken as len(self.cluster_field.component_sizes)).
        Also works when there are no edges left to process: the current component count is returned.
        '''
        # a for loop absorbs generator exhaustion, including the "no edges at all" case
        for _cost, (u, v) in self._yield_edges():
            if self.cluster_field.find(u) != self.cluster_field.find(v):
                self.cluster_field.union(u, v)
        return len(self.cluster_field.component_sizes)

    def _yield_edges(self):
        '''
        Yield (edge_cost, (endpoint, endpoint)) for every stored edge, in increasing order of edge_cost.
        Edges are popped as they are yielded, so self.edge_endpoints is emptied by a full iteration.
        '''
        # sort the costs explicitly instead of depending on dict insertion order
        for edge_cost in sorted(self.edge_endpoints):
            edges = self.edge_endpoints[edge_cost]
            while edges:
                yield edge_cost, edges.pop()
def _swap_digit(binary_digit: str):
    '''
    binary_digit: a single character, expected to be '0' or '1'
    returns: the opposite binary digit ('0' for '1', '1' for '0')
    raises: ValueError if binary_digit is neither '0' nor '1'
    '''
    # reject anything that is not exactly one binary digit before flipping
    if binary_digit not in ('0', '1'):
        raise ValueError('non-binary string value found')
    return '1' if binary_digit == '0' else '0'
if __name__ == '__main__':
    # Script entry point: load the default input file, build the edges below the
    # minimum spacing, run the clustering, and report how long each phase took.
    # time.perf_counter() is used for the timings because it is the clock meant for
    # measuring elapsed intervals (unlike time.time(), it is not affected by system
    # clock adjustments).
    load_start = time.perf_counter()
    cluster_test = BigCluster(min_spacing=3)
    load_finish = time.perf_counter()
    print(f'Initialized clustering object in {load_finish-load_start}s')

    add_edges_start = time.perf_counter()
    cluster_test.add_relevant_edges()
    add_edges_end = time.perf_counter()
    print(f'Added relevant edges in {add_edges_end - add_edges_start}s')

    get_k_start = time.perf_counter()
    result = cluster_test.get_k_min_spacing()
    get_k_finish = time.perf_counter()
    print(f'Calculated k={result} clusters in {get_k_finish-get_k_start}s \n')