diff --git a/graphlearn/abstract_graphs/directedgraphtools.py b/graphlearn/abstract_graphs/directedgraphtools.py deleted file mode 100644 index 42e430c..0000000 --- a/graphlearn/abstract_graphs/directedgraphtools.py +++ /dev/null @@ -1,35 +0,0 @@ - - - -def get_start_and_end_node(graph): - # make n the first node of the sequence - start=-1 - end=-1 - for n,d in graph.nodes_iter(data=True): - - # edge nodes cant be start or end - if 'edge' in d: - continue - - # check for start - if start == -1: - l= graph.predecessors(n) - if len(l)==0: - start = n - if len(l)==1: - if graph.node[ l[0] ]['label']=='=': - start = n - - # check for end: - if end == -1: - l= graph.neighbors(n) - if len(l)==0: - end = n - if len(l)==1: - if graph.node[ l[0] ]['label']=='=': - end = n - - # check and return - if start==-1 or end==-1: - raise Exception ('your beautiful "rna" has no clear start or end') - return start,end diff --git a/graphlearn/abstract_graphs/forgi/__init__.py b/graphlearn/abstract_graphs/forgi/__init__.py index 850cc53..578179e 100644 --- a/graphlearn/abstract_graphs/forgi/__init__.py +++ b/graphlearn/abstract_graphs/forgi/__init__.py @@ -2,4 +2,118 @@ #i took what i needed and removed unnecessary dependencies # in this file i try to make a wrapper +import networkx as nx +#import graphlearn.abstract_graphs.forgi.bulge_graph as lol +import bulge_graph as lol + + +def get_abstr_graph(struct): + # get forgi string + bg = lol.BulgeGraph() + bg.from_dotbracket(struct, None) + forgi = bg.to_bg_string() + + g=make_abstract_graph(forgi) + return g + + +def make_abstract_graph(forgi): + g=forgi_to_graph(forgi) + connect_multiloop(g) + return g + + +def forgi_to_graph(forgi): + def make_node_set(numbers): + ''' + forgi gives me stuff like: define STEM START,END,START,END .. we take the indices and output a set of node ids + ''' + numbers=map(int,numbers) + ans=set() + while len(numbers)>1: + a,b = numbers[:2] + numbers=numbers[2:] + for n in range(a-1,b): ans.add(n) # would be range(a,b+1), but the biologists count from 1 + return ans + + + def get_pairs(things): + ''' + yield consecutive pairs of multiloop names, e.g. ['m1','m3'] from: connect s3 h2 m1 m3 + ''' + current=[] + for thing in things: + if thing[0]=='m': + current.append(thing) + if len(current)==2: + yield current + current = [] + + + g=nx.Graph() + fni={} # forgi name to networkx node id + + for l in forgi.split('\n')[:-1]: + line= l.split() + if line[0] not in ['define','connect']: + continue + + # parse stuff like: define s0 1 7 65 71 + if line[0]=='define': + + # get necessary attributes for a node + label=line[1][0] + id=line[1] + myset=make_node_set(line[2:]) + node_id=len(g) + + # build a node and remember its id + g.add_node(node_id) + fni[id]=node_id + g.node[node_id].update( {'label':label, 'contracted':myset} ) + + # parse stuff like this: connect s3 h2 m1 m3 + if line[0]=='connect': + # get nx name of the first element. + hero= fni[ line[1] ] + # connect that hero to the following elements + for fn in line[2:]: + g.add_edge(hero, fni[fn]) + + # remember which pairs of multiloop pieces we are part of + # i assume that if a stack is part of 2 multiloops they appear in order .. + # this assumption may be wrong so be careful + g.node[fni[line[1]]]['multipairs']=[] + for a,b in get_pairs(line[2:]): + g.node[fni[line[1]]]['multipairs'].append( (fni[a],fni[b]) ) + return g + + +def connect_multiloop(g): + + def merge(graph, node, node2): + ''' + merge node2 into the node.
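+ (e.g. merge(g,a,b) moves b's edges and b's 'contracted' set onto a, then deletes b)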
+ input nodes are strings, + node is the king + ''' + for n in graph.neighbors(node2): + graph.add_edge(node, n) + graph.node[node]['contracted'].update(graph.node[node2]['contracted']) + graph.remove_node(node2) + + + merge_dict={} + for node,d in g.nodes(data=True): + if d['label'] == 's': + for a,b in g.node[node]['multipairs']: + # finding real names... this works by walking up the + # ladder of merge history until the root is found :) + while a not in g: + a=merge_dict[a] + while b not in g: + b=merge_dict[b] + if a==b: + continue + merge_dict[b]=a + merge(g,a,b) diff --git a/graphlearn/abstract_graphs/forgi/__init__.pyc b/graphlearn/abstract_graphs/forgi/__init__.pyc index 0d09fb3..c232bec 100644 Binary files a/graphlearn/abstract_graphs/forgi/__init__.pyc and b/graphlearn/abstract_graphs/forgi/__init__.pyc differ diff --git a/graphlearn/abstract_graphs/forgi/bulge_graph.py b/graphlearn/abstract_graphs/forgi/bulge_graph.py index c3d9248..7634f94 100644 --- a/graphlearn/abstract_graphs/forgi/bulge_graph.py +++ b/graphlearn/abstract_graphs/forgi/bulge_graph.py @@ -2,7 +2,13 @@ """bulge_graph.py: A graph representation of RNA secondary structure based on its decomposition into primitive structure types: stems, hairpins, - interior loops, multiloops, etc...""" + interior loops, multiloops, etc... + + + for eden and graphlearn we stripped forgi down to this single file. + + forgi: https://github.com/pkerpedjiev/forgi + """ __author__ = "Peter Kerpedjiev" __copyright__ = "Copyright 2012, 2013, 2014" @@ -10,16 +16,160 @@ __maintainer__ = "Peter Kerpedjiev" __email__ = "pkerp@tbi.univie.ac.at" + + + import sys import collections as col -import random import re import itertools as it - -import stuff as fus import os -import itertools as it import operator as oper +import contextlib +import random +import shutil +import tempfile as tf + + +bracket_left = "([{<ABCDEFGHIJKLMNOPQRSTUVWXYZ" +bracket_right = ")]}>abcdefghijklmnopqrstuvwxyz" + +def insert_into_stack(stack, i, j): + #print "add", i,j + k = 0 + while len(stack[k])>0 and stack[k][len(stack[k])-1] < j: + k+=1 + stack[k].append(j) + return k + +def delete_from_stack(stack, j): + #print "del", j + k = 0 + while len(stack[k])==0 or stack[k][len(stack[k])-1] != j: + k+=1 + stack[k].pop() + return k + +def pairtable_to_dotbracket(pt): + """ + Converts arbitrary pair table array (ViennaRNA format) to structure in dot bracket format. + """ + stack = col.defaultdict(list) + seen = set() + res = "" + for i in range(1, pt[0]+1): + if pt[i] != 0 and pt[i] in seen: + raise ValueError('Invalid pairtable contains duplicate entries') + + seen.add(pt[i]) + + if pt[i]==0: + res += '.' + else: + if pt[i]>i: # '(' check if we can stack it... + res += bracket_left[insert_into_stack(stack, i, pt[i])] + else: # ')' + res += bracket_right[delete_from_stack(stack, i)] + + return res + +def inverse_brackets(bracket): + res = col.defaultdict(int) + for i,a in enumerate(bracket): + res[a] = i + return res + +def dotbracket_to_pairtable(struct): + """ + Converts arbitrary structure in dot bracket format to pair table (ViennaRNA format).
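+ + Example: + dotbracket_to_pairtable('((..))') -> [6, 6, 5, 0, 0, 2, 1] + (pt[0] is the sequence length, pt[i] the partner of base i, 0 = unpaired)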
+ """ + pt = [0] * (len(struct)+1) + pt[0] = len(struct) + + stack = col.defaultdict(list) + inverse_bracket_left = inverse_brackets(bracket_left) + inverse_bracket_right = inverse_brackets(bracket_right) + + for i,a in enumerate(struct): + i += 1 + #print i,a, pt + if a == ".": pt[i] = 0 + else: + if a in inverse_bracket_left: stack[inverse_bracket_left[a]].append(i) + else: + if len(stack[inverse_bracket_right[a]]) == 0: + raise ValueError('Too many closing brackets!') + j = stack[inverse_bracket_right[a]].pop() + pt[i] = j + pt[j] = i + + if len(stack[inverse_bracket_left[a]]) != 0: + raise ValueError('Too many opening brackets!') + + return pt + +def pairtable_to_tuples(pt): + ''' + Convert a pairtable to a list of base pair tuples. + i.e. [4,3,4,1,2] -> [(1,3),(2,4),(3,1),(4,2)] + :param pt: A pairtable + :return: A list paired tuples + ''' + pt = iter(pt) + + # get rid of the first element which contains the length + # of the sequence. We'll figure it out after the traversal + pt.next() + + tuples = [] + for i, p in enumerate(pt): + tuples += [(i+1, p)] + return tuples + +def tuples_to_pairtable(pair_tuples, seq_length=None): + ''' + Convert a representation of an RNA consisting of a list of tuples + to a pair table: + + i.e. [(1,3),(2,4),(3,1),(4,2)] -> [4,3,4,1,2] + + :param tuples: A list of pair tuples + :param seq_length: How long is the sequence? Only needs to be passed in when + the unpaired nucleotides aren't passed in as (x,0) tuples. + :return: A pair table + ''' + if seq_length is None: + max_bp = max([max(x) for x in pair_tuples]) + else: + max_bp = seq_length + + pt = [0] * (max_bp + 1) + pt[0] = max_bp + + for tup in pair_tuples: + pt[tup[0]] = tup[1] + + return pt + + + def add_bulge(bulges, bulge, context, message): @@ -38,95 +188,6 @@ def add_bulge(bulges, bulge, context, message): return bulges -def from_id_seq_struct(id_str, seq, struct): - """ - Return a new BulgeGraph with the given id, - sequence and structure. - - :param id_str: The id (i.e. >1y26) - :param seq: the sequence (i.e. 'ACCGGG') - :param struct: The dotplot secondary structure (i.e. '((..))') - """ - bg = BulgeGraph() - bg.from_dotbracket(struct) - bg.name = id_str - bg.seq = seq - - return bg - - -def from_fasta_text(fasta_text): - """ - Create a bulge graph or multiple bulge - graphs from some fasta text. 
- """ - # compile searches for the fasta id, sequence and - # secondary structure respectively - id_search = re.compile('>(.+)') - seq_search = re.compile('^([acguACGU]+)$') - - prev_id = None - prev_seq = None - prev_struct = None - - bgs = [] - - for line in fasta_text.split('\n'): - # newlines suck - line = line.strip() - - # find out what this line contains - id_match = id_search.match(line) - seq_match = seq_search.match(line) - - if id_match is not None: - # we found an id, check if there's a previous - # sequence and structure, and create a BG - prev_id = id_match.group(0).strip('>') - - if prev_seq is None and prev_struct is None: - # must be the first sequence/structure - continue - - # make sure we have - if prev_seq is None: - raise Exception("No sequence for id: {}", prev_id) - if prev_struct is None: - raise Exception("No sequence for id: {}", prev_id) - if prev_id is None: - raise Exception("No previous id") - - bgs += [from_id_seq_struct(prev_id, prev_seq, prev_struct)] - - if seq_match is not None: - prev_seq = seq_match.group(0) - if id_match is None and seq_match is None: - if len(line) > 0: - prev_struct = line - - bgs += [from_id_seq_struct(prev_id, prev_seq, prev_struct)] - - if len(bgs) == 1: - return bgs[0] - else: - return bgs - - -def from_fasta(filename): - """ - Load a bulge graph from a fasta file. The format of the fasta - file is roughly: - - >1 - AACCCAA - ((...)) - """ - with open(filename, 'r') as f: - text = f.read() - bg = BulgeGraph() - bg.from_fasta(text) - return bg - def any_difference_of_one(stem, bulge): """ @@ -333,8 +394,8 @@ def __init__(self, bg_file=None, dotbracket_str='', seq=''): for i, s in enumerate(seq): self.seq_ids += [(' ', str(i + 1), ' ')] - if bg_file is not None: - self.from_bg_file(bg_file) + #if bg_file is not None: + # self.from_bg_file(bg_file) # get an internal index for a named vertex # this applies to both stems and edges @@ -796,36 +857,6 @@ def merge_vertices(self, vertices): return new_vertex - def shortest_bg_loop(self, vertex): - """ - Find the shortest loop containing this node. The vertex should - be a multiloop. - - :param vertex: The name of the vertex to find the loop. - :return: A list containing the elements in the shortest cycle. 
- """ - G = self.to_networkx() - - # use the nucleotide in the middle of this element as the starting point - residues = sorted(list(self.define_residue_num_iterator(vertex, adjacent=True))) - mid_res = residues[len(residues) / 2] - - if len(residues) == 2: - # no-residue multiloop - # find the neighbor which isn't part of the multiloop - neighbors = [n for n in G.neighbors(mid_res) if n != residues[0]] - - if len(neighbors) == 2: - # break the chain so that we don't get cycles within a stem - for n in neighbors: - if abs(n - mid_res) == 1: - G.remove_edge(n, mid_res) - break - - import forgi.utilities.graph as fug - - path = fug.shortest_cycle(G, mid_res) - return path def nucleotides_to_elements(self, nucleotides): """ @@ -1247,21 +1278,6 @@ def find_multiloop_loops(self): return loop_elems, loops - def from_fasta(self, fasta_str, dissolve_length_one_stems=False): - """ - Create a bulge graph from a fasta-type file containing the following - format: - - > id - ACCGGGG - ((...)) - """ - lines = fasta_str.split('\n') - self.from_dotbracket(lines[2].strip(), dissolve_length_one_stems) - self.name = lines[0].strip('>') - self.seq = lines[1].strip() - - self.seq_ids_from_seq() def seq_ids_from_seq(self): """ @@ -1346,8 +1362,8 @@ def from_dotbracket(self, dotbracket_str, dissolve_length_one_stems=False): if len(dotbracket_str) == 0: return - pt = fus.dotbracket_to_pairtable(dotbracket_str) - tuples = fus.pairtable_to_tuples(pt) + pt = dotbracket_to_pairtable(dotbracket_str) + tuples = pairtable_to_tuples(pt) self.from_tuples(tuples) if dissolve_length_one_stems: @@ -1364,7 +1380,7 @@ def to_pair_table(self): """ pair_tuples = self.to_pair_tuples() - return fus.tuples_to_pairtable(pair_tuples) + return tuples_to_pairtable(pair_tuples) def to_pair_tuples(self): """ @@ -1426,33 +1442,6 @@ def bpseq_to_tuples_and_seq(self, bpseq_str): return (tuples, seq) - def from_bpseq_str(self, bpseq_str, dissolve_length_one_stems=False): - """ - Create the graph from a string listing the base pairs. - - The string should be formatted like so: - - 1 G 115 - 2 A 0 - 3 A 0 - 4 U 0 - 5 U 112 - 6 G 111 - - :param bpseq_str: The string, containing newline characters. - :return: Nothing, but fill out this structure. - """ - self.__init__() - - tuples, seq = self.bpseq_to_tuples_and_seq(bpseq_str) - - self.seq = seq - self.seq_length = len(seq) - self.from_tuples(tuples) - - if dissolve_length_one_stems: - self.dissolve_length_one_stems() - def from_tuples(self, tuples): """ Create a bulge_graph from a list of pair tuples. Unpaired @@ -1540,65 +1529,8 @@ def to_dotbracket_string(self): :return: A dot-bracket representation of this BulgeGraph """ pt = self.to_pair_table() - return fus.pairtable_to_dotbracket(pt) - - def to_fasta_string(self): - """ - Output the BulgeGraph representation as a fast string of the - format: - - >id - AACCCAA - ((...)) - """ - output_string = '' - - output_string += ">%s\n" % (self.name) - output_string += "%s\n" % (self.seq) - output_string += "%s" % (self.to_dotbracket_string()) - - return output_string - - def from_bg_file(self, bg_file): - """ - Load from a file containing a text-based representation - of this BulgeGraph. - - :param bg_file: The filename. - :return: No return value since the current structure is the one - being loaded. - """ - with open(bg_file, 'r') as f: - bg_string = "".join(f.readlines()) - self.from_bg_string(bg_string) - - def from_bg_string(self, bg_str): - """ - Populate this BulgeGraph from the string created by the method - to_bg_string. 
- - :param bg_str: The string representation of this BugleGraph. - """ - lines = bg_str.split('\n') - for line in lines: - line = line.strip() - parts = line.split() - if len(parts) == 0: - # blank line - continue - if parts[0] == 'length': - self.seq_length = int(parts[1]) - elif parts[0] == 'define': - self.defines[parts[1]] = map(int, parts[2:]) - elif parts[0] == 'connect': - for p in parts[2:]: - self.edges[parts[1]].add(p) - self.edges[p].add(parts[1]) - elif parts[0] == 'seq': - self.seq = parts[1] - elif parts[0] == 'name': - self.name = parts[1].strip() - + return pairtable_to_dotbracket(pt) + def sorted_stem_iterator(self): """ Iterate over a list of the stems sorted by the lowest numbered @@ -2519,7 +2451,7 @@ def is_pseudoknot(self): return True return False - + ''' def to_networkx(self): """ Convert this graph to a networkx representation. This representation @@ -2550,7 +2482,7 @@ def to_networkx(self): G.add_edge(f, t) return G - + ''' def ss_distance(self, e1, e2): @@ -2605,6 +2537,7 @@ def ss_distance(self, e1, e2): return min(path_lengths) + 1 return min(path_lengths) + 2 + def get_position_in_element(self, resnum): node = self.get_node_from_residue_num(resnum) diff --git a/graphlearn/abstract_graphs/forgi/bulge_graph.pyc b/graphlearn/abstract_graphs/forgi/bulge_graph.pyc index 0045d41..4cb4aaf 100644 Binary files a/graphlearn/abstract_graphs/forgi/bulge_graph.pyc and b/graphlearn/abstract_graphs/forgi/bulge_graph.pyc differ diff --git a/graphlearn/abstract_graphs/forgi/stuff.py b/graphlearn/abstract_graphs/forgi/stuff.py index 5a04b87..97b75ac 100644 --- a/graphlearn/abstract_graphs/forgi/stuff.py +++ b/graphlearn/abstract_graphs/forgi/stuff.py @@ -5,59 +5,9 @@ import tempfile as tf import collections as col - - bracket_left = "([{ (s0,s1,s2,...sn-1), (sn,sn+1,sn+2,...s2n-1), (s2n,s2n+1,s2n+2,...s3n-1), ... - ''' - return it.izip(*[iter(iterable)]*n) - -def merge_intervals(intervals, diff = 0): - ''' - Take a set of intervals, and combine them whenever the endpoints - match. - - I.e. [(42,47), (55,60), (60,63), (1,9), (63,71)] - - Should yield - - [(1,9),(42,47), (55,71)] - - There should be no overlapping intervals. - - @param intervals: A set of tuples indicating intervals - @return: A list of merged intervals - ''' - intervals.sort() - iter_intervals = iter(intervals) - - # get the first interval - curr_interval = list(next(iter_intervals)) - - merged_intervals = [] - - for i in iter_intervals: - if abs(i[0] - curr_interval[1]) <= diff: - # the start of this interval is equal to the end of the - # current merged interval, so we merge it - curr_interval[1] = i[1] - else: - # start a new interval and add the current merged one - # to the list of intervals to return - merged_intervals += [curr_interval] - curr_interval = list(i) - - merged_intervals += [curr_interval] - return merged_intervals - def gen_random_sequence(l): ''' Generate a random RNA sequence of length l. diff --git a/graphlearn/abstract_graphs/forgi/stuff.pyc b/graphlearn/abstract_graphs/forgi/stuff.pyc deleted file mode 100644 index 33c620a..0000000 Binary files a/graphlearn/abstract_graphs/forgi/stuff.pyc and /dev/null differ diff --git a/graphlearn/abstract_graphs/graphmanager.py b/graphlearn/abstract_graphs/graphmanager.py new file mode 100644 index 0000000..898aa5f --- /dev/null +++ b/graphlearn/abstract_graphs/graphmanager.py @@ -0,0 +1,151 @@ +''' +ubersamplers fit will take a list of graphmanager as input. 
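+ + usage sketch: + managers = list(fromfasta('RF00005.fa', vectorizer=Vectorizer())) + # an ubersampler's fit() can then consume this list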
+''' + +import subprocess as sp +import eden.converter.rna as conv +import forgi +import networkx as nx +import graphlearn.abstract_graphs.rnaabstract +from graphlearn.utils import draw +from eden.graph import Vectorizer + + +import rnasampler as rna +import rnaabstract as rnaa + +def fromfasta(file='RF00005.fa',vectorizer=None): + s=[] + with open(file) as f: + for line in f: + s.append(line) + + while s: + seqname=s[0] + seq='' + for i,l in enumerate(s[1:]): + if l[0] != '>': + seq+=l.strip() + else: + break + + shape=callRNAshapes(seq) + if shape: + yield GraphManager(seqname,seq,vectorizer,shape) + s=s[i+1:] + + +class GraphManager(object): + ''' + these are the basis for creating and fitting an ubersampler + def get_estimateable(self): + def get_base_graph(self): + def get_abstract_graph(self): + + ''' + def __init__(self,sequence_name,sequence,vectorizer,structure): + self.sequence_name=sequence_name + self.sequence=sequence + self.vectorizer=vectorizer + self.structure=structure + + # create a base_graph , expand + base_graph=conv.sequence_dotbracket_to_graph(seq_info=sequence, seq_struct=structure) + self.base_graph=vectorizer._edge_to_vertex_transform(base_graph) + + # get an expanded abstract graph + abstract_graph=forgi.get_abstr_graph(structure) + abstract_graph=vectorizer._edge_to_vertex_transform(abstract_graph) + + #connect edges to nodes in the abstract graph + self.abstract_graph=edge_parent_finder(abstract_graph,self.base_graph) + + + # we are forced to set a label .. for eden reasons + def name_edges(graph,what=''): + for n,d in graph.nodes(data=True): + if 'edge' in d: + d['label']=what + + # in the abstract graph , all the edge nodes need to have a contracted attribute. + # originally this happens naturally, but since we merge multiloops into one loop some are left out + def setset(graph): + for n,d in graph.nodes(data=True): + if 'contracted' not in d: + d['contracted']=set() + + name_edges(self.abstract_graph) + setset(self.abstract_graph) + + + def get_estimateable(self): + + # returns an expanded, undirected graph + # that the eden machine learning can compute + return nx.disjoint_union(self.base_graph,self.abstract_graph) + + def get_base_graph(self): + if 'directed_base_graph' not in self.__dict__: + self.directed_base_graph= graphlearn.abstract_graphs.rnaabstract.expanded_rna_graph_to_digraph(self.base_graph) + + return self.directed_base_graph + + def get_abstract_graph(self): + return self.abstract_graph + + + + + + +def callRNAshapes(sequence): + + cmd='RNAshapes %s' % sequence + out = sp.check_output(cmd, shell=True) + s = out.strip().split('\n') + + for li in s[2:]: + #print li.split() + energy,shape,abstr= li.split() + if abstr=='[[][][]]': + return shape + + +def edge_parent_finder(abstract,graph): + # find out to which abstract node the edges belong + # finding out where the edge-nodes belong, because the contractor can't possibly do this + #draw.graphlearn_draw([abstract,graph],size=10, contract=False,vertex_label='id') + + getabstr = {contra: node for node, d in abstract.nodes(data=True) for contra in d.get('contracted', [])} + #print getabstr + for n, d in graph.nodes(data=True): + if 'edge' in d: + # if we have found an edge node... + # let's see who's left and right of it: + zomg= graph.neighbors(n) + #print zomg + #draw.draw_center(graph,1,20,contract=False,size=20) + n1, n2 = zomg + + # case 1: both neighbors belong to the same gang, so we most likely belong there too.
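+ # (e.g. a backbone edge between two nucleotides of the same stem is contracted into that stem)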
+ if getabstr[n1] == getabstr[n2]: + abstract.node[getabstr[n1]]['contracted'].add(n) + + # case 2: the neighbors belong to different gangs... + else: + abstract_intersect = set(abstract.neighbors(getabstr[n1])) & set(abstract.neighbors(getabstr[n2])) + + # case 3: abstract intersect in radius 1 failed, so let's try radius 2 + if not abstract_intersect: + abstract_intersect = set(nx.single_source_shortest_path(abstract,getabstr[n1],2)) & set(nx.single_source_shortest_path(abstract,getabstr[n2],2)) + if len(abstract_intersect) > 1: + print "weird abstract intersect..." + + for ai_node in abstract_intersect: + if 'contracted' in abstract.node[ai_node]: + abstract.node[ai_node]['contracted'].add(n) + else: + abstract.node[ai_node]['contracted'] = set([n]) + + + return abstract diff --git a/graphlearn/abstract_graphs/molecules.py b/graphlearn/abstract_graphs/molecules.py new file mode 100644 index 0000000..d8a3dd9 --- /dev/null +++ b/graphlearn/abstract_graphs/molecules.py @@ -0,0 +1,290 @@ +import graphlearn.abstract_graphs.rnaabstract +from ubergraphlearn import UberSampler,UberGrammar +import ubergraphlearn +import networkx as nx +import graphlearn.utils.draw as draw +from collections import defaultdict +import eden + + +class MoleculeSampler(UberSampler): + + def _sample_init(self, graph): + self.postprocessor.fit(self) + graph=self.postprocessor.postprocess(graph) + self._score(graph) + self._sample_notes = '' + self._sample_path_score_set = set() + return graph + + def _score(self,graph): + estimateable=graph.graphmanager.get_estimateable() + super(MoleculeSampler,self)._score(estimateable) + graph._score=estimateable._score + return graph._score + + +class PostProcessor: + def __init__(self): + pass + + def fit(self, other): + self.vectorizer=other.vectorizer + + def postprocess(self, graph): + grmgr=GraphManager(graph,self.vectorizer) + graph=grmgr.get_base_graph() + graph.graphmanager=grmgr + return graph + + + +class GraphManager(object): + ''' + these are the basis for creating and fitting an ubersampler + def get_estimateable(self): + def get_base_graph(self): + def get_abstract_graph(self): + ''' + def __init__(self,graph,vectorizer): + self.base_graph=vectorizer._edge_to_vertex_transform(graph) + self.abstract_graph=make_abstract(self.base_graph.copy()) + + # in the abstract graph , all the edge nodes need to have a contracted attribute.
+ # originaly this happens naturally but since we make multiloops into one loop there are some left out + def setset(graph): + for n,d in graph.nodes(data=True): + if 'contracted' not in d: + d['contracted']=set() + setset(self.abstract_graph) + + def get_estimateable(self): + # returns an expanded, undirected graph + # that the eden machine learning can compute + return nx.disjoint_union(self.base_graph,self.abstract_graph) + def get_base_graph(self): + return self.base_graph + def get_abstract_graph(self): + return self.abstract_graph + + + +''' +here we invent the abstractor function +''' +import networkx as nx +import graphlearn.utils.draw as draw + + +def make_abstract(extgraph): + ''' + ok the plan: + build abstract graph by + - get abstractnodes + - save abs parents in each node of base graph + - look at all the edge nodes in the base graph to connect the abs graph nodes + :param graph: an edge to vertex transformed graph + :return: edge to vertex transformed abstract graph + ''' + + # annotate base graph + for n,d in extgraph.nodes(data=True): + d['cycle']=list(node_to_cycle(extgraph,n)) + d['cycle'].sort() + + # prepare + abstract_graph=nx.Graph() + def fhash(stuff): + return eden.fast_hash(stuff,2**20-1) + + + # make sure most of the abstract nodes are created. + # base graph nodes have a list of abstract parents. + for n,d in extgraph.nodes(data=True): + # make sure abstract node exists + cyclash = fhash(d['cycle']) + + if cyclash not in abstract_graph.node: + abstract_graph.add_node(cyclash) + abstract_graph.node[cyclash]['contracted']= set(d['cycle']) + + # tell everyone interested about it + for e in d['cycle']: + node=extgraph.node[e] + if 'parent' not in node: + node['parent'] = set() + node['parent'].add(cyclash) + + #connect nodes in the abstract graph + f=lambda x: list(x)[0] + for n,d in abstract_graph.nodes(data=True): + #look at all the children and their neighbors parents + + + if len(d['contracted'])>1: + # setting label for cycles.. + + # this will only use the length.. + #d['label']= "cycle "+str( len(d['contracted']) ) + # but i might as well use the hash of labels of all the contracted nodes + + + labels= [ ord( extgraph.node[childid]['label']) for childid in d['contracted'] ] + labels.sort() + d['label']="cycle %d" % fhash(labels) + + + else: + d['label']= extgraph.node[ f(d['contracted']) ]['label'] + + + if len(d['contracted']) ==1 and 'edge' in extgraph.node [ f(d['contracted'])]: + d['edge']=True + d['label']=d['label'] + # for all nodes + for base_node in d['contracted']: + base_neighbors = extgraph.neighbors(base_node) + # for all the neighbors + for neigh in base_neighbors: + # find out if we have to build a connector node + if len (extgraph.node[neigh]['cycle']) > 1 and len (d['contracted']) > 1: + + for other in extgraph.node[neigh]['parent']: + if other != n: + l=[other,n] + l.sort() + connector = fhash(l) + if connector not in abstract_graph.node: + # we need to consider making the edge the actual intersect of the two... 
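+ # (for now the connector is a fresh node; its 'shared...' label below encodes how many base nodes the two cycles have in common)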
+ + abstract_graph.add_node(connector) + abstract_graph.node[connector]['edge']=True + + #abstract_graph.node[connector]['label']='edge' + num_shared_nodes = len(abstract_graph.node[other]['contracted'] & d['contracted']) + abstract_graph.node[connector]['label']="shared"+str(num_shared_nodes) + + abstract_graph.add_edge(other,connector) + abstract_graph.add_edge(connector,n) + + else: + for e in extgraph.node[neigh]['parent']: + abstract_graph.add_edge(n,e) + return abstract_graph + + + +def node_to_cycle(graph,n,min_cycle_size=3): + """ + :param graph: + :param n: start node + :param min_cycle_size: + :return: a cycle the node belongs to + + so we start in node n, + then we expand 1 node further in each step. + if we meet a node we had before, we found a cycle. + + there are 3 possible cases. + - frontier hits frontier -> cycle of even length + - frontier hits visited nodes -> cycle of uneven length + - it is also possible that the newly found cycle doesn't contain our start node, so we check for that + """ + + def close_cycle(collisions,parent,root): + ''' + we found a cycle, but that does not mean that the root node is part of that cycle + ''' + def extend_path_to_root(work_list,parent_dict,root): + """ + :param work_list: list with start node + :param parent_dict: the tree like dictionary that contains each node's parent(s) + :param root: root node. probably we don't really need this since the root node is the orphan + :return: cycle nodes or None + + note: we don't necessarily get the shortest path; that is fine for cycle checking, + but may be a problem for cycle finding. + """ + current=work_list[-1] + while current != root: + work_list.append( parent_dict[current][0] ) + current=work_list[-1] + return work_list[:-1] + + # any should be fine. e is closing a cycle, + # note: e might contain more than one hit but we don't care + e= collisions.pop() + #print 'r',e + # we closed a cycle on e so e has 2 parents... + li=parent[e] + a = [li[0]] + b = [li[1]] + #print 'pre',a,b + # get the path until the root node + a=extend_path_to_root(a,parent,root) + b=extend_path_to_root(b,parent,root) + #print 'comp',a,b + # if the paths to the root node don't overlap, the root node must be in the loop + a=set(a) + b=set(b) + intersect=a & b + if len(intersect) ==0: + paths= a | b + paths.add(e) + paths.add(root) + return paths + return False + + + FAILEDVALUE = set([n]) + frontier= set([n]) + step=0 + visited=set() + parent=defaultdict(list) + + while frontier: + #print frontier + step+=1 + + # give me new nodes: + next=[] + for front_node in frontier: + new = set(graph.neighbors(front_node)) - visited + next.append( new ) + for e in new: + parent[e].append(front_node) + + # we merge the new nodes.
if 2 sets collide, we found a cycle of even length + while len(next)>1: + # merge + s1=next[1] + s2=next[0] + merge = s1 | s2 + + # check if we havee a cycle => s1,s2 overlap + if len(merge) < len(s1)+len(s2): + col= s1&s2 + cycle=close_cycle(col,parent,n) + if cycle: + if step*2 > min_cycle_size: + return cycle + return FAILEDVALUE + + #delete from list + next[0]=merge + del next[1] + next=next[0] + + # now we need to check for cycles of uneven length => the new nodes hit the old frontier + if len(next & frontier) > 0: + col= next&frontier + cycle = close_cycle(col,parent,n) + if cycle: + if step*2-1 > min_cycle_size: + return cycle + return FAILEDVALUE + + # we know that the current frontier didntclose cycles so we dont need to look at them again + visited = visited | frontier + frontier=next + return FAILEDVALUE \ No newline at end of file diff --git a/graphlearn/abstract_graphs/rnaabstract.py b/graphlearn/abstract_graphs/rnaabstract.py index 450c426..a40db3c 100644 --- a/graphlearn/abstract_graphs/rnaabstract.py +++ b/graphlearn/abstract_graphs/rnaabstract.py @@ -1,20 +1,20 @@ -import directedgraphtools + import networkx as nx from eden.modifier.graph.structure import contraction import subprocess as sp import graphlearn.graphtools as gt - +import graphlearn ''' -rna_abstractor_mk2 +direct_abstraction_wrapper extends the self-build abstractor, but only works on fresh graphs ''' def get_sequence(digraph): - current,end= directedgraphtools.get_start_and_end_node(digraph) + current,end= graphlearn.abstract_graphs.rnaabstract.get_start_and_end_node(digraph) seq=digraph.node[current]['label'] while current != end: @@ -113,7 +113,7 @@ def direct_abstractor(graph,v): :return: contracted graph ''' - n,not_used= directedgraphtools.get_start_and_end_node(graph) + n,not_used= graphlearn.abstract_graphs.rnaabstract.get_start_and_end_node(graph) tasks=[n] result = nx.Graph() @@ -290,3 +290,122 @@ def contract(graph): contracted_graph = contraction( [graph], contraction_attribute = 'type', modifiers = [], nesting = False).next() return contracted_graph + + + + + + + + +from eden.converter.rna.rnafold import rnafold_to_eden +class PostProcessor: + ''' + postprocessotr to refold with rnafold + ''' + def __init__(self): + pass + + def fit(self, other): + print 'OMG i got a vectorizer kthx' + self.vectorizer=other.vectorizer + + def postprocess(self, graph): + return self.rna_refold( graph ) + + def rna_refold(self, digraph=None, seq=None,vectorizer=None): + """ + :param digraph: + :param seq: + :return: will extract a sequence, RNAfold it and create a abstract graph + """ + # get a sequence no matter what :) + if not seq: + seq= get_sequence(digraph) + #print 'seq:',seq + graph = rnafold_to_eden([('emptyheader',seq)], shape_type=5, energy_range=30, max_num=3).next() + expanded_graph = self.vectorizer._edge_to_vertex_transform(graph) + ex_di_graph = graphlearn.abstract_graphs.rnaabstract.expanded_rna_graph_to_digraph(expanded_graph) + ex_di_graph.graph['sequence']= seq + #abstract_graph = directedgraphtools.direct_abstraction_wrapper(graph,0) + return ex_di_graph + + +import graphmanager +class ForgiPostprocessor: + def __init__(self): + pass + + def fit(self, other): + self.vectorizer=other.vectorizer + + def postprocess(self, seq): + # if we get a graph .. 
transform to sequence + if isinstance(seq,nx.Graph): + seq= get_sequence(seq) + + # get shape + shape = graphmanager.callRNAshapes(seq) + if shape is None: + #raise Exception('unfoldable') + return None + name='set real name later' + # build graphmanager + grmgr=graphmanager.GraphManager(name,seq,self.vectorizer,shape) + # get graph + graph=grmgr.get_base_graph() + graph.graphmanager=grmgr + graph.graph['sequence'] = seq + return graph + + +def get_start_and_end_node(graph): + # make n the first node of the sequence + start=-1 + end=-1 + for n,d in graph.nodes_iter(data=True): + + # edge nodes cant be start or end + if 'edge' in d: + continue + + # check for start + if start == -1: + l= graph.predecessors(n) + if len(l)==0: + start = n + if len(l)==1: + if graph.node[ l[0] ]['label']=='=': + start = n + + # check for end: + if end == -1: + l= graph.neighbors(n) + if len(l)==0: + end = n + if len(l)==1: + if graph.node[ l[0] ]['label']=='=': + end = n + + # check and return + if start==-1 or end==-1: + raise Exception ('your beautiful "rna" has no clear start or end') + return start,end + + +def expanded_rna_graph_to_digraph(graph): + ''' + :param graph: an expanded rna representing graph as produced by eden. + properties: backbone edges are replaced by a node labeled '-'. + rna reading direction is reflected by ascending node ids in the graph. + :return: a graph, directed edges along the backbone + ''' + digraph=nx.DiGraph(graph) + for n,d in digraph.nodes(data=True): + if 'edge' in d: + if d['label']=='-': + ns=digraph.neighbors(n) + ns.sort() + digraph.remove_edge(ns[1],n) + digraph.remove_edge(n,ns[0]) + return digraph \ No newline at end of file diff --git a/graphlearn/abstract_graphs/rnasampler.py b/graphlearn/abstract_graphs/rnasampler.py index 281babb..a92295f 100644 --- a/graphlearn/abstract_graphs/rnasampler.py +++ b/graphlearn/abstract_graphs/rnasampler.py @@ -1,11 +1,10 @@ - +import graphlearn.abstract_graphs.rnaabstract from ubergraphlearn import UberSampler,UberGrammar import ubergraphlearn import networkx as nx import graphlearn.utils.draw as draw -import random -from eden.converter.rna.rnafold import rnafold_to_eden -import directedgraphtools + + class RNASampler(UberSampler): @@ -32,14 +31,29 @@ def _stop_condition(self, graph): turning sample starter graph to digraph ''' def _sample_init(self, graph): - graph = self.vectorizer._edge_to_vertex_transform(graph) - graph = expanded_rna_graph_to_digraph(graph) + #graph = self.vectorizer._edge_to_vertex_transform(graph) + #graph = dgtools.expanded_rna_graph_to_digraph(graph) + self.postprocessor.fit(self) + + graph=self.postprocessor.postprocess(graph) + if graph is None: + raise Exception ('_sample_init failed, cant fold to trna') self._score(graph) self._sample_notes = '' self._sample_path_score_set = set() - self.postprocessor.fit(self) return graph + def _score(self,graph): + + estimateable=graph.graphmanager.get_estimateable() + super(RNASampler,self)._score(estimateable) + graph._score=estimateable._score + return graph._score + + def _sample_path_append(self, graph, force=False): + if not force: + self._sample_notes+=graph.graph.get('sequence',"0")+"n" + super(RNASampler,self)._sample_path_append(graph,force=force) ''' this is also used sometimes so we make better sure it doesnt fail @@ -66,85 +80,82 @@ def __init__(self,**kwargs): ''' def is_rna (graph): graph=graph.copy() - # remove structure bonds= [ n for n,d in graph.nodes(data=True) if d['label']=='=' ] graph.remove_nodes_from(bonds) - # see if we are cyclic for 
node,degree in graph.in_degree_iter( graph.nodes() ): if degree == 0: break else: return False - # check if we are connected. graph=nx.Graph(graph) return nx.is_connected(graph) -class PostProcessor: - def __init__(self): - pass - - def fit(self, other): - print 'OMG i got a vectorizer kthx' - self.vectorizer=other.vectorizer +# modifying ubergraphlearn further.. +def get_mod_dict(graph): + s,e= graphlearn.abstract_graphs.rnaabstract.get_start_and_end_node(graph) + return {s:696969 , e:123123123} +#ubergraphlearn.get_mod_dict=get_mod_dict +import rnaabstract +#ubergraphlearn.make_abstract = rnaabstract.direct_abstractor +#ubergraphlearn.make_abstract = rnaabstract.direct_abstraction_wrapper - def postprocess(self, graph): - return self.rna_refold( graph ) - def rna_refold(self, digraph=None, seq=None,vectorizer=None): - """ - :param digraph: - :param seq: - :return: will extract a sequence, RNAfold it and create a abstract graph - """ - # get a sequence no matter what :) - if not seq: - seq= rnaabstract.get_sequence(digraph) - #print 'seq:',seq - graph = rnafold_to_eden([('emptyheader',seq)], shape_type=5, energy_range=30, max_num=3).next() - expanded_graph = self.vectorizer._edge_to_vertex_transform(graph) - ex_di_graph = expanded_rna_graph_to_digraph(expanded_graph) - #abstract_graph = directedgraphtools.direct_abstraction_wrapper(graph,0) - return ex_di_graph +import subprocess as sp -def expanded_rna_graph_to_digraph(graph): +def infernal_checker(sequence_list): ''' - :param graph: an expanded rna representing graph as produced by eden. - properties: backbone edges are replaced by a node labeled '-'. - rna reading direction is reflected by ascending node ids in the graph. - :return: a graph, directed edges along the backbone + :param sequences: a bunch of rna sequences + :return: get evaluation from cmsearch ''' - digraph=nx.DiGraph(graph) - for n,d in digraph.nodes(data=True): - if 'edge' in d: - if d['label']=='-': - ns=digraph.neighbors(n) - ns.sort() - digraph.remove_edge(ns[1],n) - digraph.remove_edge(n,ns[0]) - return digraph + write_fasta(sequence_list,filename='temp.fa') + return call_cm_search('temp.fa',len(sequence_list)) + + + +def write_fasta(sequences,filename='asdasd'): + + fasta='' + for i,s in enumerate(sequences): + if len(s) > 5: + fasta+='>HACK%d\n%s\n' % (i,s) + + with open(filename, 'w') as f: + f.write(fasta) + + +def call_cm_search(filename, count): + + out = sp.check_output('./cmsearch -g --noali --incT 0 rf00005.cm %s' % filename, shell=True) + # -g global + # --noali, we dont want to see the alignment, score is enough + # --incT 0 we want to see everything with score > 0 + result={} + s = out.strip().split('\n') + for line in s: + if 'HACK' in line: + linez=line.split() + score=float(linez[3])/100 + id = int(linez[5][4:]) + result[id]=score + + + return [ result.get(k,0) for k in range(count) ] + -# modifying ubergraphlearn further.. 
-def get_mod_dict(graph): - s,e=directedgraphtools.get_start_and_end_node(graph) - return {s:696969 , e:123123123} -ubergraphlearn.get_mod_dict=get_mod_dict -import rnaabstract -#ubergraphlearn.make_abstract = rnaabstract.direct_abstractor -ubergraphlearn.make_abstract = rnaabstract.direct_abstraction_wrapper diff --git a/graphlearn/abstract_graphs/ubergraphlearn.py b/graphlearn/abstract_graphs/ubergraphlearn.py index e0016d6..eaae747 100644 --- a/graphlearn/abstract_graphs/ubergraphlearn.py +++ b/graphlearn/abstract_graphs/ubergraphlearn.py @@ -11,7 +11,7 @@ import eden.util.display as edraw import eden import traceback -import itertools + ''' first we build the new sampler that is able to handle abstract graphs... @@ -28,7 +28,6 @@ def __init__(self, base_thickness_list=[1, 2, 3], **kwargs): ''' graphlernsampler with its extensions.. - for now this: is a real_thickness_list and we make sure that the grammar can handle our new corez :) @@ -50,63 +49,62 @@ def __init__(self, base_thickness_list=[1, 2, 3], self.lsgg = UberGrammar(base_thickness_list=self.base_thickness_list, radius_list=self.radius_list, thickness_list=self.thickness_list, - complexity=self.complexity, + #complexity=self.complexity, min_cip_count=min_cip_count, min_interface_count=min_interface_count, nbit=self.nbit, node_entity_check=self.node_entity_check) - def fit(self, graphs, n_jobs=-1, nu=.5, batch_size=10): + def fit(self, graphmanagers, n_jobs=-1, nu=.5, batch_size=10): """ use input to fit the grammar and fit the estimator """ - def appabstr(graphs): - for gr in graphs: - ab= make_abstract(gr) - gr.grpah['abstract'] = ab - yield - def estimodification(graphs): - for gr in graphs: - ab=gr.graph['abstract'] + graphmanagers=list(graphmanagers) + + def get_esti_graphs(managers): + for manager in managers: + yield manager.get_estimateable() + + graphs_ = get_esti_graphs(graphmanagers) - graphs=appabstr(graphs) - graphs, graphs_ = itertools.tee(graphs) + #draw.graphlearn_draw(graphs_.next(),size=20,node_size=500, show_direction=True, contract = False) - graphs_= estimodification(graphs_) self.estimator = self.estimatorobject.fit(graphs_, vectorizer=self.vectorizer, nu=nu, n_jobs=n_jobs, random_state=self.random_state) - self.lsgg.fit(graphs, n_jobs, batch_size=batch_size) + self.lsgg.fit(graphmanagers, n_jobs, batch_size=batch_size) + ''' def _get_abstract_graph(self, graph): try: return make_abstract(graph, self.vectorizer) except Exception as exc: print 'le errer:' - logger.info(exc) logger.info(traceback.format_exc(10)) - draw.graphlearn_draw(graph,size=20,node_size=500, show_direction=True, contract = False) raise Exception('make_abstract died') + ''' - + def _get_abstract_graph(self,graph): + return graph.graphmanager.get_abstract_graph() def _original_cip_extraction(self, graph): ''' selects the next candidate. 
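 (it picks a random node of the abstract graph, hops to a neighbor if it hit an edge node, and roots the cip extraction there)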
''' - graph = self.vectorizer._edge_to_vertex_transform(graph) - abstr = self._get_abstract_graph(graph) + #graph = self.vectorizer._edge_to_vertex_transform(graph) + abstr = self._get_abstract_graph(graph)#self._get_abstract_graph(graph) + node = random.choice(abstr.nodes()) if 'edge' in abstr.node[node]: node = random.choice(abstr.neighbors(node)) @@ -147,11 +145,17 @@ def extract_cores_and_interfaces_mk2(parameters): return None try: # unpack arguments, expand the graph - graph, radius_list, thickness_list, vectorizer, hash_bitmask, node_entity_check, base_thickness_list = parameters - graph = vectorizer._edge_to_vertex_transform(graph) + graphmanager, radius_list, thickness_list, vectorizer, hash_bitmask, node_entity_check, base_thickness_list = parameters + + #print 'you called me like this',parameters + #graph = vectorizer._edge_to_vertex_transform(graph) + #abstr = graph.graph['abstract'] + + graph=graphmanager.get_base_graph() + abstr=graphmanager.get_abstract_graph() cips = [] - abstr = graph.graph['abstract']#make_abstract(graph, vectorizer) ??? mod_dict=get_mod_dict(graph) + for node in abstr.nodes_iter(): if 'edge' in abstr.node[node]: continue @@ -280,6 +284,7 @@ def extract_cips(node, ''' # if not filter(abstract_graph, node): # return [] + #print 'ok1' if 'hlabel' not in abstract_graph.node[abstract_graph.nodes()[0]]: vectorizer._label_preprocessing(abstract_graph) if 'hlabel' not in base_graph.node[base_graph.nodes()[0]]: @@ -295,6 +300,7 @@ def extract_cips(node, **argz) cips = [] + for acip in abstract_cips: # now we need to calculate the real cips: @@ -306,10 +312,20 @@ def extract_cips(node, acip.radius + 1) for abstract_node_id in acip.distance_dict.get(radius) for base_graph_id in abstract_graph.node[abstract_node_id]['contracted']] base_copy = base_graph.copy() + + + # remove duplicates: + mergeids = list(set(mergeids)) + for node in mergeids[1:]: graphtools.merge(base_copy, mergeids[0], node) # do cip extraction and calculate the real core hash + #draw.graphlearn_draw(base_copy,size=20) + + #draw.draw_center(base_copy,mergeids[0],5,size=20) + #print base_thickness_list,hash_bitmask + base_level_cips = graphtools.extract_core_and_interface(mergeids[0], base_copy, radius_list=[0], @@ -317,8 +333,12 @@ def extract_cips(node, vectorizer=vectorizer, hash_bitmask=hash_bitmask, **argz) + core_hash = graphtools.graph_hash(base_graph.subgraph(mergeids), hash_bitmask=hash_bitmask) + #print base_level_cips + + # now we have a bunch of base_level_cips and need to attach info from the abstract cip. for base_cip in base_level_cips: @@ -347,16 +367,12 @@ def extract_cips(node, get_mods(mod_dict,mergeids),0, hash_bitmask) - - - base_cip.core_nodes_count = acip.core_nodes_count base_cip.radius = acip.radius base_cip.abstract_thickness = acip.thickness # i want to see what they look like :) base_cip.abstract_view=acip.graph - cips.append(base_cip) return cips diff --git a/graphlearn/feasibility.py b/graphlearn/feasibility.py index 479bcbe..1c42b0c 100644 --- a/graphlearn/feasibility.py +++ b/graphlearn/feasibility.py @@ -63,140 +63,3 @@ def check(self, graph): return True - -# for the checking we need a function that takes a graph.. so we make one.. :) -def cycles(max_cycle_size): - return lambda x:not problem_cycle(x,max_cycle_size) - - - -def problem_cycle(graph,max_cycle_size): - ''' - for each node we check if a questionable cycle exists - :param graph: - :param max_cycle_size: - :return: problem aru ? 
True - ''' - for n in graph.nodes_iter(): - if rooted_problem_cycle(graph,n,max_cycle_size): - return True - return False - -def rooted_problem_cycle(graph,n,max_cycle_size): - """ - :param graph: - :param n: start node - :param max_cycle_size: - :return: do we have a questionable cycle that includes n - - - - so we start in node n, - then we expand 1 node further in each step. - if we meet a node we had before we found a cycle. - - there are 3 possible cases. - - frontier hits frontier -> cycle of even length - - frontier hits visited nodes -> cycle of uneven length - - it is also possible that the newly found cycle doesnt contain our start node. so we check for that - """ - - frontier= set([n]) - step=0 - visited=set() - parent=defaultdict(list) - - - - while frontier: - #print frontier - step+=1 - - # give me new nodes: - next=[] - for front_node in frontier: - new = set(graph.neighbors(front_node)) - visited - next.append( new ) - for e in new: - parent[e].append(front_node) - - # we merge the new nodes. if 2 sets collide, we found a cycle of even length - while len(next)>1: - # merge - s1=next[1] - s2=next[0] - merge = s1 | s2 - - # check if we havee a cycle => s1,s2 overlap - if len(merge) < len(s1)+len(s2): - col= s1&s2 - if root_is_last_common_ancestor(col,parent,n): - if step*2 > max_cycle_size: - return True - return False - - #delete from list - next[0]=merge - del next[1] - next=next[0] - - # now we need to check for cycles of uneven length => the new nodes hit the old frontier - if len(next & frontier) > 0: - col= next&frontier - if root_is_last_common_ancestor(col,parent,n): - - if step*2-1 > max_cycle_size: - return True - return False - - # we know that the current frontier didntclose cycles so we dont need to look at them again - visited = visited | frontier - frontier=next - - return False - -def root_is_last_common_ancestor(col,parent,n): - - # any should be fine. e is closing a cycle, - # note: e might contain more than one hit but we dont care - e= col.pop() - #print 'r',e - # we closed a cycle on e so e has 2 parents... - li=parent[e] - a = [li[0]] - b = [li[1]] - #print 'pre',a,b - # get the path until the root node - a=extend_path_to_root(a,parent,n) - b=extend_path_to_root(b,parent,n) - #print 'comp',a,b - #of the paths to the root node dont overlap, the root node musst be in the loop - return len(set(a) & set(b)) ==0 - - -def extend_path_to_root(l,parent,n): - """ - :param l: list with start node - :param parent: the tree like dictionary that contains each nodes parent(s) - :param n: root node. probably we dont really need this since the root node is the orphan - :return: path from l to n , be moving up the tree. note that we dont care to get the shortest path. 
- """ - current=l[-1] - while current != n: - l.append( parent[current][0] ) - current=l[-1] - return l[:-1] - - - - - - - - - - - - - - diff --git a/graphlearn/graphlearn.py b/graphlearn/graphlearn.py index 5e98232..4d76728 100644 --- a/graphlearn/graphlearn.py +++ b/graphlearn/graphlearn.py @@ -12,6 +12,7 @@ from eden.graph import Vectorizer from eden.util import serialize_dict import logging +from utils import cycles logger = logging.getLogger(__name__) @@ -155,6 +156,7 @@ def sample(self, graph_iter, similarity=-1, n_samples=None, + estimate_flowback = False, batch_size=10, n_jobs=0, max_cycle_size=False, @@ -167,6 +169,7 @@ def sample(self, graph_iter, select_cip_max_tries=20, burnin=0, + generator_mode=False, omit_seed=True, keep_duplicates=False): @@ -187,7 +190,7 @@ def sample(self, graph_iter, :param n_steps: how many samplesteps are conducted :param burnin: do this many steps before collecting samples :param max_cycle_size: max allowed size (slow) - + :estimate_flowback: True to calculate new and old options # sampling strategy :param target_orig_cip: we will use the estimator to determine weak regions in the graph that need improvement @@ -209,6 +212,12 @@ def sample(self, graph_iter, :return: yield graphs """ + self.estimate_backflow = estimate_flowback + if self.estimate_backflow: + print 'WARNING you set estimate backflow. the implementation is a little sketchy ' \ + 'so dont try this with weired graphs. ' + + self.similarity = similarity @@ -216,7 +225,7 @@ def sample(self, graph_iter, max_cycle_size = 2 * max_cycle_size - self.feasibility_checker.checklist.append(feasibility.cycles(max_cycle_size)) + self.feasibility_checker.checklist.append(cycles.cycles(max_cycle_size)) if probabilistic_core_choice + score_core_choice + max_core_size_diff == -1 > 1: raise Exception('choose max one cip choice strategy') @@ -319,7 +328,11 @@ def _sample(self, graph): if graph is None: return None # prepare variables and graph - graph = self._sample_init(graph) + try: + graph = self._sample_init(graph) + except Exception as exc: + logger.debug(exc) + return None self._score_list = [graph._score] self.sample_path = [] accept_counter = 0 @@ -399,7 +412,6 @@ def _sample_init(self, graph): graph = self.vectorizer._edge_to_vertex_transform(graph) if self.max_core_size_diff > -1: self.seed_size = len(graph) - self._score(graph) self._sample_notes = '' self._sample_path_score_set = set() @@ -517,14 +529,50 @@ def _propose_graph(self, graph): graph_new = core_substitution(graph, original_cip.graph, candidate_cip.graph) if self.feasibility_checker.check(graph_new): + + graph_clean(graph_new) - logger.debug("_propose_graph: iteration %d ; core %d of %d ; original_cips tried %d" % + # postproc may fail + tmp= self.postprocessor.postprocess(graph_new) + if tmp: + self.reverse_direction_probability(graph,tmp,original_cip) + logger.debug("_propose_graph: iteration %d ; core %d of %d ; original_cips tried %d" % (self.step, attempt, choices, orig_cip_ctr)) - return self.postprocessor.postprocess(graph_new) + + return tmp if self.quick_skip_orig_cip: break + def reverse_direction_probability(self,graph,graph_new,cip): + + # it is sufficient to look at the old cip, since the interface ids shoud be the same in the new graph.. 
+ + + def ops(g,cip): + # possible operations for stuff + counter=0 + for n,d in cip.graph.nodes(data=True): + if 'edge' not in d and 'interface' in d: + cip=extract_core_and_interface(n, g, [cip.thickness], [cip.radius], vectorizer=self.vectorizer, + hash_bitmask=self.hash_bitmask, filter=self.node_entity_check) + if cip: + cip=cip[0] + if cip.interface_hash in self.lsgg.productions: + counter+=len(self.lsgg.productions[cip.interface_hash]) + return counter + + if self.estimate_backflow: + value=len(graph) / float(len(graph_new)) + print 'flow1: %f' % value + one=ops(graph, cip) + two=ops(graph_new,cip) + print one,two + value+= one/float(two) + graph.graph['flowback'] = value + print 'flow: %f' % value + + def _select_cips(self, cip, graph): """ :param cip: the cip we selected from the graph diff --git a/graphlearn/graphtools.py b/graphlearn/graphtools.py index 814edf5..8ff4275 100644 --- a/graphlearn/graphtools.py +++ b/graphlearn/graphtools.py @@ -5,7 +5,6 @@ import logging import traceback from eden.graph import Vectorizer - logger = logging.getLogger(__name__) @@ -91,8 +90,9 @@ def extract_core_and_interface(root_node=None, :param hash_bitmask: :return: radius_list*thicknes_list long list of cips """ - + DEBUG=False if not filter(graph, root_node): + if DEBUG:print 'filta' return [] if 'hlabel' not in graph.node[graph.nodes()[0]]: vectorizer._label_preprocessing(graph) @@ -114,13 +114,15 @@ def extract_core_and_interface(root_node=None, cip_list = [] for thickness_ in thickness_list: for radius_ in radius_list: - + if DEBUG:print 'thickrad',thickness_,radius_ # see if it is feasable to extract if radius_ + thickness_ not in node_dict: + if DEBUG:print 'jump1',node_dict continue core_graph_nodes = [item for x in range(radius_ + 1) for item in node_dict.get(x, [])] if not filter(master_cip_graph, core_graph_nodes): + if DEBUG:print 'jump2' continue core_hash = graph_hash(master_cip_graph.subgraph(core_graph_nodes), hash_bitmask) @@ -199,7 +201,9 @@ def merge(graph, node, node2): def find_all_isomorphisms(home, other): if iso.faster_could_be_isomorphic(home, other): + label_matcher = lambda x, y: x['distance_dependent_label'] == y['distance_dependent_label'] + graph_label_matcher = iso.GraphMatcher(home, other, node_match=label_matcher) for index, mapping in enumerate(graph_label_matcher.isomorphisms_iter()): if index == 15: # give up .. diff --git a/graphlearn/utils/cycles.py b/graphlearn/utils/cycles.py new file mode 100644 index 0000000..e5a3b18 --- /dev/null +++ b/graphlearn/utils/cycles.py @@ -0,0 +1,128 @@ +from collections import defaultdict + + + + +# for the checking we need a function that takes a graph.. so we make one.. :) +def cycles(max_cycle_size): + return lambda x:not problem_cycle(x,max_cycle_size) + + +def problem_cycle(graph,max_cycle_size): + ''' + for each node we check if a questionable cycle exists + :param graph: + :param max_cycle_size: + :return: problem aru ? True + ''' + for n in graph.nodes_iter(): + if rooted_problem_cycle(graph,n,max_cycle_size): + return True + return False + +def rooted_problem_cycle(graph,n,max_cycle_size): + """ + :param graph: + :param n: start node + :param max_cycle_size: + :return: do we have a questionable cycle that includes n + + + + so we start in node n, + then we expand 1 node further in each step. + if we meet a node we had before we found a cycle. + + there are 3 possible cases. 
+ - frontier hits frontier -> cycle of even length + - frontier hits visited nodes -> cycle of uneven length + - it is also possible that the newly found cycle doesn't contain our start node, so we check for that + """ + + frontier= set([n]) + step=0 + visited=set() + parent=defaultdict(list) + + + + while frontier: + #print frontier + step+=1 + + # give me new nodes: + next=[] + for front_node in frontier: + new = set(graph.neighbors(front_node)) - visited + next.append( new ) + for e in new: + parent[e].append(front_node) + + # we merge the new nodes. if 2 sets collide, we found a cycle of even length + while len(next)>1: + # merge + s1=next[1] + s2=next[0] + merge = s1 | s2 + + # check if we have a cycle => s1,s2 overlap + if len(merge) < len(s1)+len(s2): + col= s1&s2 + if root_is_last_common_ancestor(col,parent,n): + if step*2 > max_cycle_size: + return True + return False + + #delete from list + next[0]=merge + del next[1] + next=next[0] + + # now we need to check for cycles of uneven length => the new nodes hit the old frontier + if len(next & frontier) > 0: + col= next&frontier + if root_is_last_common_ancestor(col,parent,n): + + if step*2-1 > max_cycle_size: + return True + return False + + # we know that the current frontier didn't close cycles so we don't need to look at them again + visited = visited | frontier + frontier=next + + return False + +def root_is_last_common_ancestor(col,parent,n): + + # any should be fine. e is closing a cycle, + # note: e might contain more than one hit but we don't care + e= col.pop() + #print 'r',e + # we closed a cycle on e so e has 2 parents... + li=parent[e] + a = [li[0]] + b = [li[1]] + #print 'pre',a,b + # get the path until the root node + a=extend_path_to_root(a,parent,n) + b=extend_path_to_root(b,parent,n) + #print 'comp',a,b + # if the paths to the root node don't overlap, the root node must be in the loop + return len(set(a) & set(b)) ==0 + + +def extend_path_to_root(l,parent,n): + """ + :param l: list with start node + :param parent: the tree like dictionary that contains each node's parent(s) + :param n: root node. probably we don't really need this since the root node is the orphan + :return: path from l to n, by moving up the tree. note that we don't care to get the shortest path. + """ + current=l[-1] + while current != n: + l.append( parent[current][0] ) + current=l[-1] + return l[:-1] +
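+ +# usage sketch: cycles(k) returns a function graph -> bool for the feasibility checklist, +# e.g. feasibility_checker.checklist.append(cycles(max_cycle_size)) as done in graphlearn.py; +# it yields True iff the graph contains no problematic (roughly: longer than k) cycle.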