From 0b52df04a69ef651c143c0b901d56f09107f26b6 Mon Sep 17 00:00:00 2001 From: Minniti Julien Date: Fri, 11 Oct 2024 18:17:39 +0200 Subject: [PATCH 1/2] Chromosome location function --- navigation/aio.py | 29 +++++++++++++++-- tfinder/__init__.py | 79 ++++++++++++++------------------------------- 2 files changed, 51 insertions(+), 57 deletions(-) diff --git a/navigation/aio.py b/navigation/aio.py index c193656e..010b9ab4 100644 --- a/navigation/aio.py +++ b/navigation/aio.py @@ -458,7 +458,9 @@ def aio_page(): name = "n.d." species = "n.d" region = "n.d" - dna_sequences.append((name, dna_sequence, species, region)) + strand = "n.d" + tss_ch = 0 + dna_sequences.append((name, dna_sequence, species, region, strand, tss_ch)) elif lines.startswith(">"): lines = dna_sequence.split("\n") i = 0 @@ -490,7 +492,22 @@ def aio_page(): region = "n.d" dna_sequence = lines[i + 1].upper() isfasta = IMO.is_dna(dna_sequence) - dna_sequences.append((name, dna_sequence, found_species, region)) + + match = re.search(r"Strand:\s*(\w+)", line) + if match: + strand = match.group(1).lower() + if strand not in ["plus", "minus"]: + strand = "n.d" + else: + strand = "n.d" + + match = re.search(r"TSS \(on chromosome\):\s*(\d+)", line) + if match: + tss_ch = match.group(1) + else: + tss_ch = 0 + + dna_sequences.append((name, dna_sequence, found_species, region, strand, tss_ch)) i += 1 else: i += 1 @@ -499,7 +516,10 @@ def aio_page(): else: isfasta = False - total_sequences_region_length = sum(len(dna_sequence) for _, dna_sequence, _, _ in dna_sequences) + if st.button("Try"): + st.toast(tss_ch) + + total_sequences_region_length = sum(len(dna_sequence) for _, dna_sequence, _, _, _, _ in dna_sequences) total_sequences = len(dna_sequences) # RE entry @@ -776,6 +796,9 @@ def aio_page(): df = pd.DataFrame(st.session_state['individual_motif_occurrences'][1:], columns=st.session_state['individual_motif_occurrences'][0]) + + df = df.sort_values(by='Rel Score', ascending=False) + st.session_state['df'] = df st.markdown('**Table**') diff --git a/tfinder/__init__.py b/tfinder/__init__.py index 0053cc1e..5c7d49a5 100644 --- a/tfinder/__init__.py +++ b/tfinder/__init__.py @@ -140,7 +140,7 @@ def find_sequences(self): return entrez_id, message if not self.all_slice_forms: - variant, gene_name, title, chraccver, chrstart, chrstop, species_API, message = NCBIdna.get_gene_info( + variant, gene_name, title, chraccver, chrstart, chrstop, strand, species_API, message = NCBIdna.get_gene_info( entrez_id, self.gr, gene_name_error=self.gene_id) if variant == "Error 200": return variant, message @@ -167,9 +167,9 @@ def find_sequences(self): dna_sequence = NCBIdna.get_dna_sequence(prom_term, upstream, downstream, chraccver, chrstart, chrstop) if prom_term == 'promoter': - dna_sequence = f">{variant} {gene_name} | {title} {chraccver} | {self.prom_term} | TSS (on chromosome): {chrstart + 1} | TSS (on sequence): {self.upstream}\n{dna_sequence}\n" + dna_sequence = f">{variant} {gene_name} | {title} {chraccver} | Strand: {strand} | {self.prom_term} | TSS (on chromosome): {chrstart + 1} | TSS (on sequence): {self.upstream}\n{dna_sequence}\n" else: - dna_sequence = f">{variant} {gene_name} | {title} {chraccver} | {self.prom_term} | Gene end (on chromosome): {chrstop} | Gene end (on sequence): {self.upstream}\n{dna_sequence}\n" + dna_sequence = f">{variant} {gene_name} | {title} {chraccver} | Strand: {strand} | {self.prom_term} | Gene end (on chromosome): {chrstop} | Gene end (on sequence): {self.upstream}\n{dna_sequence}\n" return dna_sequence, "OK" @@ -232,12 +232,14 @@ def get_gene_info(entrez_id, gr="Current", from_id=True, gene_name_error=None): if from_id: variant = NCBIdna.all_variant(entrez_id, from_id=True) - return variant, gene_name, title, chraccver, chrstart, chrstop, species_API, ( + strand = "plus" if chrstart < chrstop else "minus" + + return variant, gene_name, title, chraccver, chrstart, chrstop, strand, species_API, ( f"Response 200: Info for {entrez_id} {variant} {gene_name} retrieved." f"Entrez_ID: {entrez_id} | Gene name: {gene_name} | Genome Assembly: {title} | ChrAccVer: {chraccver}" - f"ChrStart/ChrStop: {chrstart}/{chrstop} | Species: {species_API}") + f"ChrStart/ChrStop: {chrstart}/{chrstop} | Strand: {strand} | Species: {species_API}") except Exception as e: - return "Error 200", None, None, None, None, None, None, f"Info for {gene_name_error} {entrez_id} not found." + return "Error 200", None, None, None, None, None, None, None, f"Info for {gene_name_error} {entrez_id} not found." elif response.status_code == 429: time.sleep(random.uniform(0.25, 0.5)) @@ -252,12 +254,11 @@ def extract_genomic_info(gene_id, gene_info, gr): time.sleep(1) - # Extraction depuis la section locationhist location_hist = gene_details.get('locationhist', []) if len(location_hist) == 0: location_hist = gene_details.get('genomicinfo', []) for loc in location_hist: - nc_accver = loc.get('chraccver') # Extrait le NC_XXXXX.YY + nc_accver = loc.get('chraccver') chrstart = loc.get('chrstart') chrstop = loc.get('chrstop') @@ -266,68 +267,39 @@ def extract_genomic_info(gene_id, gene_info, gr): if base_accession not in accession_dict: accession_dict[base_accession] = (chrstart, chrstop) else: - # Conserver la première occurrence des coordonnées existing_start, existing_stop = accession_dict[base_accession] accession_dict[base_accession] = (min(existing_start, chrstart), max(existing_stop, chrstop)) nc_dict = accession_dict - # Affichage des NC_ et leurs informations de position - # for base_accver, (chrstart, chrstop) in nc_dict.items(): - # print(f"{base_accver}: chrstart={chrstart}, chrstop={chrstop}") - - # Filtrer pour garder uniquement les entrées qui commencent par "NC_" nc_dict = {base_accver: (chrstart, chrstop) for base_accver, (chrstart, chrstop) in nc_dict.items() if base_accver.startswith("NC_")} - # Si le dictionnaire n'est pas vide, récupérez la base avant le point du premier élément if nc_dict: - first_base = next(iter(nc_dict)).split('.')[0] # Récupérer la base avant le point de la première clé + first_base = next(iter(nc_dict)).split('.')[0] - # Filtrer le dictionnaire pour ne garder que les éléments avec la même base nc_dict = {base_accver: (chrstart, chrstop) for base_accver, (chrstart, chrstop) in nc_dict.items() if base_accver.split('.')[0] == first_base} - # Votre code existant pour afficher le dictionnaire filtré - # print("\nDictionnaire filtré :") - # for base_accver, (chrstart, chrstop) in nc_dict.items(): - # print(f"{base_accver}: chrstart={chrstart}, chrstop={chrstop}") - - # Trouver l'entrée avec le chiffre le plus grand et le plus petit après le point max_version = -1 max_accver = None - max_coords = None # Pour stocker les coordonnées associées à la version maximale - min_version = float('inf') # Initialiser à l'infini pour trouver la plus petite version + max_coords = None + min_version = float('inf') min_accver = None - min_coords = None # Pour stocker les coordonnées associées à la version minimale + min_coords = None for base_accver in nc_dict.keys(): - version = int(base_accver.split('.')[1]) # Extraire la version après le point + version = int(base_accver.split('.')[1]) - # Mise à jour pour la version maximale if version > max_version: max_version = version max_accver = base_accver - max_coords = nc_dict[base_accver] # Stocker les coordonnées + max_coords = nc_dict[base_accver] - # Mise à jour pour la version minimale if version < min_version: min_version = version min_accver = base_accver - min_coords = nc_dict[base_accver] # Stocker les coordonnées - - # Afficher les résultats - # if max_accver: - # print(f"\nL'accès avec la version la plus élevée est : {max_accver} avec la version {max_version}.") - # print(f"Coordonnées : chrstart={max_coords[0]}, chrstop={max_coords[1]}") - # else: - # print("\nAucun accès trouvé.") - # - # if min_accver: - # print(f"L'accès avec la version la plus basse est : {min_accver} avec la version {min_version}.") - # print(f"Coordonnées : chrstart={min_coords[0]}, chrstop={min_coords[1]}") - # else: - # print("Aucun accès trouvé.") + min_coords = nc_dict[base_accver] if gr != "Current": title = NCBIdna.fetch_nc_info(min_accver) @@ -539,7 +511,7 @@ def all_variant(entrez_id, from_id=False): if len(all_variants) > 0: return all_variants, f"Transcript(s) found(s) for {entrez_id}: {all_variants}" else: - gene_name, title, chraccver, chrstart, chrstop, species_API, message = NCBIdna.get_gene_info( + gene_name, title, chraccver, chrstart, chrstop, strand, species_API, message = NCBIdna.get_gene_info( entrez_id, from_id=False) if gene_name == "Error 200": all_variants.append(("Error 200", None, None, None, None, None)) @@ -735,7 +707,7 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar=None, progress_bar.update(1) random_scores = np.array(matrix_random_scores) - for name, dna_sequence, species, region in dna_sequences: + for name, dna_sequence, species, region, strand_seq, tss_ch in dna_sequences: if calc_pvalue == 'ATGCProportion': count_a = dna_sequence.count('A') count_t = dna_sequence.count('T') @@ -826,6 +798,9 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar=None, if tss_ge_distance is not None: tis_position = position - tss_ge_distance - 1 + if strand_seq in ["plus", "minus"]: + ch_pos = int(tss_ch) - tis_position if strand_seq == "minus" else int(tss_ch) + tis_position + if normalized_score >= threshold: if calc_pvalue is not None: @@ -834,6 +809,8 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar=None, row = [position] if tss_ge_distance is not None: row.append(tis_position) + if strand_seq in ["plus", "minus"]: + row.append(ch_pos) row += [sequence_with_context, "{:.6f}".format(normalized_score).ljust(12)] if calc_pvalue is not None: @@ -842,17 +819,11 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar=None, individual_motif_occurrences.append(row) if len(individual_motif_occurrences) > 0: - if tss_ge_distance is not None and calc_pvalue is not None: - individual_motif_occurrences.sort(key=lambda x: (-float(x[3]), -float(x[4]))) - elif calc_pvalue is not None: - individual_motif_occurrences.sort(key=lambda x: (-float(x[2]), -float(x[3]))) - elif tss_ge_distance is not None: - individual_motif_occurrences.sort(key=lambda x: (-float(x[3]))) - else: - individual_motif_occurrences.sort(key=lambda x: (-float(x[2]))) header = ["Position"] if tss_ge_distance is not None: header.append("Rel Position") + if strand_seq in ["plus", "minus"]: + header.append("Ch Position") header += ["Sequence", "Rel Score"] if calc_pvalue is not None: header.append("p-value") From 94e8f6498c2ffcd1a99aa6a9b1ae2ffc5db93ae3 Mon Sep 17 00:00:00 2001 From: Minniti Julien Date: Fri, 18 Oct 2024 10:50:55 +0200 Subject: [PATCH 2/2] fix chromosome location --- navigation/aio.py | 24 +++++++++++++----------- tfinder/__init__.py | 20 +++++++++++++------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/navigation/aio.py b/navigation/aio.py index 010b9ab4..eb0b7608 100644 --- a/navigation/aio.py +++ b/navigation/aio.py @@ -122,7 +122,8 @@ def result_table_output(df): y=alt.Y('Rel Score:Q', axis=alt.Axis(title='Relative Score'), scale=alt.Scale(domain=[ystart, ystop])), color=alt.condition(gene_region_selection, color_scale, alt.value('lightgray')), - tooltip=['Sequence', 'Position'] + (['Rel Position'] if "Rel Position" in source else []) + ['Rel Score'] + ( + tooltip=['Sequence', 'Position'] + (['Rel Position'] if "Rel Position" in source else []) + ( + ['Ch Position'] if "Ch Position" in source else []) + ['Rel Score'] + ( ['p-value'] if 'p-value' in source else []) + ['Gene', 'Species', 'Region'], opacity=alt.condition(gene_region_selection, alt.value(0.8), alt.value(0.2)) ).transform_calculate(x=f'datum[{xcol_param.name}]').properties(width=600, @@ -178,7 +179,8 @@ def aio_page(): # Species st.markdown("🔹 :blue[**Step 1.2**] Species of gene names and sliced variants:") col1, col2, col3 = st.columns(3) - gr = col1.selectbox("Genome:", ["Current", "Previous"], index=0, help='Example for Homo sapiens:\n\n"Current" is GRCh38\n\n"Previous" is GRCh37') + gr = col1.selectbox("Genome:", ["Current", "Previous"], index=0, + help='Example for Homo sapiens:\n\n"Current" is GRCh38\n\n"Previous" is GRCh37') species = col2.selectbox("Species:", ["Human", "Mouse", "Rat", "Drosophila", "Zebrafish"], index=0) col3.markdown("") col3.markdown("") @@ -227,7 +229,8 @@ def aio_page(): pbar.progress(i / len(gene_ids), text=f'**:blue[Extract sequence... {gene_id}] ⚠️:red[PLEASE WAIT UNTIL END WITHOUT CHANGING ANYTHING]**') result_promoter_output, message = NCBIdna(gene_id, prom_term, upstream, downstream, - species, gr, all_slice_forms=True if all_variants else False).find_sequences() + species, gr, + all_slice_forms=True if all_variants else False).find_sequences() if message == "OK": pbar.progress((i + 1) / len(gene_ids), @@ -516,9 +519,6 @@ def aio_page(): else: isfasta = False - if st.button("Try"): - st.toast(tss_ch) - total_sequences_region_length = sum(len(dna_sequence) for _, dna_sequence, _, _, _, _ in dna_sequences) total_sequences = len(dna_sequences) @@ -566,7 +566,8 @@ def aio_page(): help="Only PWM generated with our tools are allowed") matrix_str = st.text_area("🔹 :blue[**Step 2.3**] Matrix:", value="A [ 20.0 0.0 0.0 0.0 0.0 0.0 0.0 100.0 0.0 60.0 20.0 ]\nT [ 60.0 20.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ]\nG [ 0.0 20.0 100.0 0.0 0.0 100.0 100.0 0.0 100.0 40.0 0.0 ]\nC [ 20.0 60.0 0.0 100.0 100.0 0.0 0.0 0.0 0.0 0.0 80.0 ]" - if 'MATRIX_STR_save' not in st.session_state else st.session_state['MATRIX_STR_save'], + if 'MATRIX_STR_save' not in st.session_state else st.session_state[ + 'MATRIX_STR_save'], label_visibility='collapsed', height=125) st.session_state['MATRIX_STR_save'] = matrix_str @@ -602,7 +603,8 @@ def aio_page(): help='Put FASTA sequences. Same sequence length required ⚠') individual_motif = st.text_area("🔹 :blue[**Step 2.3**] Sequences:", value=">seq1\nCTGCCGGAGGA\n>seq2\nAGGCCGGAGGC\n>seq3\nTCGCCGGAGAC\n>seq4\nCCGCCGGAGCG\n>seq5\nAGGCCGGATCG" - if 'individual_motif_save' not in st.session_state else st.session_state['individual_motif_save'], + if 'individual_motif_save' not in st.session_state else + st.session_state['individual_motif_save'], label_visibility='collapsed') st.session_state['individual_motif_save'] = individual_motif individual_motif = individual_motif.upper() @@ -727,8 +729,9 @@ def aio_page(): calc_pvalue = None with BSFcol3: - st.markdown("🔹 :blue[**_Experimental_**] Analyse all directions", help='Directions: **original (+ →)**, **reverse-complement (- ←)**, reverse (+ ←), complement (- →)\n\n' - 'Directions in bold are the default directions.') + st.markdown("🔹 :blue[**_Experimental_**] Analyse all directions", + help='Directions: **original (+ →)**, **reverse-complement (- ←)**, reverse (+ ←), complement (- →)\n\n' + 'Directions in bold are the default directions.') alldirection = st.toggle('All directions') if alldirection: st.markdown( @@ -774,7 +777,6 @@ def aio_page(): if st.button("🔹 :blue[**Step 2.6**] Click here to find motif in your sequences 🔎 🧬", use_container_width=True, disabled=button): - motif = motifs.Motif(counts=matrix) pwm = motif.counts.normalize(pseudocounts=0.1) log_odds_matrix = pwm.log_odds() diff --git a/tfinder/__init__.py b/tfinder/__init__.py index 5c7d49a5..c0d0f238 100644 --- a/tfinder/__init__.py +++ b/tfinder/__init__.py @@ -127,7 +127,7 @@ def find_sequences(self): result_promoter = f'Please verify {self.gene_id} variant' return result_promoter else: - variant, gene_name, title, chraccver, chrstart, chrstop, species_API = NCBIdna.get_variant_info( + variant, gene_name, title, chraccver, chrstart, chrstop, strand, species_API = NCBIdna.get_variant_info( entrez_id, self.gene_id) else: @@ -386,6 +386,8 @@ def get_variant_info(entrez_id, variant): chrstart = int(end_coords[0]) + 1 chrstop = int(start_coords[-1]) + 1 + strand = "plus" if chrstart < chrstop else "minus" + while True: url2 = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gene&id={entrez_id}&retmode=json&rettype=xml" response = requests.get(url2, headers=headers) @@ -399,7 +401,7 @@ def get_variant_info(entrez_id, variant): else: gene_name = 'Bad ID' - return variant, gene_name, title, chraccver, chrstart, chrstop, species_API + return variant, gene_name, title, chraccver, chrstart, chrstop, strand, species_API elif response.status_code == 429: time.sleep(random.uniform(0.25, 0.5)) @@ -624,14 +626,15 @@ def transform_matrix(matrix, alldirection): @staticmethod # Generate random sequences for p_value - def generate_ranseq(probabilities, seq_length, progress_bar): + def generate_ranseq(probabilities, seq_length, progress_bar=None): motif_length = seq_length random_sequences = [] for _ in range(1000000): random_sequence = IMO.generate_random_sequence(motif_length, probabilities) random_sequences.append(random_sequence) - progress_bar.update(1) + if progress_bar: + progress_bar.update(1) return random_sequences @@ -704,7 +707,8 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar=None, else: normalized_random_score = (random_score - min_score) / (max_score - min_score) matrix_random_scores.append(normalized_random_score) - progress_bar.update(1) + if progress_bar: + progress_bar.update(1) random_scores = np.array(matrix_random_scores) for name, dna_sequence, species, region, strand_seq, tss_ch in dna_sequences: @@ -752,7 +756,8 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar=None, else: normalized_random_score = (random_score - min_score) / (max_score - min_score) matrix_random_scores.append(normalized_random_score) - progress_bar.update(1) + if progress_bar: + progress_bar.update(1) random_scores = np.array(matrix_random_scores) @@ -766,7 +771,8 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar=None, position = int(i) + 1 found_positions.append((position, seq, normalized_score)) - progress_bar.update(1) + if progress_bar: + progress_bar.update(1) # Sort positions in descending order of score percentage found_positions.sort(key=lambda x: x[1], reverse=True)