From 8998ff8f789bd3e7d0be9b15ecabbc0bf241c6aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 21 Mar 2024 17:51:47 +0100 Subject: [PATCH 01/10] formats: add COSMIC parser based-on CellBase parser, #TASK-5913, #TASK-5318 --- .../variant/VariantAnnotationUtils.java | 689 ++++++++++++++++++ .../formats/variant/cosmic/CosmicParser.java | 486 ++++++++++++ .../variant/cosmic/CosmicParserCallback.java | 10 + .../variant/cosmic/CosmicParserTest.java | 54 ++ .../src/test/resources/cosmic.small.tsv.gz | Bin 0 -> 8342 bytes .../biodata/models/common/DataVersion.java | 135 ++++ .../models/sequence/SequenceLocation.java | 87 +++ 7 files changed, 1461 insertions(+) create mode 100644 biodata-formats/src/main/java/org/opencb/biodata/formats/variant/VariantAnnotationUtils.java create mode 100755 biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParser.java create mode 100644 biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserCallback.java create mode 100644 biodata-formats/src/test/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserTest.java create mode 100644 biodata-formats/src/test/resources/cosmic.small.tsv.gz create mode 100644 biodata-models/src/main/java/org/opencb/biodata/models/common/DataVersion.java create mode 100644 biodata-models/src/main/java/org/opencb/biodata/models/sequence/SequenceLocation.java diff --git a/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/VariantAnnotationUtils.java b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/VariantAnnotationUtils.java new file mode 100644 index 00000000..58cab2ea --- /dev/null +++ b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/VariantAnnotationUtils.java @@ -0,0 +1,689 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.biodata.formats.variant; + +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.models.variant.annotation.ConsequenceTypeMappings; +import org.opencb.biodata.models.variant.annotation.exceptions.SOTermNotAvailableException; +import org.opencb.biodata.models.variant.avro.*; + +import java.util.*; + +/** + * Created by fjlopez on 22/06/15. + */ +public class VariantAnnotationUtils { + + public static final char SEPARATOR_CHAR = ':'; + + public static final String THREEPRIME_OVERLAPPING_NCRNA = "3prime_overlapping_ncrna"; + public static final String IG_C_GENE = "IG_C_gene"; + public static final String IG_C_PSEUDOGENE = "IG_C_pseudogene"; + public static final String IG_D_GENE = "IG_D_gene"; + public static final String IG_J_GENE = "IG_J_gene"; + public static final String IG_J_PSEUDOGENE = "IG_J_pseudogene"; + public static final String IG_V_GENE = "IG_V_gene"; + public static final String IG_V_PSEUDOGENE = "IG_V_pseudogene"; + public static final String MT_RRNA = "Mt_rRNA"; + public static final String MT_TRNA = "Mt_tRNA"; + public static final String TR_C_GENE = "TR_C_gene"; + public static final String TR_D_GENE = "TR_D_gene"; + public static final String TR_J_GENE = "TR_J_gene"; + public static final String TR_J_PSEUDOGENE = "TR_J_pseudogene"; + public static final String TR_V_GENE = "TR_V_gene"; + public static final String TR_V_PSEUDOGENE = "TR_V_pseudogene"; + public static final String ANTISENSE = "antisense"; + public static final String LINCRNA = "lincRNA"; + public static final String MIRNA = "miRNA"; + public static final String MISC_RNA = "misc_RNA"; + public static final String POLYMORPHIC_PSEUDOGENE = "polymorphic_pseudogene"; + public static final String PROCESSED_PSEUDOGENE = "processed_pseudogene"; + public static final String PROCESSED_TRANSCRIPT = "processed_transcript"; + public static final String PROTEIN_CODING = "protein_coding"; + public static final String PSEUDOGENE = "pseudogene"; + public static final String RRNA = "rRNA"; + public static final String SENSE_INTRONIC = "sense_intronic"; + public static final String SENSE_OVERLAPPING = "sense_overlapping"; + public static final String SNRNA = "snRNA"; + public static final String SNORNA = "snoRNA"; + public static final String NONSENSE_MEDIATED_DECAY = "nonsense_mediated_decay"; + public static final String NMD_TRANSCRIPT_VARIANT = "NMD_transcript_variant"; + public static final String UNPROCESSED_PSEUDOGENE = "unprocessed_pseudogene"; + public static final String TRANSCRIBED_UNPROCESSED_PSEUDGENE = "transcribed_unprocessed_pseudogene"; + public static final String RETAINED_INTRON = "retained_intron"; + public static final String NON_STOP_DECAY = "non_stop_decay"; + public static final String UNITARY_PSEUDOGENE = "unitary_pseudogene"; + public static final String TRANSLATED_PROCESSED_PSEUDOGENE = "translated_processed_pseudogene"; + public static final String TRANSCRIBED_PROCESSED_PSEUDOGENE = "transcribed_processed_pseudogene"; + public static final String TRNA_PSEUDOGENE = "tRNA_pseudogene"; + public static final String SNORNA_PSEUDOGENE = "snoRNA_pseudogene"; + public static final String SNRNA_PSEUDOGENE = "snRNA_pseudogene"; + public static final String SCRNA_PSEUDOGENE = "scRNA_pseudogene"; + public static final String RRNA_PSEUDOGENE = "rRNA_pseudogene"; + public static final String MISC_RNA_PSEUDOGENE = "misc_RNA_pseudogene"; + public static final String MIRNA_PSEUDOGENE = "miRNA_pseudogene"; + public static final String NON_CODING = "non_coding"; + public static final String AMBIGUOUS_ORF = "ambiguous_orf"; + public static final String KNOWN_NCRNA = "known_ncrna"; + public static final String RETROTRANSPOSED = "retrotransposed"; + public static final String TRANSCRIBED_UNITARY_PSEUDOGENE = "transcribed_unitary_pseudogene"; + public static final String TRANSLATED_UNPROCESSED_PSEUDOGENE = "translated_unprocessed_pseudogene"; + public static final String LRG_GENE = "LRG_gene"; + + public static final String INTERGENIC_VARIANT = "intergenic_variant"; + public static final String REGULATORY_REGION_VARIANT = "regulatory_region_variant"; + public static final String TF_BINDING_SITE_VARIANT = "TF_binding_site_variant"; + public static final String UPSTREAM_GENE_VARIANT = "upstream_gene_variant"; + public static final String TWOKB_UPSTREAM_VARIANT = "2KB_upstream_variant"; + public static final String DOWNSTREAM_GENE_VARIANT = "downstream_gene_variant"; + public static final String TWOKB_DOWNSTREAM_VARIANT = "2KB_downstream_variant"; + public static final String SPLICE_DONOR_VARIANT = "splice_donor_variant"; + public static final String SPLICE_ACCEPTOR_VARIANT = "splice_acceptor_variant"; + public static final String INTRON_VARIANT = "intron_variant"; + public static final String SPLICE_REGION_VARIANT = "splice_region_variant"; + public static final String FIVE_PRIME_UTR_VARIANT = "5_prime_UTR_variant"; + public static final String THREE_PRIME_UTR_VARIANT = "3_prime_UTR_variant"; + public static final String INCOMPLETE_TERMINAL_CODON_VARIANT = "incomplete_terminal_codon_variant"; + public static final String STOP_RETAINED_VARIANT = "stop_retained_variant"; + public static final String START_RETAINED_VARIANT = "start_retained_variant"; + public static final String SYNONYMOUS_VARIANT = "synonymous_variant"; + public static final String INITIATOR_CODON_VARIANT = "initiator_codon_variant"; + public static final String START_LOST = "start_lost"; + public static final String STOP_GAINED = "stop_gained"; + public static final String STOP_LOST = "stop_lost"; + public static final String MISSENSE_VARIANT = "missense_variant"; + public static final String MATURE_MIRNA_VARIANT = "mature_miRNA_variant"; + public static final String NON_CODING_TRANSCRIPT_EXON_VARIANT = "non_coding_transcript_exon_variant"; + public static final String NON_CODING_TRANSCRIPT_VARIANT = "non_coding_transcript_variant"; + public static final String INFRAME_INSERTION = "inframe_insertion"; + public static final String INFRAME_VARIANT = "inframe_variant"; + public static final String FRAMESHIFT_VARIANT = "frameshift_variant"; + public static final String CODING_SEQUENCE_VARIANT = "coding_sequence_variant"; + public static final String TRANSCRIPT_ABLATION = "transcript_ablation"; + public static final String TRANSCRIPT_AMPLIFICATION = "transcript_amplification"; + public static final String COPY_NUMBER_CHANGE = "copy_number_change"; + public static final String TERMINATOR_CODON_VARIANT = "terminator_codon_variant"; + public static final String FEATURE_TRUNCATION = "feature_truncation"; + public static final String FEATURE_VARIANT = "feature_variant"; + public static final String STRUCTURAL_VARIANT = "structural_variant"; + public static final String INFRAME_DELETION = "inframe_deletion"; + + public static final String CDS_START_NF = "cds_start_NF"; + public static final String CDS_END_NF = "cds_end_NF"; + + public static final String FUNCTION_UNCERTAIN_VARIANT = "function_uncertain_variant"; + + public static final Map> IS_SYNONYMOUS_CODON = new HashMap<>(); + public static final Map> MT_IS_SYNONYMOUS_CODON = new HashMap<>(); + public static final Map SO_NAMES_CORRECTIONS = new HashMap<>(); + public static final Map> A_TO_CODON = new HashMap<>(); + public static final Map> MT_A_TO_CODON = new HashMap<>(); + public static final Map CODON_TO_A = new HashMap<>(); + public static final Map MT_CODON_TO_A = new HashMap<>(); + public static final Map COMPLEMENTARY_NT = new HashMap<>(); + public static final Map SIFT_DESCRIPTIONS = new HashMap<>(); + public static final Map POLYPHEN_DESCRIPTIONS = new HashMap<>(); + public static final Map SO_SEVERITY = new HashMap<>(); + public static final Map ORIGIN_STRING_TO_ALLELE_ORIGIN = new HashMap<>(); + public static final Set CODING_SO_NAMES = new HashSet<>(); + public static final Map CLINVAR_CLINSIG_TO_ACMG = new HashMap<>(); + public static final Map CLINVAR_CLINSIG_TO_TRAIT_ASSOCIATION = new HashMap<>(); + public static final HashMap CLINVAR_REVIEW_TO_CONSISTENCY_STATUS = new HashMap<>(); + // Currently left empty since the only item within DrugResponseClassification that seemed to match any clinvar + // tag ("responsive") was removed at some point from the model + public static final Map CLINVAR_CLINSIG_TO_DRUG_RESPONSE = new HashMap<>(); + + public static final String MT = "MT"; + public static final String UNKNOWN_AMINOACID = "X"; + + public static final HashMap MODEOFINHERITANCE_MAP = new HashMap<>(); + public static final HashMap COSMIC_SOMATICSTATUS_TO_ALLELE_ORIGIN = new HashMap<>(); + public static final HashMap TO_ABBREVIATED_AA = new HashMap<>(40); // 22 AA + public static final HashMap TO_LONG_AA = new HashMap<>(40); // 22 AA + private static final String ATG = "ATG"; + private static final String ATA = "ATA"; + private static final String TAA = "TAA"; + private static final String TAG = "TAG"; + private static final String AGA = "AGA"; + private static final String AGG = "AGG"; + private static final String TGA = "TGA"; + + static { + + MODEOFINHERITANCE_MAP.put("autosomal dominant inheritance", ModeOfInheritance.monoallelic); + MODEOFINHERITANCE_MAP.put("autosomal dominant inheritance with maternal imprinting", + ModeOfInheritance.monoallelic_maternally_imprinted); + MODEOFINHERITANCE_MAP.put("autosomal dominant inheritance with paternal imprinting", + ModeOfInheritance.monoallelic_paternally_imprinted); + MODEOFINHERITANCE_MAP.put("autosomal recessive inheritance", + ModeOfInheritance.biallelic); + MODEOFINHERITANCE_MAP.put("mitochondrial inheritance", + ModeOfInheritance.mitochondrial); + MODEOFINHERITANCE_MAP.put("sex-limited autosomal dominant", + ModeOfInheritance.monoallelic); + MODEOFINHERITANCE_MAP.put("x-linked dominant inheritance", + ModeOfInheritance.xlinked_monoallelic); + MODEOFINHERITANCE_MAP.put("x-linked recessive inheritance", + ModeOfInheritance.xlinked_biallelic); + + CLINVAR_REVIEW_TO_CONSISTENCY_STATUS.put("criteria_provided_conflicting_interpretations", ConsistencyStatus.conflict); + CLINVAR_REVIEW_TO_CONSISTENCY_STATUS.put("criteria_provided_multiple_submitters_no_conflicts", ConsistencyStatus.congruent); + CLINVAR_REVIEW_TO_CONSISTENCY_STATUS.put("criteria_provided_single_submitter", ConsistencyStatus.congruent); + CLINVAR_REVIEW_TO_CONSISTENCY_STATUS.put("reviewed_by_expert_panel", ConsistencyStatus.congruent); + CLINVAR_REVIEW_TO_CONSISTENCY_STATUS.put("conflicting interpretations", ConsistencyStatus.conflict); + CLINVAR_REVIEW_TO_CONSISTENCY_STATUS.put("no conflicts", ConsistencyStatus.congruent); + CLINVAR_REVIEW_TO_CONSISTENCY_STATUS.put("single submitter", ConsistencyStatus.congruent); + CLINVAR_REVIEW_TO_CONSISTENCY_STATUS.put("reviewed by expert panel", ConsistencyStatus.congruent); + + + CLINVAR_CLINSIG_TO_ACMG.put("benign", ClinicalSignificance.benign); + CLINVAR_CLINSIG_TO_ACMG.put("likely benign", ClinicalSignificance.likely_benign); + CLINVAR_CLINSIG_TO_ACMG.put("conflicting interpretations of pathogenicity", ClinicalSignificance.uncertain_significance); + CLINVAR_CLINSIG_TO_ACMG.put("likely pathogenic", ClinicalSignificance.likely_pathogenic); + CLINVAR_CLINSIG_TO_ACMG.put("pathogenic", ClinicalSignificance.pathogenic); + CLINVAR_CLINSIG_TO_ACMG.put("uncertain significance", ClinicalSignificance.uncertain_significance); + CLINVAR_CLINSIG_TO_ACMG.put("conflicting data from submitters", ClinicalSignificance.uncertain_significance); + + CLINVAR_CLINSIG_TO_TRAIT_ASSOCIATION.put("risk factor", TraitAssociation.established_risk_allele); + CLINVAR_CLINSIG_TO_TRAIT_ASSOCIATION.put("protective", TraitAssociation.protective); + + /////////////////////////////////////////////////////////////////////// + ///// ClinVar and Cosmic allele origins to SO terms /////////////// + /////////////////////////////////////////////////////////////////////// + ORIGIN_STRING_TO_ALLELE_ORIGIN.put("germline", AlleleOrigin.germline_variant); + ORIGIN_STRING_TO_ALLELE_ORIGIN.put("maternal", AlleleOrigin.maternal_variant); + ORIGIN_STRING_TO_ALLELE_ORIGIN.put("de novo", AlleleOrigin.de_novo_variant); + ORIGIN_STRING_TO_ALLELE_ORIGIN.put("paternal", AlleleOrigin.paternal_variant); + ORIGIN_STRING_TO_ALLELE_ORIGIN.put("somatic", AlleleOrigin.somatic_variant); + + /////////////////////////////////////////////////////////////////////// + ///// GENETIC CODE //////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////// + A_TO_CODON.put("ALA", new ArrayList()); + A_TO_CODON.get("ALA").add("GCT"); + A_TO_CODON.get("ALA").add("GCC"); + A_TO_CODON.get("ALA").add("GCA"); + A_TO_CODON.get("ALA").add("GCG"); + A_TO_CODON.put("ARG", new ArrayList()); + A_TO_CODON.get("ARG").add("CGT"); + A_TO_CODON.get("ARG").add("CGC"); + A_TO_CODON.get("ARG").add("CGA"); + A_TO_CODON.get("ARG").add("CGG"); + A_TO_CODON.get("ARG").add("AGA"); + A_TO_CODON.get("ARG").add("AGG"); + A_TO_CODON.put("ASN", new ArrayList()); + A_TO_CODON.get("ASN").add("AAT"); + A_TO_CODON.get("ASN").add("AAC"); + A_TO_CODON.put("ASP", new ArrayList()); + A_TO_CODON.get("ASP").add("GAT"); + A_TO_CODON.get("ASP").add("GAC"); + A_TO_CODON.put("CYS", new ArrayList()); + A_TO_CODON.get("CYS").add("TGT"); + A_TO_CODON.get("CYS").add("TGC"); + A_TO_CODON.put("GLN", new ArrayList()); + A_TO_CODON.get("GLN").add("CAA"); + A_TO_CODON.get("GLN").add("CAG"); + A_TO_CODON.put("GLU", new ArrayList()); + A_TO_CODON.get("GLU").add("GAA"); + A_TO_CODON.get("GLU").add("GAG"); + A_TO_CODON.put("GLY", new ArrayList()); + A_TO_CODON.get("GLY").add("GGT"); + A_TO_CODON.get("GLY").add("GGC"); + A_TO_CODON.get("GLY").add("GGA"); + A_TO_CODON.get("GLY").add("GGG"); + A_TO_CODON.put("HIS", new ArrayList()); + A_TO_CODON.get("HIS").add("CAT"); + A_TO_CODON.get("HIS").add("CAC"); + A_TO_CODON.put("ILE", new ArrayList()); + A_TO_CODON.get("ILE").add("ATT"); + A_TO_CODON.get("ILE").add("ATC"); + A_TO_CODON.get("ILE").add("ATA"); + A_TO_CODON.put("LEU", new ArrayList()); + A_TO_CODON.get("LEU").add("TTA"); + A_TO_CODON.get("LEU").add("TTG"); + A_TO_CODON.get("LEU").add("CTT"); + A_TO_CODON.get("LEU").add("CTC"); + A_TO_CODON.get("LEU").add("CTA"); + A_TO_CODON.get("LEU").add("CTG"); + A_TO_CODON.put("LYS", new ArrayList()); + A_TO_CODON.get("LYS").add("AAA"); + A_TO_CODON.get("LYS").add("AAG"); + A_TO_CODON.put("MET", new ArrayList()); + A_TO_CODON.get("MET").add("ATG"); + A_TO_CODON.put("PHE", new ArrayList()); + A_TO_CODON.get("PHE").add("TTT"); + A_TO_CODON.get("PHE").add("TTC"); + A_TO_CODON.put("PRO", new ArrayList()); + A_TO_CODON.get("PRO").add("CCT"); + A_TO_CODON.get("PRO").add("CCC"); + A_TO_CODON.get("PRO").add("CCA"); + A_TO_CODON.get("PRO").add("CCG"); + A_TO_CODON.put("SER", new ArrayList()); + A_TO_CODON.get("SER").add("TCT"); + A_TO_CODON.get("SER").add("TCC"); + A_TO_CODON.get("SER").add("TCA"); + A_TO_CODON.get("SER").add("TCG"); + A_TO_CODON.get("SER").add("AGT"); + A_TO_CODON.get("SER").add("AGC"); + A_TO_CODON.put("THR", new ArrayList()); + A_TO_CODON.get("THR").add("ACT"); + A_TO_CODON.get("THR").add("ACC"); + A_TO_CODON.get("THR").add("ACA"); + A_TO_CODON.get("THR").add("ACG"); + A_TO_CODON.put("TRP", new ArrayList()); + A_TO_CODON.get("TRP").add("TGG"); + A_TO_CODON.put("TYR", new ArrayList()); + A_TO_CODON.get("TYR").add("TAT"); + A_TO_CODON.get("TYR").add("TAC"); + A_TO_CODON.put("VAL", new ArrayList()); + A_TO_CODON.get("VAL").add("GTT"); + A_TO_CODON.get("VAL").add("GTC"); + A_TO_CODON.get("VAL").add("GTA"); + A_TO_CODON.get("VAL").add("GTG"); + A_TO_CODON.put("STOP", new ArrayList()); + A_TO_CODON.get("STOP").add("TAA"); + A_TO_CODON.get("STOP").add("TGA"); + A_TO_CODON.get("STOP").add("TAG"); + + for (String aa : A_TO_CODON.keySet()) { + for (String codon : A_TO_CODON.get(aa)) { + IS_SYNONYMOUS_CODON.put(codon, new HashMap()); + CODON_TO_A.put(codon, aa); + } + } + for (String codon1 : IS_SYNONYMOUS_CODON.keySet()) { + Map codonEntry = IS_SYNONYMOUS_CODON.get(codon1); + for (String codon2 : IS_SYNONYMOUS_CODON.keySet()) { + codonEntry.put(codon2, false); + } + } + for (String aa : A_TO_CODON.keySet()) { + for (String codon1 : A_TO_CODON.get(aa)) { + for (String codon2 : A_TO_CODON.get(aa)) { + IS_SYNONYMOUS_CODON.get(codon1).put(codon2, true); + } + } + } + + /////////////////////////////////////////////////////////////////////// + ///// MITOCHONDRIAL GENETIC CODE ////////////////////////////////// + /////////////////////////////////////////////////////////////////////// + MT_A_TO_CODON.put("ALA", new ArrayList()); + MT_A_TO_CODON.get("ALA").add("GCT"); + MT_A_TO_CODON.get("ALA").add("GCC"); + MT_A_TO_CODON.get("ALA").add("GCA"); + MT_A_TO_CODON.get("ALA").add("GCG"); + MT_A_TO_CODON.put("ARG", new ArrayList()); + MT_A_TO_CODON.get("ARG").add("CGT"); + MT_A_TO_CODON.get("ARG").add("CGC"); + MT_A_TO_CODON.get("ARG").add("CGA"); + MT_A_TO_CODON.get("ARG").add("CGG"); + MT_A_TO_CODON.put("ASN", new ArrayList()); + MT_A_TO_CODON.get("ASN").add("AAT"); + MT_A_TO_CODON.get("ASN").add("AAC"); + MT_A_TO_CODON.put("ASP", new ArrayList()); + MT_A_TO_CODON.get("ASP").add("GAT"); + MT_A_TO_CODON.get("ASP").add("GAC"); + MT_A_TO_CODON.put("CYS", new ArrayList()); + MT_A_TO_CODON.get("CYS").add("TGT"); + MT_A_TO_CODON.get("CYS").add("TGC"); + MT_A_TO_CODON.put("GLN", new ArrayList()); + MT_A_TO_CODON.get("GLN").add("CAA"); + MT_A_TO_CODON.get("GLN").add("CAG"); + MT_A_TO_CODON.put("GLU", new ArrayList()); + MT_A_TO_CODON.get("GLU").add("GAA"); + MT_A_TO_CODON.get("GLU").add("GAG"); + MT_A_TO_CODON.put("GLY", new ArrayList()); + MT_A_TO_CODON.get("GLY").add("GGT"); + MT_A_TO_CODON.get("GLY").add("GGC"); + MT_A_TO_CODON.get("GLY").add("GGA"); + MT_A_TO_CODON.get("GLY").add("GGG"); + MT_A_TO_CODON.put("HIS", new ArrayList()); + MT_A_TO_CODON.get("HIS").add("CAT"); + MT_A_TO_CODON.get("HIS").add("CAC"); + MT_A_TO_CODON.put("ILE", new ArrayList()); + MT_A_TO_CODON.get("ILE").add("ATT"); + MT_A_TO_CODON.get("ILE").add("ATC"); + MT_A_TO_CODON.put("LEU", new ArrayList()); + MT_A_TO_CODON.get("LEU").add("TTA"); + MT_A_TO_CODON.get("LEU").add("TTG"); + MT_A_TO_CODON.get("LEU").add("CTT"); + MT_A_TO_CODON.get("LEU").add("CTC"); + MT_A_TO_CODON.get("LEU").add("CTA"); + MT_A_TO_CODON.get("LEU").add("CTG"); + MT_A_TO_CODON.put("LYS", new ArrayList()); + MT_A_TO_CODON.get("LYS").add("AAA"); + MT_A_TO_CODON.get("LYS").add("AAG"); + MT_A_TO_CODON.put("MET", new ArrayList()); + MT_A_TO_CODON.get("MET").add("ATG"); + MT_A_TO_CODON.get("MET").add("ATA"); + MT_A_TO_CODON.put("PHE", new ArrayList()); + MT_A_TO_CODON.get("PHE").add("TTT"); + MT_A_TO_CODON.get("PHE").add("TTC"); + MT_A_TO_CODON.put("PRO", new ArrayList()); + MT_A_TO_CODON.get("PRO").add("CCT"); + MT_A_TO_CODON.get("PRO").add("CCC"); + MT_A_TO_CODON.get("PRO").add("CCA"); + MT_A_TO_CODON.get("PRO").add("CCG"); +// A_TO_CODON.put("SEC", new ArrayList<>()); +// A_TO_CODON.get("SEC").add("TGA"); + MT_A_TO_CODON.put("SER", new ArrayList()); + MT_A_TO_CODON.get("SER").add("TCT"); + MT_A_TO_CODON.get("SER").add("TCC"); + MT_A_TO_CODON.get("SER").add("TCA"); + MT_A_TO_CODON.get("SER").add("TCG"); + MT_A_TO_CODON.get("SER").add("AGT"); + MT_A_TO_CODON.get("SER").add("AGC"); + MT_A_TO_CODON.put("THR", new ArrayList()); + MT_A_TO_CODON.get("THR").add("ACT"); + MT_A_TO_CODON.get("THR").add("ACC"); + MT_A_TO_CODON.get("THR").add("ACA"); + MT_A_TO_CODON.get("THR").add("ACG"); + MT_A_TO_CODON.put("TRP", new ArrayList()); + MT_A_TO_CODON.get("TRP").add("TGG"); + MT_A_TO_CODON.get("TRP").add("TGA"); + MT_A_TO_CODON.put("TYR", new ArrayList()); + MT_A_TO_CODON.get("TYR").add("TAT"); + MT_A_TO_CODON.get("TYR").add("TAC"); + MT_A_TO_CODON.put("VAL", new ArrayList()); + MT_A_TO_CODON.get("VAL").add("GTT"); + MT_A_TO_CODON.get("VAL").add("GTC"); + MT_A_TO_CODON.get("VAL").add("GTA"); + MT_A_TO_CODON.get("VAL").add("GTG"); + MT_A_TO_CODON.put("STOP", new ArrayList()); + MT_A_TO_CODON.get("STOP").add("TAA"); + MT_A_TO_CODON.get("STOP").add("TAG"); + MT_A_TO_CODON.get("STOP").add("AGA"); + MT_A_TO_CODON.get("STOP").add("AGG"); + + for (String aa : MT_A_TO_CODON.keySet()) { + for (String codon : MT_A_TO_CODON.get(aa)) { + MT_IS_SYNONYMOUS_CODON.put(codon, new HashMap()); + MT_CODON_TO_A.put(codon, aa); + } + } + for (String codon1 : MT_IS_SYNONYMOUS_CODON.keySet()) { + Map codonEntry = MT_IS_SYNONYMOUS_CODON.get(codon1); + for (String codon2 : MT_IS_SYNONYMOUS_CODON.keySet()) { + codonEntry.put(codon2, false); + } + } + for (String aa : MT_A_TO_CODON.keySet()) { + for (String codon1 : MT_A_TO_CODON.get(aa)) { + for (String codon2 : MT_A_TO_CODON.get(aa)) { + MT_IS_SYNONYMOUS_CODON.get(codon1).put(codon2, true); + } + } + } + + /* + Aminoacid abbreviation map + */ + TO_ABBREVIATED_AA.put("ALA", "A"); + TO_ABBREVIATED_AA.put("ARG", "R"); + TO_ABBREVIATED_AA.put("ASN", "N"); + TO_ABBREVIATED_AA.put("ASP", "D"); + TO_ABBREVIATED_AA.put("ASX", "B"); + TO_ABBREVIATED_AA.put("CYS", "C"); + TO_ABBREVIATED_AA.put("GLU", "E"); + TO_ABBREVIATED_AA.put("GLN", "Q"); + TO_ABBREVIATED_AA.put("GLX", "Z"); + TO_ABBREVIATED_AA.put("GLY", "G"); + TO_ABBREVIATED_AA.put("HIS", "H"); + TO_ABBREVIATED_AA.put("ILE", "I"); + TO_ABBREVIATED_AA.put("LEU", "L"); + TO_ABBREVIATED_AA.put("LYS", "K"); + TO_ABBREVIATED_AA.put("MET", "M"); + TO_ABBREVIATED_AA.put("PHE", "F"); + TO_ABBREVIATED_AA.put("PRO", "P"); + TO_ABBREVIATED_AA.put("SEC", "U"); + TO_ABBREVIATED_AA.put("SER", "S"); + TO_ABBREVIATED_AA.put("THR", "T"); + TO_ABBREVIATED_AA.put("TRP", "W"); + TO_ABBREVIATED_AA.put("TYR", "Y"); + TO_ABBREVIATED_AA.put("VAL", "V"); + TO_ABBREVIATED_AA.put("STOP", "O"); + + for (String aa : TO_ABBREVIATED_AA.keySet()) { + TO_LONG_AA.put(TO_ABBREVIATED_AA.get(aa), buildUpperLowerCaseString(aa)); + } + + COMPLEMENTARY_NT.put('A', 'T'); + COMPLEMENTARY_NT.put('a', 't'); + COMPLEMENTARY_NT.put('C', 'G'); + COMPLEMENTARY_NT.put('c', 'g'); + COMPLEMENTARY_NT.put('G', 'C'); + COMPLEMENTARY_NT.put('g', 'c'); + COMPLEMENTARY_NT.put('T', 'A'); + COMPLEMENTARY_NT.put('t', 'a'); + COMPLEMENTARY_NT.put('N', 'N'); + COMPLEMENTARY_NT.put('n', 'n'); + + POLYPHEN_DESCRIPTIONS.put(0, "probably damaging"); + POLYPHEN_DESCRIPTIONS.put(1, "possibly damaging"); + POLYPHEN_DESCRIPTIONS.put(2, "benign"); + POLYPHEN_DESCRIPTIONS.put(3, "unknown"); + + SIFT_DESCRIPTIONS.put(0, "tolerated"); + SIFT_DESCRIPTIONS.put(1, "deleterious"); + + SO_SEVERITY.put("copy_number_change", 42); + SO_SEVERITY.put("transcript_ablation", 41); + SO_SEVERITY.put("structural_variant", 40); + SO_SEVERITY.put("splice_acceptor_variant", 39); + SO_SEVERITY.put("splice_donor_variant", 38); + SO_SEVERITY.put("stop_gained", 37); + SO_SEVERITY.put("frameshift_variant", 36); + SO_SEVERITY.put("stop_lost", 35); + SO_SEVERITY.put("terminator_codon_variant", 34); + SO_SEVERITY.put("start_lost", 34); + SO_SEVERITY.put("initiator_codon_variant", 33); + SO_SEVERITY.put("transcript_amplification", 32); + SO_SEVERITY.put("inframe_insertion", 31); + SO_SEVERITY.put("inframe_deletion", 30); + SO_SEVERITY.put("inframe_variant", 29); + SO_SEVERITY.put("missense_variant", 28); + SO_SEVERITY.put("splice_region_variant", 27); + SO_SEVERITY.put("incomplete_terminal_codon_variant", 26); + SO_SEVERITY.put("stop_retained_variant", 25); + SO_SEVERITY.put("start_retained_variant", 24); + SO_SEVERITY.put("synonymous_variant", 23); + SO_SEVERITY.put("coding_sequence_variant", 22); + SO_SEVERITY.put("mature_miRNA_variant", 21); + SO_SEVERITY.put("5_prime_UTR_variant", 20); + SO_SEVERITY.put("3_prime_UTR_variant", 19); + SO_SEVERITY.put("non_coding_transcript_exon_variant", 18); + SO_SEVERITY.put("intron_variant", 17); + SO_SEVERITY.put("NMD_transcript_variant", 16); + SO_SEVERITY.put("non_coding_transcript_variant", 15); + SO_SEVERITY.put("2KB_upstream_variant", 14); + SO_SEVERITY.put("upstream_gene_variant", 13); + SO_SEVERITY.put("2KB_downstream_variant", 12); + SO_SEVERITY.put("downstream_gene_variant", 11); + SO_SEVERITY.put("TFBS_ablation", 10); + SO_SEVERITY.put("TFBS_amplification", 9); + SO_SEVERITY.put("TF_binding_site_variant", 8); + SO_SEVERITY.put("regulatory_region_ablation", 7); + SO_SEVERITY.put("regulatory_region_amplification", 6); + SO_SEVERITY.put("regulatory_region_variant", 5); + SO_SEVERITY.put("feature_elongation", 4); + SO_SEVERITY.put("feature_truncation", 3); + SO_SEVERITY.put("feature_variant", 2); + SO_SEVERITY.put("intergenic_variant", 1); + + CODING_SO_NAMES.add(STOP_RETAINED_VARIANT); + CODING_SO_NAMES.add(START_RETAINED_VARIANT); + CODING_SO_NAMES.add(SYNONYMOUS_VARIANT); + CODING_SO_NAMES.add(STOP_GAINED); + CODING_SO_NAMES.add(INITIATOR_CODON_VARIANT); + CODING_SO_NAMES.add(START_LOST); + CODING_SO_NAMES.add(STOP_LOST); + CODING_SO_NAMES.add(MISSENSE_VARIANT); + + SO_NAMES_CORRECTIONS.put("nc_transcript_variant", "non_coding_transcript_variant"); + SO_NAMES_CORRECTIONS.put("non_coding_exon_variant", "non_coding_transcript_exon_variant"); + } + + public static String buildUpperLowerCaseString(String aa) { + if (StringUtils.isEmpty(aa)) { + return null; + } + StringBuilder stringBuilder = new StringBuilder(aa); + + for (int i = 1; i < stringBuilder.length(); i++) { + stringBuilder.setCharAt(i, String.valueOf(stringBuilder.charAt(i)).toLowerCase().charAt(0)); + } + + return stringBuilder.toString(); + } + + public static Boolean isSynonymousCodon(String codon1, String codon2) { + return isSynonymousCodon(false, codon1, codon2); + } + + public static Boolean isSynonymousCodon(Boolean mitochondrialCode, String codon1, String codon2) { +// Map geneticCode = null; + if (mitochondrialCode) { + return MT_IS_SYNONYMOUS_CODON.get(codon1.toUpperCase()).get(codon2.toUpperCase()); + } else { + return IS_SYNONYMOUS_CODON.get(codon1.toUpperCase()).get(codon2.toUpperCase()); + } + } + + public static Boolean isStopCodon(String codon) { + return isStopCodon(false, codon); + } + + public static Boolean isStopCodon(boolean mitochondrialCode, String codon) { + if (mitochondrialCode) { + if (codon.equals(TAA) || codon.equals(TAG) || codon.equals(AGA) || codon.equals(AGG)) { + return true; + } + } else { + if (codon.equals(TAA) || codon.equals(TGA) || codon.equals(TAG)) { + return true; + } + } + return false; + } + + public static boolean isStartCodon(boolean mitochondrialCode, String codon) { + if (mitochondrialCode) { + if (codon.equals(ATG) || codon.equals(ATA)) { + return true; + } + } else { + if (codon.equals(ATG)) { + return true; + } + } + return false; + } + + + public static String getAminoacid(boolean mitochondrialCode, String codon) { + if (mitochondrialCode) { + return MT_CODON_TO_A.get(codon); + } else { + return CODON_TO_A.get(codon); + } + } + + public static List getSequenceOntologyTerms(Iterable soNames) throws SOTermNotAvailableException { + List sequenceOntologyTerms = new ArrayList<>(); + for (String name : soNames) { + name = fixSONameIfNeeded(name); + sequenceOntologyTerms.add(newSequenceOntologyTerm(name)); + } + return sequenceOntologyTerms; + } + + private static String fixSONameIfNeeded(String name) { + String fixedName = SO_NAMES_CORRECTIONS.get(name); + return fixedName == null ? name : fixedName; + } + + public static SequenceOntologyTerm newSequenceOntologyTerm(String name) throws SOTermNotAvailableException { + return new SequenceOntologyTerm(ConsequenceTypeMappings.getSoAccessionString(name), name); + } + + public static String buildVariantId(String chromosome, int start, String reference, String alternate) { + StringBuilder stringBuilder = new StringBuilder(); + + appendChromosome(chromosome, stringBuilder) + .append(SEPARATOR_CHAR) + .append(StringUtils.leftPad(Integer.toString(start), 10, " ")) + .append(SEPARATOR_CHAR); + +// if (reference.length() > Variant.SV_THRESHOLD) { +// stringBuilder.append(new String(CryptoUtils.encryptSha1(reference))); +// } else if (!(reference == null || reference.isEmpty() || reference.equals("-"))) { + if (!(reference == null || reference.isEmpty() || reference.equals("-"))) { + stringBuilder.append(reference); + } + stringBuilder.append(SEPARATOR_CHAR); +// if (alternate.length() > Variant.SV_THRESHOLD) { +// stringBuilder.append(new String(CryptoUtils.encryptSha1(alternate))); +// } else if (!(alternate == null || alternate.isEmpty() || alternate.equals("-"))) { + if (!(alternate == null || alternate.isEmpty() || alternate.equals("-"))) { + stringBuilder.append(alternate); + } + return stringBuilder.toString(); + } + + protected static StringBuilder appendChromosome(String chromosome, StringBuilder stringBuilder) { + if (chromosome.length() == 1 && Character.isDigit(chromosome.charAt(0))) { + stringBuilder.append(' '); + } + return stringBuilder.append(chromosome); + } + + public static String reverseComplement(String string) { + StringBuilder stringBuilder = new StringBuilder(string).reverse(); + for (int i = 0; i < stringBuilder.length(); i++) { + char nextNt = stringBuilder.charAt(i); + // Protection against weird characters, e.g. alternate:"TBS" found in ClinVar + if (VariantAnnotationUtils.COMPLEMENTARY_NT.containsKey(nextNt)) { + stringBuilder.setCharAt(i, VariantAnnotationUtils.COMPLEMENTARY_NT.get(nextNt)); + } else { + return null; + } + } + return stringBuilder.toString(); + } + + + public static String translate(String dnaSequence) { + return translate(dnaSequence, "-"); + } + + public static String translate(String dnaSequence, String separator) { + StringBuilder aaSequence = new StringBuilder(); + dnaSequence = dnaSequence.toUpperCase(); + dnaSequence = dnaSequence.replaceAll("T", "U"); + for (int i = 0; i < dnaSequence.length(); i += 3) { + if (i + 2 < dnaSequence.length()) { + aaSequence.append(CODON_TO_A.get(dnaSequence.charAt(i) + dnaSequence.charAt(i + 1) + dnaSequence.charAt(i + 2))); + if (i + 3 < dnaSequence.length()) { + aaSequence.append(separator); + } + } + } + return aaSequence.toString(); + } + +} diff --git a/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParser.java b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParser.java new file mode 100755 index 00000000..ee13c4d3 --- /dev/null +++ b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParser.java @@ -0,0 +1,486 @@ +/* + * + * + */ + +package org.opencb.biodata.formats.variant.cosmic; + +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.formats.variant.VariantAnnotationUtils; +import org.opencb.biodata.models.sequence.SequenceLocation; +import org.opencb.biodata.models.variant.avro.*; +import org.opencb.commons.utils.FileUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.file.Path; +import java.text.NumberFormat; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class CosmicParser { + + private static final int GENE_NAMES_COLUMN = 0; + private static final int HGNC_COLUMN = 3; + private static final int PRIMARY_SITE_COLUMN = 7; + private static final int SITE_SUBTYPE_COLUMN = 8; + private static final int PRIMARY_HISTOLOGY_COLUMN = 11; + private static final int HISTOLOGY_SUBTYPE_COLUMN = 12; + private static final int ID_COLUMN = 16; + private static final int COSM_ID_COLUMN = 17; + private static final int HGVS_COLUMN = 19; + private static final int MUTATION_DESCRIPTION_COLUMN = 21; + private static final int MUTATION_ZYGOSITY_COLUMN = 22; + private static final int FATHMM_PREDICTION_COLUMN = 29; + private static final int FATHMM_SCORE_COLUMN = 30; + private static final int MUTATION_SOMATIC_STATUS_COLUMN = 31; + private static final int PUBMED_PMID_COLUMN = 32; + private static final int SAMPLE_SOURCE_COLUMN = 34; + private static final int TUMOUR_ORIGIN_COLUMN = 35; + + private static final String SYMBOL = "symbol"; + + private static final String HGVS_INSERTION_TAG = "ins"; + private static final String HGVS_SNV_CHANGE_SYMBOL = ">"; + private static final String HGVS_DELETION_TAG = "del"; + private static final String HGVS_DUPLICATION_TAG = "dup"; + private static final String CHROMOSOME = "CHR"; + private static final String START = "START"; + private static final String END = "END"; + private static final String REF = "REF"; + private static final String ALT = "ALT"; + + private static final String VARIANT_STRING_PATTERN = "[ACGT]*"; + + private static final Pattern mutationGRCh37GenomePositionPattern = Pattern.compile("(?<" + CHROMOSOME + ">\\S+):(?<" + START + ">\\d+)-(?<" + END + ">\\d+)"); + private static final Pattern snvPattern = Pattern.compile("c\\.\\d+((\\+|\\-|_)\\d+)?(?<" + REF + ">([ACTG])+)>(?<" + ALT + ">([ACTG])+)"); + + private static Logger logger = LoggerFactory.getLogger(CosmicParser.class); + + private CosmicParser() { + throw new IllegalStateException("Utility class"); + } + + /** + * Method to parse the COSMIC file and call the callback function for the evidence entries for the given location + * + * @param cosmicFile Cosmic file to parse + * @param version Cosmic version, e.g: v95 + * @param name Evidence source name, e.g.: cosmic + * @param assembly Assembly, e.g.: GRCh38 + * @param callback Callback function to process the evidence entries for that location + * @throws IOException + */ + public static void parse(Path cosmicFile, String version, String name, String assembly, CosmicParserCallback callback) + throws IOException { + + int totalNumberRecords = 0; + int ignoredCosmicLines = 0; + int numberProcessedRecords = 0; + int invalidPositionLines = 0; + int invalidSubstitutionLines = 0; + int invalidDeletionLines = 0; + int invalidInsertionLines = 0; + int invalidDuplicationLines = 0; + int invalidMutationCDSOtherReason = 0; + + try (BufferedReader cosmicReader = new BufferedReader(new InputStreamReader(FileUtils.newInputStream(cosmicFile)))) { + long t0; + long t1 = 0; + long t2 = 0; + List evidenceEntries = new ArrayList<>(); + SequenceLocation old = null; + + String headerLine = cosmicReader.readLine(); // First line is the header -> ignore it + logger.info("Skipping header line: {}", headerLine); + + String line; + while ((line = cosmicReader.readLine()) != null) { + String[] fields = line.split("\t", -1); + + t0 = System.currentTimeMillis(); + EvidenceEntry evidenceEntry = buildCosmic(name, version, assembly, fields); + t1 += System.currentTimeMillis() - t0; + + SequenceLocation sequenceLocation = parseLocation(fields); + if (sequenceLocation == null) { + invalidPositionLines++; + } + if (old == null) { + old = sequenceLocation; + } + + if (sequenceLocation != null) { + // Parse variant + boolean validVariant = false; + String mutationCds = fields[HGVS_COLUMN]; + VariantType variantType = getVariantType(mutationCds); + if (variantType != null) { + switch (variantType) { + case SNV: + validVariant = parseSnv(mutationCds, sequenceLocation); + if (!validVariant) { + invalidSubstitutionLines++; + } + break; + case DELETION: + validVariant = parseDeletion(mutationCds, sequenceLocation); + if (!validVariant) { + invalidDeletionLines++; + } + break; + case INSERTION: + validVariant = parseInsertion(mutationCds, sequenceLocation); + if (!validVariant) { + invalidInsertionLines++; + } + break; + case DUPLICATION: + validVariant = parseDuplication(mutationCds); + if (!validVariant) { + invalidDuplicationLines++; + } + break; + default: + logger.warn("Skipping unkonwn variant type = {}", variantType); + validVariant = false; + invalidMutationCDSOtherReason++; + } + } + + if (validVariant) { + if (sequenceLocation.getStart() == old.getStart() && sequenceLocation.getAlternate().equals(old.getAlternate())) { + evidenceEntries.add(evidenceEntry); + } else { + boolean success = callback.processEvidenceEntries(old, evidenceEntries); + t2 += System.currentTimeMillis() - t0; + if (success) { + numberProcessedRecords += evidenceEntries.size(); + } else { + ignoredCosmicLines += evidenceEntries.size(); + } + old = sequenceLocation; + evidenceEntries.clear(); + evidenceEntries.add(evidenceEntry); + } + } else { + ignoredCosmicLines++; + } + } else { + ignoredCosmicLines++; + } + totalNumberRecords++; + + if (totalNumberRecords % 10000 == 0) { + logger.info("totalNumberRecords = {}", totalNumberRecords); + logger.info("numberIndexedRecords = {} ({} %)", numberProcessedRecords, + (numberProcessedRecords * 100 / totalNumberRecords)); + logger.info("ignoredCosmicLines = {}", ignoredCosmicLines); + logger.info("buildCosmic time = {}", t1); + logger.info("callback time = {}", t2); + + t1 = 0; + t2 = 0; + } + } + } finally { + logger.info("Done"); + logger.info("Total number of parsed Cosmic records: {}", totalNumberRecords); + logger.info("Number of processed Cosmic records: {}", numberProcessedRecords); + NumberFormat formatter = NumberFormat.getInstance(); + if (logger.isInfoEnabled()) { + logger.info("{} cosmic lines ignored: ", formatter.format(ignoredCosmicLines)); + } + if (invalidPositionLines > 0 && logger.isInfoEnabled()) { + logger.info("\t- {} lines by invalid position", formatter.format(invalidPositionLines)); + } + if (invalidSubstitutionLines > 0 && logger.isInfoEnabled()) { + logger.info("\t- {} lines by invalid substitution CDS", formatter.format(invalidSubstitutionLines)); + } + if (invalidInsertionLines > 0 && logger.isInfoEnabled()) { + logger.info("\t- {} lines by invalid insertion CDS", formatter.format(invalidInsertionLines)); + } + if (invalidDeletionLines > 0 && logger.isInfoEnabled()) { + logger.info("\t- {} lines by invalid deletion CDS", formatter.format(invalidDeletionLines)); + } + if (invalidDuplicationLines > 0 && logger.isInfoEnabled()) { + logger.info("\t- {} lines because mutation CDS is a duplication", formatter.format(invalidDuplicationLines)); + } + if (invalidMutationCDSOtherReason > 0 && logger.isInfoEnabled()) { + logger.info("\t- {} lines because mutation CDS is invalid for other reasons", + formatter.format(invalidMutationCDSOtherReason)); + } + } + } + + private static VariantType getVariantType(String mutationCds) { + if (mutationCds.contains(HGVS_SNV_CHANGE_SYMBOL)) { + return VariantType.SNV; + } else if (mutationCds.contains(HGVS_DELETION_TAG)) { + return VariantType.DELETION; + } else if (mutationCds.contains(HGVS_INSERTION_TAG)) { + return VariantType.INSERTION; + } else if (mutationCds.contains(HGVS_DUPLICATION_TAG)) { + return VariantType.DUPLICATION; + } else { + return null; + } + } + + private static boolean parseDuplication(String dup) { + // TODO: The only Duplication in Cosmic V70 is a structural variation that is not going to be serialized + return false; + } + + private static boolean parseInsertion(String mutationCds, SequenceLocation sequenceLocation) { + boolean validVariant = true; + String[] insParts = mutationCds.split("ins"); + + if (insParts.length > 1) { + String insertedNucleotides = insParts[1]; + if (insertedNucleotides.matches("\\d+") || !insertedNucleotides.matches(VARIANT_STRING_PATTERN)) { + //c.503_508ins30 + validVariant = false; + } else { + sequenceLocation.setReference(""); + sequenceLocation.setAlternate(getPositiveStrandString(insertedNucleotides, sequenceLocation.getStrand())); + } + } else { + validVariant = false; + } + + return validVariant; + } + + private static boolean parseDeletion(String mutationCds, SequenceLocation sequenceLocation) { + boolean validVariant = true; + String[] mutationCDSArray = mutationCds.split("del"); + + // For deletions, only deletions of, at most, deletionLength nucleotide are allowed + if (mutationCDSArray.length < 2) { // c.503_508del (usually, deletions of several nucleotides) + // TODO: allow these variants + validVariant = false; + } else if (mutationCDSArray[1].matches("\\d+") + || !mutationCDSArray[1].matches(VARIANT_STRING_PATTERN)) { // Avoid allele strings containing Ns, for example + validVariant = false; + } else { + sequenceLocation.setReference(getPositiveStrandString(mutationCDSArray[1], sequenceLocation.getStrand())); + sequenceLocation.setAlternate(""); + } + + return validVariant; + } + + private static boolean parseSnv(String mutationCds, SequenceLocation sequenceLocation) { + boolean validVariant = true; + Matcher snvMatcher = snvPattern.matcher(mutationCds); + + if (snvMatcher.matches()) { + String ref = snvMatcher.group(REF); + String alt = snvMatcher.group(ALT); + if (!ref.equalsIgnoreCase("N") && !alt.equalsIgnoreCase("N")) { + sequenceLocation.setReference(getPositiveStrandString(ref, sequenceLocation.getStrand())); + sequenceLocation.setAlternate(getPositiveStrandString(alt, sequenceLocation.getStrand())); + } else { + validVariant = false; + } + } else { + validVariant = false; + } + + return validVariant; + } + + private static String getPositiveStrandString(String alleleString, String strand) { + if (strand.equals("-")) { + return reverseComplementary(alleleString); + } else { + return alleleString; + } + } + + private static String reverseComplementary(String alleleString) { + char[] reverseAlleleString = new StringBuilder(alleleString).reverse().toString().toCharArray(); + for (int i = 0; i < reverseAlleleString.length; i++) { + reverseAlleleString[i] = VariantAnnotationUtils.COMPLEMENTARY_NT.get(reverseAlleleString[i]); + } + + return String.valueOf(reverseAlleleString); + } + + private static EvidenceEntry buildCosmic(String name, String version, String assembly, String[] fields) { + String id = fields[ID_COLUMN]; + String url = "https://cancer.sanger.ac.uk/cosmic/search?q=" + id; + + EvidenceSource evidenceSource = new EvidenceSource(name, version, null); + SomaticInformation somaticInformation = getSomaticInformation(fields); + List genomicFeatureList = getGenomicFeature(fields); + + List additionalProperties = new ArrayList<>(); + additionalProperties.add(new Property("COSM_ID", "Legacy COSM ID", fields[COSM_ID_COLUMN])); + additionalProperties.add(new Property("MUTATION_DESCRIPTION", "Description", fields[MUTATION_DESCRIPTION_COLUMN])); + if (StringUtils.isNotEmpty(fields[MUTATION_ZYGOSITY_COLUMN])) { + additionalProperties.add(new Property("MUTATION_ZYGOSITY", "Mutation Zygosity", fields[MUTATION_ZYGOSITY_COLUMN])); + } + additionalProperties.add(new Property("FATHMM_PREDICTION", "FATHMM Prediction", fields[FATHMM_PREDICTION_COLUMN])); + additionalProperties.add(new Property("FATHMM_SCORE", "FATHMM Score", "0" + fields[FATHMM_SCORE_COLUMN])); + additionalProperties.add(new Property("MUTATION_SOMATIC_STATUS", "Mutation Somatic Status", + fields[MUTATION_SOMATIC_STATUS_COLUMN])); + + List bibliography = getBibliography(fields[PUBMED_PMID_COLUMN]); + + return new EvidenceEntry(evidenceSource, Collections.emptyList(), somaticInformation, + url, id, assembly, + getAlleleOriginList(Collections.singletonList(fields[MUTATION_SOMATIC_STATUS_COLUMN])), + Collections.emptyList(), genomicFeatureList, null, null, null, null, + EthnicCategory.Z, null, null, null, additionalProperties, bibliography); + } + + private static SomaticInformation getSomaticInformation(String[] fields) { + String primarySite = null; + if (!isMissing(fields[PRIMARY_SITE_COLUMN])) { + primarySite = fields[PRIMARY_SITE_COLUMN].replace("_", " "); + } + String siteSubtype = null; + if (!isMissing(fields[SITE_SUBTYPE_COLUMN])) { + siteSubtype = fields[SITE_SUBTYPE_COLUMN].replace("_", " "); + } + String primaryHistology = null; + if (!isMissing(fields[PRIMARY_HISTOLOGY_COLUMN])) { + primaryHistology = fields[PRIMARY_HISTOLOGY_COLUMN].replace("_", " "); + } + String histologySubtype = null; + if (!isMissing(fields[HISTOLOGY_SUBTYPE_COLUMN])) { + histologySubtype = fields[HISTOLOGY_SUBTYPE_COLUMN].replace("_", " "); + } + String tumourOrigin = null; + if (!isMissing(fields[TUMOUR_ORIGIN_COLUMN])) { + tumourOrigin = fields[TUMOUR_ORIGIN_COLUMN].replace("_", " "); + } + String sampleSource = null; + if (!isMissing(fields[SAMPLE_SOURCE_COLUMN])) { + sampleSource = fields[SAMPLE_SOURCE_COLUMN].replace("_", " "); + } + + return new SomaticInformation(primarySite, siteSubtype, primaryHistology, histologySubtype, tumourOrigin, sampleSource); + } + + private static List getBibliography(String bibliographyString) { + if (!isMissing(bibliographyString)) { + return Collections.singletonList("PMID:" + bibliographyString); + } + + return Collections.emptyList(); + } + + private static List getGenomicFeature(String[] fields) { + List genomicFeatureList = new ArrayList<>(5); + if (fields[GENE_NAMES_COLUMN].contains("_")) { + genomicFeatureList.add(createGeneGenomicFeature(fields[GENE_NAMES_COLUMN].split("_")[0])); + } + // Add transcript ID + if (StringUtils.isNotEmpty(fields[1])) { + genomicFeatureList.add(createGeneGenomicFeature(fields[1], FeatureTypes.transcript)); + } + if (!fields[HGNC_COLUMN].equalsIgnoreCase(fields[GENE_NAMES_COLUMN]) && !isMissing(fields[HGNC_COLUMN])) { + genomicFeatureList.add(createGeneGenomicFeature(fields[HGNC_COLUMN])); + } + + return genomicFeatureList; + } + + private static SequenceLocation parseLocation(String[] fields) { + SequenceLocation sequenceLocation = null; + String locationString = fields[25]; + if (StringUtils.isNotEmpty(locationString)) { + Matcher matcher = mutationGRCh37GenomePositionPattern.matcher(locationString); + if (matcher.matches()) { + sequenceLocation = new SequenceLocation(); + sequenceLocation.setChromosome(getCosmicChromosome(matcher.group(CHROMOSOME))); + sequenceLocation.setStrand(fields[26]); + + String mutationCds = fields[HGVS_COLUMN]; + VariantType variantType = getVariantType(mutationCds); + if (VariantType.INSERTION.equals(variantType)) { + sequenceLocation.setEnd(Integer.parseInt(matcher.group(START))); + sequenceLocation.setStart(Integer.parseInt(matcher.group(END))); + } else { + sequenceLocation.setStart(Integer.parseInt(matcher.group(START))); + sequenceLocation.setEnd(Integer.parseInt(matcher.group(END))); + } + } + } + return sequenceLocation; + } + + private static String getCosmicChromosome(String chromosome) { + switch (chromosome) { + case "23": + return "X"; + case "24": + return "Y"; + case "25": + return "MT"; + default: + return chromosome; + } + } + + private static GenomicFeature createGeneGenomicFeature(String gene) { + Map map = new HashMap<>(1); + map.put(SYMBOL, gene); + + return new GenomicFeature(FeatureTypes.gene, null, map); + } + + private static GenomicFeature createGeneGenomicFeature(String featureId, FeatureTypes featureTypes) { + Map map = new HashMap<>(1); + map.put(SYMBOL, featureId); + return new GenomicFeature(featureTypes, null, map); + } + + private static List getAlleleOriginList(List sourceOriginList) { + List alleleOrigin; + alleleOrigin = new ArrayList<>(sourceOriginList.size()); + for (String originString : sourceOriginList) { + if (VariantAnnotationUtils.ORIGIN_STRING_TO_ALLELE_ORIGIN.containsKey(originString)) { + alleleOrigin.add(VariantAnnotationUtils.ORIGIN_STRING_TO_ALLELE_ORIGIN.get(originString)); + } else { + logger.debug("No SO term found for allele origin {}. Skipping.", originString); + } + } + return alleleOrigin; + } + + private static boolean isMissing(String string) { + return !((string != null) && !string.isEmpty() + && !string.replace(" ", "") + .replace("not specified", "") + .replace("NS", "") + .replace("NA", "") + .replace("na", "") + .replace("NULL", "") + .replace("null", "") + .replace("\t", "") + .replace(".", "") + .replace("-", "").isEmpty()); + } +} diff --git a/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserCallback.java b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserCallback.java new file mode 100644 index 00000000..36614184 --- /dev/null +++ b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserCallback.java @@ -0,0 +1,10 @@ +package org.opencb.biodata.formats.variant.cosmic; + +import org.opencb.biodata.models.sequence.SequenceLocation; +import org.opencb.biodata.models.variant.avro.EvidenceEntry; + +import java.util.List; + +public interface CosmicParserCallback { + boolean processEvidenceEntries(SequenceLocation sequenceLocation, List evidenceEntries); +} diff --git a/biodata-formats/src/test/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserTest.java b/biodata-formats/src/test/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserTest.java new file mode 100644 index 00000000..8e6bb96a --- /dev/null +++ b/biodata-formats/src/test/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserTest.java @@ -0,0 +1,54 @@ +package org.opencb.biodata.formats.variant.cosmic; + +import org.junit.Assert; +import org.junit.Test; +import org.opencb.biodata.models.sequence.SequenceLocation; +import org.opencb.biodata.models.variant.avro.EvidenceEntry; + +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.List; + +public class CosmicParserTest { + + // Implementation of the LineCallback function + public class MyCallback implements CosmicParserCallback { + private String msg; + private int counter; + + public MyCallback(String msg) { + this.msg = msg; + this.counter = 0; + } + + @Override + public boolean processEvidenceEntries(SequenceLocation sequenceLocation, List evidenceEntries) { + System.out.println(msg); + System.out.println("Sequence location = " + sequenceLocation); + System.out.println("Num. evidences = " + evidenceEntries.size()); + for (EvidenceEntry evidenceEntry : evidenceEntries) { + System.out.println("evidences = " + evidenceEntry); + counter++; + } + return true; + } + + public int getCounter() { + return counter; + } + } + + @Test + public void testCosmicParser() throws IOException { + Path cosmicFile = Paths.get(getClass().getResource("/cosmic.small.tsv.gz").getPath()); + String version = "v95"; + String name = "cosmic"; + String assembly = "GRCh38"; + + MyCallback callback = new MyCallback(">>> Testing message"); + + CosmicParser.parse(cosmicFile, version, name, assembly, callback); + Assert.assertEquals(90, callback.getCounter()); + } +} \ No newline at end of file diff --git a/biodata-formats/src/test/resources/cosmic.small.tsv.gz b/biodata-formats/src/test/resources/cosmic.small.tsv.gz new file mode 100644 index 0000000000000000000000000000000000000000..e1539f827b47f510ab3ea6a6caa56c5f341ebaff GIT binary patch literal 8342 zcmV;HAZgzpiwFp*1^i_I17mMb;~t>MM^Y>}>>w$IV!=yrH^brFq|<;x=cqpOSQ z#(jya?QXf=Wzk(WUnbe>*=D{?qcOakZFb+buP^Bt-T#v9f60q~`8?lj*NgSjYjl~f zD!=3KJ=X9ZNBP&wH2&xO0iH3-(sUJ#KHYx4xf)JyzDx#_tIxMn*v|E*(O~%N-dlU? z-8R{xRzMTw7lT2$zDVJ>&tGs|zWK-N(;5`;8eM?qjc+-_vRNuK%f3H2ggNN(oWYP{Hu48_k8#Rz?&aC#W_enw<+R_3)=j^m2Yj zrBT<>c=v6yoo{zeNAcOse6vYcn>30dZX(Klq)c$ZsT6UsiemV42TJ}7Msqcc&aD-i zMPHL_o~*WK>&LU*>bKSUpQ~JXoKQohR8h3qWlw4L8b8keCw)MBpA)p4zDOBg@ZbU13z87E?=J4^M~mc)Ur!Ubr;`QrR$ePvROv% zHJQ%R#bUaAO&29PuN7z*po-8~Dy?9~L8C0w6rlm+Mn4S!ntLjl{*O8|v-QJ#^#slv zy#6+NPPcmy2~9{`tQth~mL43Cz$RNTv~W40wNic=&Y1gAUfc2!8rSL%1IGQQN!Y5b>ONrN37eCh`>=s zk6F5T{`bdh{YQ!qy8S$kUVQwGq-_Re^8zA{7R@Io>>~%oiv}?D@mIs4R1?I)GO7>@ zB{^ULF(K>pPr1sOHIga(;+9gut+HO_@S;KG#;_k4V_-z+qW&}b;?+~Lb~vq?*tl5j zTRWkgt1caL#=CSJH7cdLOVXr=|_aj`n2F#@9gdwc_LR(Dml!ZAA$dRngIkemz8AuVxy1(f1AB$pq+A$9JS z9fcvmExQ8(?o0^pmdA|Bztr9G?RwSZmKhguu{u;e13W*J$$)4RILM9e*vnsW#|**g zQ&Nt1hCTyf+H7FWz(cXlg zFyUU@$q}$kJUI}SJ5J8?I$I(bSvpH!w(IP)b7YLJqyoyP0!C?v4`gLN6##q;{rhS7 zC@lj%xXM!jYh_o+ed_^ofEYT-UYY?3l9zy>Ed5de7ds@0fsd;g?8YUl%+pK=A|DYu zz$W2QW$Z8>2Z{6(aFB3frtqR4QefVjiAO|Jdt!11XE0>Pu#z5;V`0P#s93A^@$oH* z*rB{74k*jCkj4MNpf@ZRG#*g-4;Rw4;Q}ZkRYnV|qIC7JUZ&e@zFRh(mg3v!*4lBg z6i$N6$GB4X+YuJaTu4`t4|w-AVbt2UrDL2kY2sp4BdC!1>28E#G}phep6z!5n~SznOFM3a5^!vMflna?!L$P&HuOQCo3|Yr!IUXUP)m zcK&$iy#&PSxL6$xDR0p|L=`7anmulSAuHR)k)80RyFx+WogOIsVAm zek5#seakTAV}=SzFpts1*!PxX|GLY5n{T&e{<3*lXUPNnBJ4xeOS2$Zg(`b40E~m8 zSL2ehylny0=rm#mdS)^%RtKq!QKGB10)}EF5bH9`)M_hjtqT;eg}UYYC^19H{(oWy zo|(a`KVl5S1IsaE3^21Qbeirt<0mCIf)nj&fnOaWW{K`hKKnq1!HDs;R444BtT>`z z{QzUYpdn0@e=B1xsLE9Wmb=+{lPsb~0B?~_pVHNQ3;Dm}aMORqhB3sehjg1}%lRtV zrvHXez>m(KR-lXNGF>E1Wl8`@K|l%(DT@!bAq!s1QVJF{8S!6@u`ca3xPGH^3l$DJ^WqqLyRQMgr z1*YNyLpev_DF=!Qgkuy!mKI`sT5?L132iA{g>nISp%1MVK$t>_TIy}N;1zBSqV%hM z`m-a-zwr!F^JknPp~4yBz+ghx1iUvwqnI-@cZP_yS0_QkC05Ie)uFTFxWTudA&^@% zcd79Bpm>Y4ne&BRl$;OcdPhg#&nO@0vV35Q@`0~Xqcd>v2WY<-wqkO~y?txTuMT0s zp#Aa11BgOS%-7};Xug{&A1C%56*uhGNWsc(jSAfw^D*5T?5|h4HNJzOq%aN}MMI}% z1QsU0o_1b16ErSXwGKOY%&rlXqrIEvfHPXWIsw6{G@=srQ7!6VWwd8M(#p_}L?&fi z8|F>@NXe-v`;m}sq0Ti?Tw-UJsu;2_;bVPkI}&`e@-+ars^vCB+ODwPv$I=~tl(Mh z={>Xlv7mX*Z+*^h*azkO4494fZAiv&I4F=Q3keM7Dcl(EbS^37q@7IFGfFt#JT6v; zPDg{U`=A2A<$Qy(wUMZN3zZbb_GP@6I9>qkQ^C1_Pz|I;1^3cm&>ecWmPkTFB~n8Y z1%WS&?kLIfW;-3#OuK5=gh}NGL)_|cFvLpR#fcy`VxA?UymYTRdh4`WO7sux*{vE5 zA({!VLlJbS+jZ!T+|@-^=`LGi9Gzug4J|=5C{_f;YH$}Jio#x*%MNh9!Z&-lChbvl zKRW<<<6?D~9T=rL3%qq*x6>JvsU>9QqF1s35T;+B_f}z=Z4LSrr4^;*lr-CYJzcFI z^62@-M(=*6O+%)wgj8Ypf#I3JsV|XJ9R%()kid>YICZOrMlblfPLLo&02kX{g*UW) zfQ_~?oqRLZ5jH{l`Je9{36#ns0XQO7L;^SGsq?h@4ycfBZ@$;(2svdmE>=xv$8`Fs zH$(zAXA82mwcT|sopuuSi^-rOqo?;2vlfnnmvD?M9I1{B`e8T2?5!o5h3{H9hoyVV7`jC9EzL=0k3M!0YKE2;$gm{ zN`3}1_MIVei#4^ zH1KJkjk#5gj+#z3`qnZAkbRH~Kxou{E`6bWCs5xZl9{33W^) zNC~NN9(4gJV5^V^K-LXGMRS)2VACFzJpr$CtVH@FDU$&q@G>a6!v zD+|_gWzLx&7C{O3N~S@tKbHHTh)gcps24KQXq-%>S}6_81UL}I(cESd^oE=aun^V? z-&mlGNea2Y!vdm!a9GeV`+P4nyEzdHY2#vbhy`q+EBpeXl40)8LGgp=6} zh~G>F5a|Zbdma@bVg}(dltv*1V|Y{qviu(TF`f&;z_q)A8%Tz?v=9r(YAMblP%cOe zhEOm5RCTW*Y{bQ?=7hQ@a*kEkfSNUHwIWubS$sN@%(rXy?t&w$On_usVHBp#!s7h7 z>kd4p2_{~%--7B+Df~+!tjVto8I4V9uUCOJ1ndsdA0*bP+SCfEBEnp(YHDS&^EL{{ zVINkZlOcjG(=D)rWCO%qvu4#+Bnz7L0t|a;_WH}OhiX*M7W`HOHc5zN!x{neK^YW^p)0w)GHC{8bvdk*7QSR~%F(@U){om_l6(tW+oliG#Lez?;4o9$Ad6e1Tt#kuEkw5dv_8Xmhxcp zxY~yUG||at&@?z+X6W|evJn&l5Q9$U#i*+t|GJkuB3jG5!b;nW=N2h0kbIozFs`(Z?Wgo;FD~4Ull`GOcyPsRh zLm}|-{N50E)7b`ZQ+z-t3iS*Kkw5R`oN1Hwrp@4?NJAe*6&#J;->ySJkYXo3N9D z<)Co=5OT5jeiDxfBw5s{AU%BCi;-GIhByRdnFfkS7Z30)*6zul2 z$PDRgoUe{wjyM{OS)=l?k_pM|X4R)X$dS9GJpw{0R<2d|=zdj% zs%hh;Lyh<5wKwhjD$v_an;H?QCZt1L75?AFS%Xs)n5079#E%W`PJV%H{*CX zx%bHyCqCH%cG8t>d3_%wTIupz_PSmri^DpN%x~`i)8$xoMnn{$u4$Fu*!{|gJp~Y; zdPIk#fU6oH+H@X?GS_Nr7Ve>dGPR%(1TP6F$kGply9>b};|)a|yy}KU91=!6w;61q zy@0dX0ZDzF&*sTux_Qac1o?YIO-Jc4%xQ_%EQAD$%_}e{#-#72|Cb>4*!^~&h-ngv zJZ^O`nnpW69qD+HXK$Xfgk-c>frOK`8gkf2k*A?(qh3+r&Q2j6pnNK)V?Mp7Q)CjZ zldF9vm-F78Tuf69Gd@ByqPe>V>dpk5zoN!Qr~Yr|8&|EhHdU*&bvCF@>()Ym=_s-7 z305t&$!Q@IHmG`}V#6UN!jA+}9C!jsDU5j&r35X84|^eaM}viKiPe~1i5N#Bd~K(d zQeycmE>`v46a)V!Y({T+QC!;ut4oyLYbjsBoIjl~{AXzbZMFd6}25}h&}B>&l5e+M{@O*jRr zf`g^Ss%gN9b~aXzm1&ym!jtR;C2Xq$gng831w%Nv zr|Kj(c8--`Mgl(>C_6c^GEegSA65XA`YNsK--x6A0GUrQO4}kCyVR z1h1OL#j1YEo{DbQ4Sw!+Z$Q4Kg;$ZKpc?qRepKUjczHLyxVjr*OO$a+6vN3qE%8d| zEM0B01ZP{b-%)>?*UdIv0`o}bE4T7$d|J#=r)!{xYoH|AdiJ`7H>RTuY=3%nbrmJ6 z1m`ce^I6UKCHFo^XrY*qp;lnsGzT@-gy90k`LX#W?}Lc@sLyJ{;=E{DDE=Va>;Y9G!JuB`SEZYys(=a68lI_nQ;&Pxne5CN9MnzF< zK{loVHAc@72z|7(KhG zzYs}Flg0PZWWm?I#bkB0b0-MPo2A4R%VRKNb)-%UR_9IDt?EMo^w=ev9wu9gRq#Q4 zbiZ2FC4NAYGg)f9$pXS{d}E2(?9{Q2vM1CU_&ArcOa(R1jaTS=jD-TrsjGKyjs*f+ zT&(H|i_o3r*}A#Xj=_~&kaBVbp4U&IqrcvM8cxCGF-tTOFa}f`FBg5AFXjMh=(Ig0 zi^aD^^6*fYYK2lPlSSw793>Lms>mm{AcZ$z%7NZd*XRM}=mXr(j2)er&dX1X#Ko%N zA~=3DL09oZDsJq-$HjrvvbpGS_ne||Y4hMk*g>TZh1J16koX(Mr2O06G#2|ejg!?n zOR>1oQje-R56DEZvPknwtBhl_(R&Z+QgXba5N4#MJaK>#C;sDNm46}}f7b>47Uho? zuP9bq#VH71DaA_p4rYy?3P*%3hUBmSvCyKs3x#8H*ER&m1_|yK%h82Nn41gP7 zoMios<%2H&+upVH#&KKWXaAKH6Yx&X<7VT?Mr1>F)=1w3!Byg-swCJkTJ+cVoWny7 zXLdL{!$wtxWaMRD;VdOE0MjMoPq(BXds& zw=$M36?=_1I(AuvT&~|bqXrkuKoGMU(B{bu0d0`4v4&#z&f#O?EjGNvzl+hlRT+l+ zhrfNEE1+}eVUyhg4{ypSkn_QTV()j0F+#9ujp*`ddcJK~UUb=GDrcI=smHCOI7PhL zh(lX|6&qNhF{s#oY0p+7*4yB^h(5&tL!dTCd7!g=DXmmwmE@hJGKSk z?^j9=QJhL%jAbq=PHfnHTN;rAO;IDNtPx#=-HOE#ze>3(=!(8sUUA7QKBfv0X9Zm= zTJiAl5hbhyU3nUX|40u>L(dRy1tq>P6+$hHx<9jpQMsapk%4Iqj^2N-oKQPj|3f(4 z&5|1Cq}N>@O`Z|&7oq9a<-4O!Fw24iQ1RD(gUc#gU&tTp5z=3tKHJ(P3?Fo7jfPQ6Z z^nBYjmdK%)HIOEr&Cpzm@|^)N^CioAX>UMf7m&`}{EPW0>b~}U!r=giYYQ?hEU@bG z==cGX%UWYoaqk$Eg9M%1n>3rU5nG=FxBcb!M!R8eyiI!(0q1&d3wxuST?I;yIY4c@ zJUX^Xm02vYXxx;GyL%L|(jL$A7!Yevv^KT!%j(}r@M|9wq@J)lm z0vh<`Jv{BmA~;2d0O=WP9!4vGfwe%Q5LoRc?5lZ&f&B(7*{s4SXHh`=fq>Upz&60( zFM1v10NG*kbr1xwC@~e!=>Nj4MvOxUcH8CAvC6=%#HT~fCfO?BDQOBNA%JVdmwkaz zF|5K#Ij_8@#*O z)g~Spkf;Wj$buK3e5P-tP$HzK0|wfhHyXQcf`#?3rlmgIKHmKCfCmP{5fvAMjyzU{ zW>7dAfh(QRI^$-M$V-@Tf?AdHXqsUSi!6jxzO;;n;uoN_qHqP4mb1PQ(eD$%dnfUvCB_ z^MO}V$BH^@*H_dLAB&32X|LcUT14@Qm}7!K9F!EX$n&|E-_?LgCPy?l<08Qnj#iMc zoLb3wd+@WDHC8E>I-gXMs12FqhnM@e*Ggfth{JYJc!OjBPqf~GH|=fTPV}LjO;iR# zE0gl{?Nqk`{O>Sh&L8=G+Ugjg!%7of7?hlVPn++ z?2tz0ue3w-kUPV@!dLLsZLA+ieXXZJUs z9@2QUmBTN7za7P7?;q~3|8e{A2JYtH|HJJfxncr2x!H8>;hp@Hf`y#e4T0 z_WD$Va&n)+KkV{orY>180uCX^fdt!i6T=>;E)ejt;x?pd6s|N#b_T)T&sW%>M?VSL zOp8ddhlHu;K${V=v{82O;RGYGgEik%P}Z6RyV!?y;Ux@p2GiwHYA4igG11w=YK|gR z!bwT;G5{M)qp7>!ODVYN0n--P^Z*9L786a|pb9W)X8uB8W&)G8e?c4W6qvNNeYDDG zG_64P%Pee-VA6wRLud}a{y$yHFf3*8c_}eP-$|r)hxa-x69vO78=}V>M@Ps8{zRQ? z!(xtQOUoI*e=aY8f0K9ttfnlcw_wRU4KIMxD$@%39DNf6(G4$1OvFF;ez!8jgT}nc z?uD6YV~|*q(oPCcNvWa3;cFV1CsH~b#vGcc-Y;Lil1f@e%9YHMW}XZw7oJj=Ojq0z zjzLI)Pqv5u9;o!^Qfck$b@dZZ8f+Y9LZ#6RSln_=&{onkkgbLdnx;^&JA*{KCaCEz zzZYH>dlAxJNTQ?#BP-=Jt=Y)A31++I;53eoy4?I?JZ>v)v8nI}rYRKo(N3GT5UBS- zWYA*b*GACdaAwBet0LpC7{?xqeG^r9Yw{U8cQohQNoSVy1Kecs$`6-5zjhF7`Wi+agHJ z+rzW-JhTj+9%ZTkLPzI}ace&QK5l3m`DP${ zmX5FWOo=`}Q`)6xib$4}&b^(6`(IjSsULe`1Xn1G$IVg+%%1*5hn3uoz$fkE<_tB g$MExKrGHZL)N}er{F(mwn(3ea0!i$rfjpN00M*0!rvLx| literal 0 HcmV?d00001 diff --git a/biodata-models/src/main/java/org/opencb/biodata/models/common/DataVersion.java b/biodata-models/src/main/java/org/opencb/biodata/models/common/DataVersion.java new file mode 100644 index 00000000..d5397a71 --- /dev/null +++ b/biodata-models/src/main/java/org/opencb/biodata/models/common/DataVersion.java @@ -0,0 +1,135 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.biodata.models.common; + +import org.opencb.commons.datastore.core.ObjectMap; + +import java.util.ArrayList; +import java.util.List; + +public class DataVersion { + + private String data; + private String name; + private String version; + private String date; + private String species; + private String assembly; + private List files; + private List urls; + private ObjectMap attributes; + + public DataVersion() { + files = new ArrayList<>(); + urls = new ArrayList<>(); + attributes = new ObjectMap(); + } + + public DataVersion(String data, String name, String version, String date, String species, String assembly, List files, + List urls, ObjectMap attributes) { + this.data = data; + this.name = name; + this.version = version; + this.date = date; + this.species = species; + this.assembly = assembly; + this.files = files; + this.urls = urls; + this.attributes = attributes; + } + + public String getData() { + return data; + } + + public DataVersion setData(String data) { + this.data = data; + return this; + } + + public String getName() { + return name; + } + + public DataVersion setName(String name) { + this.name = name; + return this; + } + + public String getVersion() { + return version; + } + + public DataVersion setVersion(String version) { + this.version = version; + return this; + } + + public String getDate() { + return date; + } + + public DataVersion setDate(String date) { + this.date = date; + return this; + } + + public String getSpecies() { + return species; + } + + public DataVersion setSpecies(String species) { + this.species = species; + return this; + } + + public String getAssembly() { + return assembly; + } + + public DataVersion setAssembly(String assembly) { + this.assembly = assembly; + return this; + } + + public List getFiles() { + return files; + } + + public DataVersion setFiles(List files) { + this.files = files; + return this; + } + + public List getUrls() { + return urls; + } + + public DataVersion setUrls(List urls) { + this.urls = urls; + return this; + } + + public ObjectMap getAttributes() { + return attributes; + } + + public DataVersion setAttributes(ObjectMap attributes) { + this.attributes = attributes; + return this; + } +} diff --git a/biodata-models/src/main/java/org/opencb/biodata/models/sequence/SequenceLocation.java b/biodata-models/src/main/java/org/opencb/biodata/models/sequence/SequenceLocation.java new file mode 100644 index 00000000..a2d8298f --- /dev/null +++ b/biodata-models/src/main/java/org/opencb/biodata/models/sequence/SequenceLocation.java @@ -0,0 +1,87 @@ +package org.opencb.biodata.models.sequence; + +public class SequenceLocation { + private String chromosome; + private int start; + private int end; + private String reference; + private String alternate; + private String strand; + + public SequenceLocation() { + } + + SequenceLocation(String chromosome, int start, int end, String reference, String alternate) { + this(chromosome, start, end, reference, alternate, "+"); + } + + SequenceLocation(String chromosome, int start, int end, String reference, String alternate, String strand) { + this.chromosome = chromosome; + this.start = start; + this.end = end; + this.reference = reference; + this.alternate = alternate; + this.strand = strand; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("SequenceLocation{"); + sb.append("chromosome='").append(chromosome).append('\''); + sb.append(", start=").append(start); + sb.append(", end=").append(end); + sb.append(", reference='").append(reference).append('\''); + sb.append(", alternate='").append(alternate).append('\''); + sb.append(", strand='").append(strand).append('\''); + sb.append('}'); + return sb.toString(); + } + + public String getChromosome() { + return chromosome; + } + + public int getStart() { + return start; + } + + public int getEnd() { + return end; + } + + public String getReference() { + return reference; + } + + public String getAlternate() { + return alternate; + } + + public String getStrand() { + return strand; + } + + public void setChromosome(String chromosome) { + this.chromosome = chromosome; + } + + public void setStart(int start) { + this.start = start; + } + + public void setEnd(int end) { + this.end = end; + } + + public void setReference(String reference) { + this.reference = reference; + } + + public void setAlternate(String alternate) { + this.alternate = alternate; + } + + public void setStrand(String strand) { + this.strand = strand; + } +} From af57a140373c60c93f4f5407f94779edbd253e12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 6 Jun 2024 11:36:04 +0200 Subject: [PATCH 02/10] formats: add checks to the COSMIC parser: valid format file and assembly match, #TASK-5913, #TASK-5318 On branch TASK-5318 Changes to be committed: modified: biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParser.java modified: biodata-formats/src/test/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserTest.java --- .../formats/variant/cosmic/CosmicParser.java | 23 ++++++++++++++++++- .../variant/cosmic/CosmicParserTest.java | 3 ++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParser.java b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParser.java index ee13c4d3..301ec9ec 100755 --- a/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParser.java +++ b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParser.java @@ -20,6 +20,7 @@ package org.opencb.biodata.formats.variant.cosmic; import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.biodata.formats.variant.VariantAnnotationUtils; import org.opencb.biodata.models.sequence.SequenceLocation; import org.opencb.biodata.models.variant.avro.*; @@ -90,7 +91,10 @@ private CosmicParser() { * @throws IOException */ public static void parse(Path cosmicFile, String version, String name, String assembly, CosmicParserCallback callback) - throws IOException { + throws IOException, FileFormatException { + + int numCosmicFields = 39; + int assemblyFieldIndex = 24; int totalNumberRecords = 0; int ignoredCosmicLines = 0; @@ -111,11 +115,28 @@ public static void parse(Path cosmicFile, String version, String name, String as String headerLine = cosmicReader.readLine(); // First line is the header -> ignore it logger.info("Skipping header line: {}", headerLine); + String[] headerFields = headerLine.split("\t", -1); + if (headerFields.length != numCosmicFields) { + throw new FileFormatException("Invalid COSMIC format file. Expected " + numCosmicFields + " fields, got " + + headerFields.length + " at " + headerLine); + } String line; while ((line = cosmicReader.readLine()) != null) { String[] fields = line.split("\t", -1); + // Check fields number + if (headerFields.length != numCosmicFields) { + throw new FileFormatException("Invalid COSMIC format file. Expected " + numCosmicFields + " fields, got " + + headerFields.length + " at " + headerLine); + } + // Check assembly + String cosmicAssembly = headerFields[assemblyFieldIndex] + fields[assemblyFieldIndex]; + if (!cosmicAssembly.equalsIgnoreCase(assembly)) { + throw new IllegalArgumentException("Mismatch assembly: COSMIC file assembly is " + cosmicAssembly + " but input" + + " assembly is " + assembly); + } + t0 = System.currentTimeMillis(); EvidenceEntry evidenceEntry = buildCosmic(name, version, assembly, fields); t1 += System.currentTimeMillis() - t0; diff --git a/biodata-formats/src/test/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserTest.java b/biodata-formats/src/test/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserTest.java index 8e6bb96a..9aba1123 100644 --- a/biodata-formats/src/test/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserTest.java +++ b/biodata-formats/src/test/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserTest.java @@ -2,6 +2,7 @@ import org.junit.Assert; import org.junit.Test; +import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.biodata.models.sequence.SequenceLocation; import org.opencb.biodata.models.variant.avro.EvidenceEntry; @@ -40,7 +41,7 @@ public int getCounter() { } @Test - public void testCosmicParser() throws IOException { + public void testCosmicParser() throws IOException, FileFormatException { Path cosmicFile = Paths.get(getClass().getResource("/cosmic.small.tsv.gz").getPath()); String version = "v95"; String name = "cosmic"; From 2c326675f12650475a19c1ba0c55b781e5717f9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Tue, 25 Jun 2024 16:48:46 +0100 Subject: [PATCH 03/10] formatS: Remove unused fields from VariantAnnotationUtils. #TASK-5913, #TASK-5318 --- .../variant/VariantAnnotationUtils.java | 639 +----------------- .../formats/variant/cosmic/CosmicParser.java | 31 +- 2 files changed, 35 insertions(+), 635 deletions(-) diff --git a/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/VariantAnnotationUtils.java b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/VariantAnnotationUtils.java index 58cab2ea..157fa507 100644 --- a/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/VariantAnnotationUtils.java +++ b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/VariantAnnotationUtils.java @@ -16,191 +16,21 @@ package org.opencb.biodata.formats.variant; -import org.apache.commons.lang3.StringUtils; -import org.opencb.biodata.models.variant.annotation.ConsequenceTypeMappings; -import org.opencb.biodata.models.variant.annotation.exceptions.SOTermNotAvailableException; -import org.opencb.biodata.models.variant.avro.*; +import org.opencb.biodata.models.variant.avro.AlleleOrigin; -import java.util.*; +import java.util.HashMap; +import java.util.Map; /** * Created by fjlopez on 22/06/15. */ public class VariantAnnotationUtils { - public static final char SEPARATOR_CHAR = ':'; - - public static final String THREEPRIME_OVERLAPPING_NCRNA = "3prime_overlapping_ncrna"; - public static final String IG_C_GENE = "IG_C_gene"; - public static final String IG_C_PSEUDOGENE = "IG_C_pseudogene"; - public static final String IG_D_GENE = "IG_D_gene"; - public static final String IG_J_GENE = "IG_J_gene"; - public static final String IG_J_PSEUDOGENE = "IG_J_pseudogene"; - public static final String IG_V_GENE = "IG_V_gene"; - public static final String IG_V_PSEUDOGENE = "IG_V_pseudogene"; - public static final String MT_RRNA = "Mt_rRNA"; - public static final String MT_TRNA = "Mt_tRNA"; - public static final String TR_C_GENE = "TR_C_gene"; - public static final String TR_D_GENE = "TR_D_gene"; - public static final String TR_J_GENE = "TR_J_gene"; - public static final String TR_J_PSEUDOGENE = "TR_J_pseudogene"; - public static final String TR_V_GENE = "TR_V_gene"; - public static final String TR_V_PSEUDOGENE = "TR_V_pseudogene"; - public static final String ANTISENSE = "antisense"; - public static final String LINCRNA = "lincRNA"; - public static final String MIRNA = "miRNA"; - public static final String MISC_RNA = "misc_RNA"; - public static final String POLYMORPHIC_PSEUDOGENE = "polymorphic_pseudogene"; - public static final String PROCESSED_PSEUDOGENE = "processed_pseudogene"; - public static final String PROCESSED_TRANSCRIPT = "processed_transcript"; - public static final String PROTEIN_CODING = "protein_coding"; - public static final String PSEUDOGENE = "pseudogene"; - public static final String RRNA = "rRNA"; - public static final String SENSE_INTRONIC = "sense_intronic"; - public static final String SENSE_OVERLAPPING = "sense_overlapping"; - public static final String SNRNA = "snRNA"; - public static final String SNORNA = "snoRNA"; - public static final String NONSENSE_MEDIATED_DECAY = "nonsense_mediated_decay"; - public static final String NMD_TRANSCRIPT_VARIANT = "NMD_transcript_variant"; - public static final String UNPROCESSED_PSEUDOGENE = "unprocessed_pseudogene"; - public static final String TRANSCRIBED_UNPROCESSED_PSEUDGENE = "transcribed_unprocessed_pseudogene"; - public static final String RETAINED_INTRON = "retained_intron"; - public static final String NON_STOP_DECAY = "non_stop_decay"; - public static final String UNITARY_PSEUDOGENE = "unitary_pseudogene"; - public static final String TRANSLATED_PROCESSED_PSEUDOGENE = "translated_processed_pseudogene"; - public static final String TRANSCRIBED_PROCESSED_PSEUDOGENE = "transcribed_processed_pseudogene"; - public static final String TRNA_PSEUDOGENE = "tRNA_pseudogene"; - public static final String SNORNA_PSEUDOGENE = "snoRNA_pseudogene"; - public static final String SNRNA_PSEUDOGENE = "snRNA_pseudogene"; - public static final String SCRNA_PSEUDOGENE = "scRNA_pseudogene"; - public static final String RRNA_PSEUDOGENE = "rRNA_pseudogene"; - public static final String MISC_RNA_PSEUDOGENE = "misc_RNA_pseudogene"; - public static final String MIRNA_PSEUDOGENE = "miRNA_pseudogene"; - public static final String NON_CODING = "non_coding"; - public static final String AMBIGUOUS_ORF = "ambiguous_orf"; - public static final String KNOWN_NCRNA = "known_ncrna"; - public static final String RETROTRANSPOSED = "retrotransposed"; - public static final String TRANSCRIBED_UNITARY_PSEUDOGENE = "transcribed_unitary_pseudogene"; - public static final String TRANSLATED_UNPROCESSED_PSEUDOGENE = "translated_unprocessed_pseudogene"; - public static final String LRG_GENE = "LRG_gene"; - - public static final String INTERGENIC_VARIANT = "intergenic_variant"; - public static final String REGULATORY_REGION_VARIANT = "regulatory_region_variant"; - public static final String TF_BINDING_SITE_VARIANT = "TF_binding_site_variant"; - public static final String UPSTREAM_GENE_VARIANT = "upstream_gene_variant"; - public static final String TWOKB_UPSTREAM_VARIANT = "2KB_upstream_variant"; - public static final String DOWNSTREAM_GENE_VARIANT = "downstream_gene_variant"; - public static final String TWOKB_DOWNSTREAM_VARIANT = "2KB_downstream_variant"; - public static final String SPLICE_DONOR_VARIANT = "splice_donor_variant"; - public static final String SPLICE_ACCEPTOR_VARIANT = "splice_acceptor_variant"; - public static final String INTRON_VARIANT = "intron_variant"; - public static final String SPLICE_REGION_VARIANT = "splice_region_variant"; - public static final String FIVE_PRIME_UTR_VARIANT = "5_prime_UTR_variant"; - public static final String THREE_PRIME_UTR_VARIANT = "3_prime_UTR_variant"; - public static final String INCOMPLETE_TERMINAL_CODON_VARIANT = "incomplete_terminal_codon_variant"; - public static final String STOP_RETAINED_VARIANT = "stop_retained_variant"; - public static final String START_RETAINED_VARIANT = "start_retained_variant"; - public static final String SYNONYMOUS_VARIANT = "synonymous_variant"; - public static final String INITIATOR_CODON_VARIANT = "initiator_codon_variant"; - public static final String START_LOST = "start_lost"; - public static final String STOP_GAINED = "stop_gained"; - public static final String STOP_LOST = "stop_lost"; - public static final String MISSENSE_VARIANT = "missense_variant"; - public static final String MATURE_MIRNA_VARIANT = "mature_miRNA_variant"; - public static final String NON_CODING_TRANSCRIPT_EXON_VARIANT = "non_coding_transcript_exon_variant"; - public static final String NON_CODING_TRANSCRIPT_VARIANT = "non_coding_transcript_variant"; - public static final String INFRAME_INSERTION = "inframe_insertion"; - public static final String INFRAME_VARIANT = "inframe_variant"; - public static final String FRAMESHIFT_VARIANT = "frameshift_variant"; - public static final String CODING_SEQUENCE_VARIANT = "coding_sequence_variant"; - public static final String TRANSCRIPT_ABLATION = "transcript_ablation"; - public static final String TRANSCRIPT_AMPLIFICATION = "transcript_amplification"; - public static final String COPY_NUMBER_CHANGE = "copy_number_change"; - public static final String TERMINATOR_CODON_VARIANT = "terminator_codon_variant"; - public static final String FEATURE_TRUNCATION = "feature_truncation"; - public static final String FEATURE_VARIANT = "feature_variant"; - public static final String STRUCTURAL_VARIANT = "structural_variant"; - public static final String INFRAME_DELETION = "inframe_deletion"; - - public static final String CDS_START_NF = "cds_start_NF"; - public static final String CDS_END_NF = "cds_end_NF"; - - public static final String FUNCTION_UNCERTAIN_VARIANT = "function_uncertain_variant"; - - public static final Map> IS_SYNONYMOUS_CODON = new HashMap<>(); - public static final Map> MT_IS_SYNONYMOUS_CODON = new HashMap<>(); - public static final Map SO_NAMES_CORRECTIONS = new HashMap<>(); - public static final Map> A_TO_CODON = new HashMap<>(); - public static final Map> MT_A_TO_CODON = new HashMap<>(); - public static final Map CODON_TO_A = new HashMap<>(); - public static final Map MT_CODON_TO_A = new HashMap<>(); - public static final Map COMPLEMENTARY_NT = new HashMap<>(); - public static final Map SIFT_DESCRIPTIONS = new HashMap<>(); - public static final Map POLYPHEN_DESCRIPTIONS = new HashMap<>(); - public static final Map SO_SEVERITY = new HashMap<>(); - public static final Map ORIGIN_STRING_TO_ALLELE_ORIGIN = new HashMap<>(); - public static final Set CODING_SO_NAMES = new HashSet<>(); - public static final Map CLINVAR_CLINSIG_TO_ACMG = new HashMap<>(); - public static final Map CLINVAR_CLINSIG_TO_TRAIT_ASSOCIATION = new HashMap<>(); - public static final HashMap CLINVAR_REVIEW_TO_CONSISTENCY_STATUS = new HashMap<>(); - // Currently left empty since the only item within DrugResponseClassification that seemed to match any clinvar - // tag ("responsive") was removed at some point from the model - public static final Map CLINVAR_CLINSIG_TO_DRUG_RESPONSE = new HashMap<>(); - - public static final String MT = "MT"; - public static final String UNKNOWN_AMINOACID = "X"; - - public static final HashMap MODEOFINHERITANCE_MAP = new HashMap<>(); - public static final HashMap COSMIC_SOMATICSTATUS_TO_ALLELE_ORIGIN = new HashMap<>(); - public static final HashMap TO_ABBREVIATED_AA = new HashMap<>(40); // 22 AA - public static final HashMap TO_LONG_AA = new HashMap<>(40); // 22 AA - private static final String ATG = "ATG"; - private static final String ATA = "ATA"; - private static final String TAA = "TAA"; - private static final String TAG = "TAG"; - private static final String AGA = "AGA"; - private static final String AGG = "AGG"; - private static final String TGA = "TGA"; + private static final Map ORIGIN_STRING_TO_ALLELE_ORIGIN = new HashMap<>(); + private static final Map COMPLEMENTARY_NT = new HashMap<>(); static { - MODEOFINHERITANCE_MAP.put("autosomal dominant inheritance", ModeOfInheritance.monoallelic); - MODEOFINHERITANCE_MAP.put("autosomal dominant inheritance with maternal imprinting", - ModeOfInheritance.monoallelic_maternally_imprinted); - MODEOFINHERITANCE_MAP.put("autosomal dominant inheritance with paternal imprinting", - ModeOfInheritance.monoallelic_paternally_imprinted); - MODEOFINHERITANCE_MAP.put("autosomal recessive inheritance", - ModeOfInheritance.biallelic); - MODEOFINHERITANCE_MAP.put("mitochondrial inheritance", - ModeOfInheritance.mitochondrial); - MODEOFINHERITANCE_MAP.put("sex-limited autosomal dominant", - ModeOfInheritance.monoallelic); - MODEOFINHERITANCE_MAP.put("x-linked dominant inheritance", - ModeOfInheritance.xlinked_monoallelic); - MODEOFINHERITANCE_MAP.put("x-linked recessive inheritance", - ModeOfInheritance.xlinked_biallelic); - - CLINVAR_REVIEW_TO_CONSISTENCY_STATUS.put("criteria_provided_conflicting_interpretations", ConsistencyStatus.conflict); - CLINVAR_REVIEW_TO_CONSISTENCY_STATUS.put("criteria_provided_multiple_submitters_no_conflicts", ConsistencyStatus.congruent); - CLINVAR_REVIEW_TO_CONSISTENCY_STATUS.put("criteria_provided_single_submitter", ConsistencyStatus.congruent); - CLINVAR_REVIEW_TO_CONSISTENCY_STATUS.put("reviewed_by_expert_panel", ConsistencyStatus.congruent); - CLINVAR_REVIEW_TO_CONSISTENCY_STATUS.put("conflicting interpretations", ConsistencyStatus.conflict); - CLINVAR_REVIEW_TO_CONSISTENCY_STATUS.put("no conflicts", ConsistencyStatus.congruent); - CLINVAR_REVIEW_TO_CONSISTENCY_STATUS.put("single submitter", ConsistencyStatus.congruent); - CLINVAR_REVIEW_TO_CONSISTENCY_STATUS.put("reviewed by expert panel", ConsistencyStatus.congruent); - - - CLINVAR_CLINSIG_TO_ACMG.put("benign", ClinicalSignificance.benign); - CLINVAR_CLINSIG_TO_ACMG.put("likely benign", ClinicalSignificance.likely_benign); - CLINVAR_CLINSIG_TO_ACMG.put("conflicting interpretations of pathogenicity", ClinicalSignificance.uncertain_significance); - CLINVAR_CLINSIG_TO_ACMG.put("likely pathogenic", ClinicalSignificance.likely_pathogenic); - CLINVAR_CLINSIG_TO_ACMG.put("pathogenic", ClinicalSignificance.pathogenic); - CLINVAR_CLINSIG_TO_ACMG.put("uncertain significance", ClinicalSignificance.uncertain_significance); - CLINVAR_CLINSIG_TO_ACMG.put("conflicting data from submitters", ClinicalSignificance.uncertain_significance); - - CLINVAR_CLINSIG_TO_TRAIT_ASSOCIATION.put("risk factor", TraitAssociation.established_risk_allele); - CLINVAR_CLINSIG_TO_TRAIT_ASSOCIATION.put("protective", TraitAssociation.protective); - /////////////////////////////////////////////////////////////////////// ///// ClinVar and Cosmic allele origins to SO terms /////////////// /////////////////////////////////////////////////////////////////////// @@ -210,258 +40,6 @@ public class VariantAnnotationUtils { ORIGIN_STRING_TO_ALLELE_ORIGIN.put("paternal", AlleleOrigin.paternal_variant); ORIGIN_STRING_TO_ALLELE_ORIGIN.put("somatic", AlleleOrigin.somatic_variant); - /////////////////////////////////////////////////////////////////////// - ///// GENETIC CODE //////////////////////////////////////////////// - /////////////////////////////////////////////////////////////////////// - A_TO_CODON.put("ALA", new ArrayList()); - A_TO_CODON.get("ALA").add("GCT"); - A_TO_CODON.get("ALA").add("GCC"); - A_TO_CODON.get("ALA").add("GCA"); - A_TO_CODON.get("ALA").add("GCG"); - A_TO_CODON.put("ARG", new ArrayList()); - A_TO_CODON.get("ARG").add("CGT"); - A_TO_CODON.get("ARG").add("CGC"); - A_TO_CODON.get("ARG").add("CGA"); - A_TO_CODON.get("ARG").add("CGG"); - A_TO_CODON.get("ARG").add("AGA"); - A_TO_CODON.get("ARG").add("AGG"); - A_TO_CODON.put("ASN", new ArrayList()); - A_TO_CODON.get("ASN").add("AAT"); - A_TO_CODON.get("ASN").add("AAC"); - A_TO_CODON.put("ASP", new ArrayList()); - A_TO_CODON.get("ASP").add("GAT"); - A_TO_CODON.get("ASP").add("GAC"); - A_TO_CODON.put("CYS", new ArrayList()); - A_TO_CODON.get("CYS").add("TGT"); - A_TO_CODON.get("CYS").add("TGC"); - A_TO_CODON.put("GLN", new ArrayList()); - A_TO_CODON.get("GLN").add("CAA"); - A_TO_CODON.get("GLN").add("CAG"); - A_TO_CODON.put("GLU", new ArrayList()); - A_TO_CODON.get("GLU").add("GAA"); - A_TO_CODON.get("GLU").add("GAG"); - A_TO_CODON.put("GLY", new ArrayList()); - A_TO_CODON.get("GLY").add("GGT"); - A_TO_CODON.get("GLY").add("GGC"); - A_TO_CODON.get("GLY").add("GGA"); - A_TO_CODON.get("GLY").add("GGG"); - A_TO_CODON.put("HIS", new ArrayList()); - A_TO_CODON.get("HIS").add("CAT"); - A_TO_CODON.get("HIS").add("CAC"); - A_TO_CODON.put("ILE", new ArrayList()); - A_TO_CODON.get("ILE").add("ATT"); - A_TO_CODON.get("ILE").add("ATC"); - A_TO_CODON.get("ILE").add("ATA"); - A_TO_CODON.put("LEU", new ArrayList()); - A_TO_CODON.get("LEU").add("TTA"); - A_TO_CODON.get("LEU").add("TTG"); - A_TO_CODON.get("LEU").add("CTT"); - A_TO_CODON.get("LEU").add("CTC"); - A_TO_CODON.get("LEU").add("CTA"); - A_TO_CODON.get("LEU").add("CTG"); - A_TO_CODON.put("LYS", new ArrayList()); - A_TO_CODON.get("LYS").add("AAA"); - A_TO_CODON.get("LYS").add("AAG"); - A_TO_CODON.put("MET", new ArrayList()); - A_TO_CODON.get("MET").add("ATG"); - A_TO_CODON.put("PHE", new ArrayList()); - A_TO_CODON.get("PHE").add("TTT"); - A_TO_CODON.get("PHE").add("TTC"); - A_TO_CODON.put("PRO", new ArrayList()); - A_TO_CODON.get("PRO").add("CCT"); - A_TO_CODON.get("PRO").add("CCC"); - A_TO_CODON.get("PRO").add("CCA"); - A_TO_CODON.get("PRO").add("CCG"); - A_TO_CODON.put("SER", new ArrayList()); - A_TO_CODON.get("SER").add("TCT"); - A_TO_CODON.get("SER").add("TCC"); - A_TO_CODON.get("SER").add("TCA"); - A_TO_CODON.get("SER").add("TCG"); - A_TO_CODON.get("SER").add("AGT"); - A_TO_CODON.get("SER").add("AGC"); - A_TO_CODON.put("THR", new ArrayList()); - A_TO_CODON.get("THR").add("ACT"); - A_TO_CODON.get("THR").add("ACC"); - A_TO_CODON.get("THR").add("ACA"); - A_TO_CODON.get("THR").add("ACG"); - A_TO_CODON.put("TRP", new ArrayList()); - A_TO_CODON.get("TRP").add("TGG"); - A_TO_CODON.put("TYR", new ArrayList()); - A_TO_CODON.get("TYR").add("TAT"); - A_TO_CODON.get("TYR").add("TAC"); - A_TO_CODON.put("VAL", new ArrayList()); - A_TO_CODON.get("VAL").add("GTT"); - A_TO_CODON.get("VAL").add("GTC"); - A_TO_CODON.get("VAL").add("GTA"); - A_TO_CODON.get("VAL").add("GTG"); - A_TO_CODON.put("STOP", new ArrayList()); - A_TO_CODON.get("STOP").add("TAA"); - A_TO_CODON.get("STOP").add("TGA"); - A_TO_CODON.get("STOP").add("TAG"); - - for (String aa : A_TO_CODON.keySet()) { - for (String codon : A_TO_CODON.get(aa)) { - IS_SYNONYMOUS_CODON.put(codon, new HashMap()); - CODON_TO_A.put(codon, aa); - } - } - for (String codon1 : IS_SYNONYMOUS_CODON.keySet()) { - Map codonEntry = IS_SYNONYMOUS_CODON.get(codon1); - for (String codon2 : IS_SYNONYMOUS_CODON.keySet()) { - codonEntry.put(codon2, false); - } - } - for (String aa : A_TO_CODON.keySet()) { - for (String codon1 : A_TO_CODON.get(aa)) { - for (String codon2 : A_TO_CODON.get(aa)) { - IS_SYNONYMOUS_CODON.get(codon1).put(codon2, true); - } - } - } - - /////////////////////////////////////////////////////////////////////// - ///// MITOCHONDRIAL GENETIC CODE ////////////////////////////////// - /////////////////////////////////////////////////////////////////////// - MT_A_TO_CODON.put("ALA", new ArrayList()); - MT_A_TO_CODON.get("ALA").add("GCT"); - MT_A_TO_CODON.get("ALA").add("GCC"); - MT_A_TO_CODON.get("ALA").add("GCA"); - MT_A_TO_CODON.get("ALA").add("GCG"); - MT_A_TO_CODON.put("ARG", new ArrayList()); - MT_A_TO_CODON.get("ARG").add("CGT"); - MT_A_TO_CODON.get("ARG").add("CGC"); - MT_A_TO_CODON.get("ARG").add("CGA"); - MT_A_TO_CODON.get("ARG").add("CGG"); - MT_A_TO_CODON.put("ASN", new ArrayList()); - MT_A_TO_CODON.get("ASN").add("AAT"); - MT_A_TO_CODON.get("ASN").add("AAC"); - MT_A_TO_CODON.put("ASP", new ArrayList()); - MT_A_TO_CODON.get("ASP").add("GAT"); - MT_A_TO_CODON.get("ASP").add("GAC"); - MT_A_TO_CODON.put("CYS", new ArrayList()); - MT_A_TO_CODON.get("CYS").add("TGT"); - MT_A_TO_CODON.get("CYS").add("TGC"); - MT_A_TO_CODON.put("GLN", new ArrayList()); - MT_A_TO_CODON.get("GLN").add("CAA"); - MT_A_TO_CODON.get("GLN").add("CAG"); - MT_A_TO_CODON.put("GLU", new ArrayList()); - MT_A_TO_CODON.get("GLU").add("GAA"); - MT_A_TO_CODON.get("GLU").add("GAG"); - MT_A_TO_CODON.put("GLY", new ArrayList()); - MT_A_TO_CODON.get("GLY").add("GGT"); - MT_A_TO_CODON.get("GLY").add("GGC"); - MT_A_TO_CODON.get("GLY").add("GGA"); - MT_A_TO_CODON.get("GLY").add("GGG"); - MT_A_TO_CODON.put("HIS", new ArrayList()); - MT_A_TO_CODON.get("HIS").add("CAT"); - MT_A_TO_CODON.get("HIS").add("CAC"); - MT_A_TO_CODON.put("ILE", new ArrayList()); - MT_A_TO_CODON.get("ILE").add("ATT"); - MT_A_TO_CODON.get("ILE").add("ATC"); - MT_A_TO_CODON.put("LEU", new ArrayList()); - MT_A_TO_CODON.get("LEU").add("TTA"); - MT_A_TO_CODON.get("LEU").add("TTG"); - MT_A_TO_CODON.get("LEU").add("CTT"); - MT_A_TO_CODON.get("LEU").add("CTC"); - MT_A_TO_CODON.get("LEU").add("CTA"); - MT_A_TO_CODON.get("LEU").add("CTG"); - MT_A_TO_CODON.put("LYS", new ArrayList()); - MT_A_TO_CODON.get("LYS").add("AAA"); - MT_A_TO_CODON.get("LYS").add("AAG"); - MT_A_TO_CODON.put("MET", new ArrayList()); - MT_A_TO_CODON.get("MET").add("ATG"); - MT_A_TO_CODON.get("MET").add("ATA"); - MT_A_TO_CODON.put("PHE", new ArrayList()); - MT_A_TO_CODON.get("PHE").add("TTT"); - MT_A_TO_CODON.get("PHE").add("TTC"); - MT_A_TO_CODON.put("PRO", new ArrayList()); - MT_A_TO_CODON.get("PRO").add("CCT"); - MT_A_TO_CODON.get("PRO").add("CCC"); - MT_A_TO_CODON.get("PRO").add("CCA"); - MT_A_TO_CODON.get("PRO").add("CCG"); -// A_TO_CODON.put("SEC", new ArrayList<>()); -// A_TO_CODON.get("SEC").add("TGA"); - MT_A_TO_CODON.put("SER", new ArrayList()); - MT_A_TO_CODON.get("SER").add("TCT"); - MT_A_TO_CODON.get("SER").add("TCC"); - MT_A_TO_CODON.get("SER").add("TCA"); - MT_A_TO_CODON.get("SER").add("TCG"); - MT_A_TO_CODON.get("SER").add("AGT"); - MT_A_TO_CODON.get("SER").add("AGC"); - MT_A_TO_CODON.put("THR", new ArrayList()); - MT_A_TO_CODON.get("THR").add("ACT"); - MT_A_TO_CODON.get("THR").add("ACC"); - MT_A_TO_CODON.get("THR").add("ACA"); - MT_A_TO_CODON.get("THR").add("ACG"); - MT_A_TO_CODON.put("TRP", new ArrayList()); - MT_A_TO_CODON.get("TRP").add("TGG"); - MT_A_TO_CODON.get("TRP").add("TGA"); - MT_A_TO_CODON.put("TYR", new ArrayList()); - MT_A_TO_CODON.get("TYR").add("TAT"); - MT_A_TO_CODON.get("TYR").add("TAC"); - MT_A_TO_CODON.put("VAL", new ArrayList()); - MT_A_TO_CODON.get("VAL").add("GTT"); - MT_A_TO_CODON.get("VAL").add("GTC"); - MT_A_TO_CODON.get("VAL").add("GTA"); - MT_A_TO_CODON.get("VAL").add("GTG"); - MT_A_TO_CODON.put("STOP", new ArrayList()); - MT_A_TO_CODON.get("STOP").add("TAA"); - MT_A_TO_CODON.get("STOP").add("TAG"); - MT_A_TO_CODON.get("STOP").add("AGA"); - MT_A_TO_CODON.get("STOP").add("AGG"); - - for (String aa : MT_A_TO_CODON.keySet()) { - for (String codon : MT_A_TO_CODON.get(aa)) { - MT_IS_SYNONYMOUS_CODON.put(codon, new HashMap()); - MT_CODON_TO_A.put(codon, aa); - } - } - for (String codon1 : MT_IS_SYNONYMOUS_CODON.keySet()) { - Map codonEntry = MT_IS_SYNONYMOUS_CODON.get(codon1); - for (String codon2 : MT_IS_SYNONYMOUS_CODON.keySet()) { - codonEntry.put(codon2, false); - } - } - for (String aa : MT_A_TO_CODON.keySet()) { - for (String codon1 : MT_A_TO_CODON.get(aa)) { - for (String codon2 : MT_A_TO_CODON.get(aa)) { - MT_IS_SYNONYMOUS_CODON.get(codon1).put(codon2, true); - } - } - } - - /* - Aminoacid abbreviation map - */ - TO_ABBREVIATED_AA.put("ALA", "A"); - TO_ABBREVIATED_AA.put("ARG", "R"); - TO_ABBREVIATED_AA.put("ASN", "N"); - TO_ABBREVIATED_AA.put("ASP", "D"); - TO_ABBREVIATED_AA.put("ASX", "B"); - TO_ABBREVIATED_AA.put("CYS", "C"); - TO_ABBREVIATED_AA.put("GLU", "E"); - TO_ABBREVIATED_AA.put("GLN", "Q"); - TO_ABBREVIATED_AA.put("GLX", "Z"); - TO_ABBREVIATED_AA.put("GLY", "G"); - TO_ABBREVIATED_AA.put("HIS", "H"); - TO_ABBREVIATED_AA.put("ILE", "I"); - TO_ABBREVIATED_AA.put("LEU", "L"); - TO_ABBREVIATED_AA.put("LYS", "K"); - TO_ABBREVIATED_AA.put("MET", "M"); - TO_ABBREVIATED_AA.put("PHE", "F"); - TO_ABBREVIATED_AA.put("PRO", "P"); - TO_ABBREVIATED_AA.put("SEC", "U"); - TO_ABBREVIATED_AA.put("SER", "S"); - TO_ABBREVIATED_AA.put("THR", "T"); - TO_ABBREVIATED_AA.put("TRP", "W"); - TO_ABBREVIATED_AA.put("TYR", "Y"); - TO_ABBREVIATED_AA.put("VAL", "V"); - TO_ABBREVIATED_AA.put("STOP", "O"); - - for (String aa : TO_ABBREVIATED_AA.keySet()) { - TO_LONG_AA.put(TO_ABBREVIATED_AA.get(aa), buildUpperLowerCaseString(aa)); - } - COMPLEMENTARY_NT.put('A', 'T'); COMPLEMENTARY_NT.put('a', 't'); COMPLEMENTARY_NT.put('C', 'G'); @@ -472,187 +50,13 @@ public class VariantAnnotationUtils { COMPLEMENTARY_NT.put('t', 'a'); COMPLEMENTARY_NT.put('N', 'N'); COMPLEMENTARY_NT.put('n', 'n'); - - POLYPHEN_DESCRIPTIONS.put(0, "probably damaging"); - POLYPHEN_DESCRIPTIONS.put(1, "possibly damaging"); - POLYPHEN_DESCRIPTIONS.put(2, "benign"); - POLYPHEN_DESCRIPTIONS.put(3, "unknown"); - - SIFT_DESCRIPTIONS.put(0, "tolerated"); - SIFT_DESCRIPTIONS.put(1, "deleterious"); - - SO_SEVERITY.put("copy_number_change", 42); - SO_SEVERITY.put("transcript_ablation", 41); - SO_SEVERITY.put("structural_variant", 40); - SO_SEVERITY.put("splice_acceptor_variant", 39); - SO_SEVERITY.put("splice_donor_variant", 38); - SO_SEVERITY.put("stop_gained", 37); - SO_SEVERITY.put("frameshift_variant", 36); - SO_SEVERITY.put("stop_lost", 35); - SO_SEVERITY.put("terminator_codon_variant", 34); - SO_SEVERITY.put("start_lost", 34); - SO_SEVERITY.put("initiator_codon_variant", 33); - SO_SEVERITY.put("transcript_amplification", 32); - SO_SEVERITY.put("inframe_insertion", 31); - SO_SEVERITY.put("inframe_deletion", 30); - SO_SEVERITY.put("inframe_variant", 29); - SO_SEVERITY.put("missense_variant", 28); - SO_SEVERITY.put("splice_region_variant", 27); - SO_SEVERITY.put("incomplete_terminal_codon_variant", 26); - SO_SEVERITY.put("stop_retained_variant", 25); - SO_SEVERITY.put("start_retained_variant", 24); - SO_SEVERITY.put("synonymous_variant", 23); - SO_SEVERITY.put("coding_sequence_variant", 22); - SO_SEVERITY.put("mature_miRNA_variant", 21); - SO_SEVERITY.put("5_prime_UTR_variant", 20); - SO_SEVERITY.put("3_prime_UTR_variant", 19); - SO_SEVERITY.put("non_coding_transcript_exon_variant", 18); - SO_SEVERITY.put("intron_variant", 17); - SO_SEVERITY.put("NMD_transcript_variant", 16); - SO_SEVERITY.put("non_coding_transcript_variant", 15); - SO_SEVERITY.put("2KB_upstream_variant", 14); - SO_SEVERITY.put("upstream_gene_variant", 13); - SO_SEVERITY.put("2KB_downstream_variant", 12); - SO_SEVERITY.put("downstream_gene_variant", 11); - SO_SEVERITY.put("TFBS_ablation", 10); - SO_SEVERITY.put("TFBS_amplification", 9); - SO_SEVERITY.put("TF_binding_site_variant", 8); - SO_SEVERITY.put("regulatory_region_ablation", 7); - SO_SEVERITY.put("regulatory_region_amplification", 6); - SO_SEVERITY.put("regulatory_region_variant", 5); - SO_SEVERITY.put("feature_elongation", 4); - SO_SEVERITY.put("feature_truncation", 3); - SO_SEVERITY.put("feature_variant", 2); - SO_SEVERITY.put("intergenic_variant", 1); - - CODING_SO_NAMES.add(STOP_RETAINED_VARIANT); - CODING_SO_NAMES.add(START_RETAINED_VARIANT); - CODING_SO_NAMES.add(SYNONYMOUS_VARIANT); - CODING_SO_NAMES.add(STOP_GAINED); - CODING_SO_NAMES.add(INITIATOR_CODON_VARIANT); - CODING_SO_NAMES.add(START_LOST); - CODING_SO_NAMES.add(STOP_LOST); - CODING_SO_NAMES.add(MISSENSE_VARIANT); - - SO_NAMES_CORRECTIONS.put("nc_transcript_variant", "non_coding_transcript_variant"); - SO_NAMES_CORRECTIONS.put("non_coding_exon_variant", "non_coding_transcript_exon_variant"); - } - - public static String buildUpperLowerCaseString(String aa) { - if (StringUtils.isEmpty(aa)) { - return null; - } - StringBuilder stringBuilder = new StringBuilder(aa); - - for (int i = 1; i < stringBuilder.length(); i++) { - stringBuilder.setCharAt(i, String.valueOf(stringBuilder.charAt(i)).toLowerCase().charAt(0)); - } - - return stringBuilder.toString(); - } - - public static Boolean isSynonymousCodon(String codon1, String codon2) { - return isSynonymousCodon(false, codon1, codon2); - } - - public static Boolean isSynonymousCodon(Boolean mitochondrialCode, String codon1, String codon2) { -// Map geneticCode = null; - if (mitochondrialCode) { - return MT_IS_SYNONYMOUS_CODON.get(codon1.toUpperCase()).get(codon2.toUpperCase()); - } else { - return IS_SYNONYMOUS_CODON.get(codon1.toUpperCase()).get(codon2.toUpperCase()); - } - } - - public static Boolean isStopCodon(String codon) { - return isStopCodon(false, codon); - } - - public static Boolean isStopCodon(boolean mitochondrialCode, String codon) { - if (mitochondrialCode) { - if (codon.equals(TAA) || codon.equals(TAG) || codon.equals(AGA) || codon.equals(AGG)) { - return true; - } - } else { - if (codon.equals(TAA) || codon.equals(TGA) || codon.equals(TAG)) { - return true; - } - } - return false; - } - - public static boolean isStartCodon(boolean mitochondrialCode, String codon) { - if (mitochondrialCode) { - if (codon.equals(ATG) || codon.equals(ATA)) { - return true; - } - } else { - if (codon.equals(ATG)) { - return true; - } - } - return false; - } - - - public static String getAminoacid(boolean mitochondrialCode, String codon) { - if (mitochondrialCode) { - return MT_CODON_TO_A.get(codon); - } else { - return CODON_TO_A.get(codon); - } } - public static List getSequenceOntologyTerms(Iterable soNames) throws SOTermNotAvailableException { - List sequenceOntologyTerms = new ArrayList<>(); - for (String name : soNames) { - name = fixSONameIfNeeded(name); - sequenceOntologyTerms.add(newSequenceOntologyTerm(name)); - } - return sequenceOntologyTerms; - } - - private static String fixSONameIfNeeded(String name) { - String fixedName = SO_NAMES_CORRECTIONS.get(name); - return fixedName == null ? name : fixedName; - } - - public static SequenceOntologyTerm newSequenceOntologyTerm(String name) throws SOTermNotAvailableException { - return new SequenceOntologyTerm(ConsequenceTypeMappings.getSoAccessionString(name), name); - } - - public static String buildVariantId(String chromosome, int start, String reference, String alternate) { - StringBuilder stringBuilder = new StringBuilder(); - - appendChromosome(chromosome, stringBuilder) - .append(SEPARATOR_CHAR) - .append(StringUtils.leftPad(Integer.toString(start), 10, " ")) - .append(SEPARATOR_CHAR); - -// if (reference.length() > Variant.SV_THRESHOLD) { -// stringBuilder.append(new String(CryptoUtils.encryptSha1(reference))); -// } else if (!(reference == null || reference.isEmpty() || reference.equals("-"))) { - if (!(reference == null || reference.isEmpty() || reference.equals("-"))) { - stringBuilder.append(reference); - } - stringBuilder.append(SEPARATOR_CHAR); -// if (alternate.length() > Variant.SV_THRESHOLD) { -// stringBuilder.append(new String(CryptoUtils.encryptSha1(alternate))); -// } else if (!(alternate == null || alternate.isEmpty() || alternate.equals("-"))) { - if (!(alternate == null || alternate.isEmpty() || alternate.equals("-"))) { - stringBuilder.append(alternate); - } - return stringBuilder.toString(); - } - - protected static StringBuilder appendChromosome(String chromosome, StringBuilder stringBuilder) { - if (chromosome.length() == 1 && Character.isDigit(chromosome.charAt(0))) { - stringBuilder.append(' '); - } - return stringBuilder.append(chromosome); + public static String reverseComplement(String string) { + return reverseComplement(string, false); } - public static String reverseComplement(String string) { + public static String reverseComplement(String string, boolean failOnUnknownNt) { StringBuilder stringBuilder = new StringBuilder(string).reverse(); for (int i = 0; i < stringBuilder.length(); i++) { char nextNt = stringBuilder.charAt(i); @@ -660,30 +64,19 @@ public static String reverseComplement(String string) { if (VariantAnnotationUtils.COMPLEMENTARY_NT.containsKey(nextNt)) { stringBuilder.setCharAt(i, VariantAnnotationUtils.COMPLEMENTARY_NT.get(nextNt)); } else { - return null; + if (failOnUnknownNt) { + throw new IllegalArgumentException("Unknown nucleotide: '" + nextNt+ "'. " + + "Unable to reverse-complement sequence '" + string + "'."); + } else { + return null; + } } } return stringBuilder.toString(); } - - public static String translate(String dnaSequence) { - return translate(dnaSequence, "-"); - } - - public static String translate(String dnaSequence, String separator) { - StringBuilder aaSequence = new StringBuilder(); - dnaSequence = dnaSequence.toUpperCase(); - dnaSequence = dnaSequence.replaceAll("T", "U"); - for (int i = 0; i < dnaSequence.length(); i += 3) { - if (i + 2 < dnaSequence.length()) { - aaSequence.append(CODON_TO_A.get(dnaSequence.charAt(i) + dnaSequence.charAt(i + 1) + dnaSequence.charAt(i + 2))); - if (i + 3 < dnaSequence.length()) { - aaSequence.append(separator); - } - } - } - return aaSequence.toString(); + public static AlleleOrigin parseAlleleOrigin(String alleleOrigin) { + return ORIGIN_STRING_TO_ALLELE_ORIGIN.get(alleleOrigin); } } diff --git a/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParser.java b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParser.java index 301ec9ec..85ff905f 100755 --- a/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParser.java +++ b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParser.java @@ -332,21 +332,12 @@ private static boolean parseSnv(String mutationCds, SequenceLocation sequenceLoc private static String getPositiveStrandString(String alleleString, String strand) { if (strand.equals("-")) { - return reverseComplementary(alleleString); + return VariantAnnotationUtils.reverseComplement(alleleString, true); } else { return alleleString; } } - private static String reverseComplementary(String alleleString) { - char[] reverseAlleleString = new StringBuilder(alleleString).reverse().toString().toCharArray(); - for (int i = 0; i < reverseAlleleString.length; i++) { - reverseAlleleString[i] = VariantAnnotationUtils.COMPLEMENTARY_NT.get(reverseAlleleString[i]); - } - - return String.valueOf(reverseAlleleString); - } - private static EvidenceEntry buildCosmic(String name, String version, String assembly, String[] fields) { String id = fields[ID_COLUMN]; String url = "https://cancer.sanger.ac.uk/cosmic/search?q=" + id; @@ -478,12 +469,28 @@ private static GenomicFeature createGeneGenomicFeature(String featureId, Feature return new GenomicFeature(featureTypes, null, map); } + private static Map ORIGIN_STRING_TO_ALLELE_ORIGIN = new HashMap<>(); + + static { + + /////////////////////////////////////////////////////////////////////// + ///// ClinVar and Cosmic allele origins to SO terms /////////////// + /////////////////////////////////////////////////////////////////////// + ORIGIN_STRING_TO_ALLELE_ORIGIN.put("germline", AlleleOrigin.germline_variant); + ORIGIN_STRING_TO_ALLELE_ORIGIN.put("maternal", AlleleOrigin.maternal_variant); + ORIGIN_STRING_TO_ALLELE_ORIGIN.put("de novo", AlleleOrigin.de_novo_variant); + ORIGIN_STRING_TO_ALLELE_ORIGIN.put("paternal", AlleleOrigin.paternal_variant); + ORIGIN_STRING_TO_ALLELE_ORIGIN.put("somatic", AlleleOrigin.somatic_variant); + } + + private static List getAlleleOriginList(List sourceOriginList) { List alleleOrigin; alleleOrigin = new ArrayList<>(sourceOriginList.size()); for (String originString : sourceOriginList) { - if (VariantAnnotationUtils.ORIGIN_STRING_TO_ALLELE_ORIGIN.containsKey(originString)) { - alleleOrigin.add(VariantAnnotationUtils.ORIGIN_STRING_TO_ALLELE_ORIGIN.get(originString)); + AlleleOrigin alleleOriginValue = VariantAnnotationUtils.parseAlleleOrigin(originString); + if (alleleOriginValue != null) { + alleleOrigin.add(alleleOriginValue); } else { logger.debug("No SO term found for allele origin {}. Skipping.", originString); } From 2e3963694246ffb0c579a01d307445aabcdd8349 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 26 Jun 2024 16:25:26 +0200 Subject: [PATCH 04/10] models: set public constructors, #TASK-5913, #TASK-5318 On branch TASK-5318 Changes to be committed: modified: biodata-models/src/main/java/org/opencb/biodata/models/sequence/SequenceLocation.java --- .../org/opencb/biodata/models/sequence/SequenceLocation.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/biodata-models/src/main/java/org/opencb/biodata/models/sequence/SequenceLocation.java b/biodata-models/src/main/java/org/opencb/biodata/models/sequence/SequenceLocation.java index a2d8298f..0d8a6cea 100644 --- a/biodata-models/src/main/java/org/opencb/biodata/models/sequence/SequenceLocation.java +++ b/biodata-models/src/main/java/org/opencb/biodata/models/sequence/SequenceLocation.java @@ -11,11 +11,11 @@ public class SequenceLocation { public SequenceLocation() { } - SequenceLocation(String chromosome, int start, int end, String reference, String alternate) { + public SequenceLocation(String chromosome, int start, int end, String reference, String alternate) { this(chromosome, start, end, reference, alternate, "+"); } - SequenceLocation(String chromosome, int start, int end, String reference, String alternate, String strand) { + public SequenceLocation(String chromosome, int start, int end, String reference, String alternate, String strand) { this.chromosome = chromosome; this.start = start; this.end = end; From 09fd3c3a50f9e90012ebdf337b4dd1c07c86a91d Mon Sep 17 00:00:00 2001 From: JuanfeSanahuja Date: Fri, 13 Sep 2024 11:56:51 +0200 Subject: [PATCH 05/10] Prepare next release 3.2.2-SNAPSHOT --- biodata-external/pom.xml | 2 +- biodata-formats/pom.xml | 2 +- biodata-models/pom.xml | 2 +- biodata-tools/pom.xml | 2 +- pom.xml | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/biodata-external/pom.xml b/biodata-external/pom.xml index dafe64c2..87218dda 100644 --- a/biodata-external/pom.xml +++ b/biodata-external/pom.xml @@ -6,7 +6,7 @@ biodata org.opencb.biodata - 3.2.1 + 3.2.2-SNAPSHOT ../pom.xml diff --git a/biodata-formats/pom.xml b/biodata-formats/pom.xml index e071844f..67e13301 100644 --- a/biodata-formats/pom.xml +++ b/biodata-formats/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 3.2.1 + 3.2.2-SNAPSHOT ../pom.xml diff --git a/biodata-models/pom.xml b/biodata-models/pom.xml index f2311443..68804eb5 100644 --- a/biodata-models/pom.xml +++ b/biodata-models/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 3.2.1 + 3.2.2-SNAPSHOT ../pom.xml diff --git a/biodata-tools/pom.xml b/biodata-tools/pom.xml index a9a269a7..dc8bf9a4 100644 --- a/biodata-tools/pom.xml +++ b/biodata-tools/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 3.2.1 + 3.2.2-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index cd18672d..cf8a2ba6 100644 --- a/pom.xml +++ b/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 3.2.1 + 3.2.2-SNAPSHOT pom Biodata @@ -38,7 +38,7 @@ - 5.2.1 + 5.2.2-SNAPSHOT 2.14.3 4.4 From 85ea39e8ffd51e9fd75d61dcfe86a77c65df1975 Mon Sep 17 00:00:00 2001 From: JuanfeSanahuja Date: Mon, 16 Sep 2024 15:02:16 +0200 Subject: [PATCH 06/10] SDLC: Prepare release 2.3.0 de Xetabase #TASK-6879 --- biodata-external/pom.xml | 2 +- biodata-formats/pom.xml | 2 +- biodata-models/pom.xml | 2 +- biodata-tools/pom.xml | 2 +- pom.xml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/biodata-external/pom.xml b/biodata-external/pom.xml index 87218dda..d6d1acdd 100644 --- a/biodata-external/pom.xml +++ b/biodata-external/pom.xml @@ -6,7 +6,7 @@ biodata org.opencb.biodata - 3.2.2-SNAPSHOT + 3.3.0-SNAPSHOT ../pom.xml diff --git a/biodata-formats/pom.xml b/biodata-formats/pom.xml index 67e13301..c3332c2e 100644 --- a/biodata-formats/pom.xml +++ b/biodata-formats/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 3.2.2-SNAPSHOT + 3.3.0-SNAPSHOT ../pom.xml diff --git a/biodata-models/pom.xml b/biodata-models/pom.xml index 68804eb5..7325a383 100644 --- a/biodata-models/pom.xml +++ b/biodata-models/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 3.2.2-SNAPSHOT + 3.3.0-SNAPSHOT ../pom.xml diff --git a/biodata-tools/pom.xml b/biodata-tools/pom.xml index dc8bf9a4..e8d90ada 100644 --- a/biodata-tools/pom.xml +++ b/biodata-tools/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 3.2.2-SNAPSHOT + 3.3.0-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index cf8a2ba6..7bd5b7ee 100644 --- a/pom.xml +++ b/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 3.2.2-SNAPSHOT + 3.3.0-SNAPSHOT pom Biodata From f04f854a6ef7d50506d9a7169a63b8ad2ab968c7 Mon Sep 17 00:00:00 2001 From: JuanfeSanahuja Date: Mon, 16 Sep 2024 15:02:16 +0200 Subject: [PATCH 07/10] SDLC: Prepare release 2.3.0 of Xetabase #TASK-6879 --- biodata-external/pom.xml | 2 +- biodata-formats/pom.xml | 2 +- biodata-models/pom.xml | 2 +- biodata-tools/pom.xml | 2 +- pom.xml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/biodata-external/pom.xml b/biodata-external/pom.xml index 87218dda..d6d1acdd 100644 --- a/biodata-external/pom.xml +++ b/biodata-external/pom.xml @@ -6,7 +6,7 @@ biodata org.opencb.biodata - 3.2.2-SNAPSHOT + 3.3.0-SNAPSHOT ../pom.xml diff --git a/biodata-formats/pom.xml b/biodata-formats/pom.xml index 67e13301..c3332c2e 100644 --- a/biodata-formats/pom.xml +++ b/biodata-formats/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 3.2.2-SNAPSHOT + 3.3.0-SNAPSHOT ../pom.xml diff --git a/biodata-models/pom.xml b/biodata-models/pom.xml index 68804eb5..7325a383 100644 --- a/biodata-models/pom.xml +++ b/biodata-models/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 3.2.2-SNAPSHOT + 3.3.0-SNAPSHOT ../pom.xml diff --git a/biodata-tools/pom.xml b/biodata-tools/pom.xml index dc8bf9a4..e8d90ada 100644 --- a/biodata-tools/pom.xml +++ b/biodata-tools/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 3.2.2-SNAPSHOT + 3.3.0-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index cf8a2ba6..7bd5b7ee 100644 --- a/pom.xml +++ b/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 3.2.2-SNAPSHOT + 3.3.0-SNAPSHOT pom Biodata From ce681b04ecb3911458622e73d802474947146f1d Mon Sep 17 00:00:00 2001 From: JuanfeSanahuja Date: Tue, 17 Sep 2024 14:57:22 +0200 Subject: [PATCH 08/10] pom:update internal dependencies #TASK-6879 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 7bd5b7ee..6dc030db 100644 --- a/pom.xml +++ b/pom.xml @@ -38,7 +38,7 @@ - 5.2.2-SNAPSHOT + 5.3.0-SNAPSHOT 2.14.3 4.4 From 73e85d6b0302b42cfe3ffe5968b7c4c4324eabdf Mon Sep 17 00:00:00 2001 From: JuanfeSanahuja Date: Tue, 17 Sep 2024 15:08:41 +0200 Subject: [PATCH 09/10] cicd scripts fix new branch and version references #TASK-6879 --- .github/workflows/scripts/get-xetabase-branch.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/scripts/get-xetabase-branch.sh b/.github/workflows/scripts/get-xetabase-branch.sh index a1eb7e52..781e29a3 100644 --- a/.github/workflows/scripts/get-xetabase-branch.sh +++ b/.github/workflows/scripts/get-xetabase-branch.sh @@ -19,11 +19,11 @@ get_xetabase_branch() { return 0 fi - # Check if the branch name starts with "release-" and follows the patterns "release-a.b.x" or "release-a.b.c.x" - if [[ "$input_branch" =~ ^release-([0-9]+)\.([0-9]+)\.x$ ]] || [[ "$input_branch" =~ ^release-([0-9]+)\.([0-9]+)\.([0-9]+)\.x$ ]]; then + # Check if the branch name starts with "release-" and follows the patterns "release-a.x.x" or "release-a.b.x" + if [[ "$input_branch" =~ ^release-([0-9]+)\.x\.x$ ]] || [[ "$input_branch" =~ ^release-([0-9]+)\.([0-9]+)\.x$ ]]; then # Extract the MAJOR part of the branch name MAJOR=${BASH_REMATCH[1]} - # Calculate the XETABASE_MAJOR by subtracting 3 from MAJOR + # Calculate the XETABASE_MAJOR by subtracting 1 from MAJOR XETABASE_MAJOR=$((MAJOR - 1)) # Check if the XETABASE_MAJOR is negative if (( XETABASE_MAJOR < 0 )); then From 6a20e30feaa7379cc553914a78d0353e505f411e Mon Sep 17 00:00:00 2001 From: JuanfeSanahuja Date: Tue, 17 Sep 2024 15:14:03 +0200 Subject: [PATCH 10/10] cicd scripts fix new branch and version references #TASK-6879 --- .github/workflows/pull-request-approved.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pull-request-approved.yml b/.github/workflows/pull-request-approved.yml index d339f65b..ba378d3a 100644 --- a/.github/workflows/pull-request-approved.yml +++ b/.github/workflows/pull-request-approved.yml @@ -24,7 +24,7 @@ jobs: chmod +x ./.github/workflows/scripts/get-xetabase-branch.sh echo "github.event.pull_request.base.ref: ${{ github.event.pull_request.base.ref }}" echo "github.event.pull_request.head.ref: ${{ github.event.pull_request.head.ref }}" - xetabase_branch=$(./.github/workflows/scripts/get-xetabase-branch.sh ${{ github.event.pull_request.head.ref }}) + xetabase_branch=$(./.github/workflows/scripts/get-xetabase-branch.sh ${{ github.event.pull_request.base.ref }}) echo "__Xetabase ref:__ \"${xetabase_branch}\"" | tee -a ${GITHUB_STEP_SUMMARY} echo "xetabase_branch=${xetabase_branch}" >> $GITHUB_OUTPUT env: