Skip to content

Commit

Permalink
Updated to match latest gallia-core changes
Browse files Browse the repository at this point in the history
  • Loading branch information
anthony-cros committed Oct 8, 2022
1 parent a8f539b commit 51ce0fb
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 41 deletions.
17 changes: 9 additions & 8 deletions src/main/scala/galliaexample/clinvar/ClinvarOriginDecoding.scala
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package galliaexample.clinvar

import aptus.Anything_
import scala.util.chaining._
import aptus.Anything_ // for .assert

// ===========================================================================
object ClinvarOriginDecoding {
Expand Down Expand Up @@ -42,13 +43,13 @@ object ClinvarOriginDecoding {
// ===========================================================================
def breakDownValue(value: Int): Seq[Int] =
value
.toBinaryString // eg for 11: 1011
.reverse // eg for 11: 1101
.zipWithIndex // eg for 11: [(1, 0), (1, 1), (0, 2), (1, 3)]
.filter(_._1 == '1') // eg for 11: [(1, 0), (1, 1), , (1, 3)]
.map (_._2 ) // eg for 11: [ 0 , 1 , 3 ]
.map(math.pow(2, _).toInt) // eg for 11: [ 1, 2 , 8 ] -> the sum of which is indeed 11
.thn(subValues =>
.toBinaryString // eg for 11: 1011
.reverse // eg for 11: 1101
.zipWithIndex // eg for 11: [(1, 0), (1, 1), (0, 2), (1, 3)]
.filter(_._1 == '1') // eg for 11: [(1, 0), (1, 1), , (1, 3)]
.map (_._2 ) // eg for 11: [ 0 , 1 , 3 ]
.map (math.pow(2, _).toInt) // eg for 11: [ 1, 2 , 8 ] -> the sum of which is indeed 11
.pipe(subValues =>
subValues
.sorted
.assert(_ == subValues) /* should already be sorted by design */)
Expand Down
35 changes: 18 additions & 17 deletions src/main/scala/galliaexample/clinvar/ClinvarVcf.scala
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package galliaexample.clinvar

import aptus.{Anything_, String_}
import scala.util.chaining._
import aptus.String_ // for .extractGroups utility
import gallia._

// ===========================================================================
Expand All @@ -23,51 +24,51 @@ object ClinvarVcf { // 210102155202
.remove(RS)

// ---------------------------------------------------------------------------
.thn(processDiseaseFields(
.pipe(processDiseaseFields(
CLNDN , /* disease name, eg: "Myasthenic_syndrome,_congenital,_8|not_specified" */
CLNDISDB, /* disease db , eg: "MedGen:C3808739,OMIM:615120|MedGen:CN169374" */
newKey = disease))

.thn(processDiseaseFields(
.pipe(processDiseaseFields(
CLNDNINCL , /* disease name INCL - eg "Small_fiber_neuropathy"; "For included Variant: ClinVar's preferred disease name for the concept specified by disease identifiers in CLNDISDB"*/
CLNDISDBINCL, /* disease db INCL - eg "MedGen:C0220754", "OMIM:253260"; "For included Variant: Tag-value pairs of disease database name and identifier, e.g. OMIM:NNNNNN" */
newKey = disease_INCL))

// ---------------------------------------------------------------------------
// "untuplify" -> see https://github.com/galliaproject/gallia-core#why-does-the-terminology-sometimes-sound-funny-or-full-on-neological
.untuplify1b(CLNSIGINCL ~> 'clinical_significance_for_including)
// "deserialize" -> see https://github.com/galliaproject/gallia-core#why-does-the-terminology-sometimes-sound-funny-or-full-on-neological
.deserialize1b(CLNSIGINCL ~> 'clinical_significance_for_including)
.withSplitters(SemanticSeparators.Pipe, SemanticSeparators.Colon)
.asNewKeys(
included_clinvar_variant_id /* meaning: see 210118100341 */,
value /* clinical significance */)

// ---------------------------------------------------------------------------
.untuplify1b(GENEINFO ~> 'genes)
.deserialize1b(GENEINFO ~> 'genes)
.withSplitters(SemanticSeparators.Pipe, SemanticSeparators.Colon)
.asNewKeys(symbol, entrez)

// ---------------------------------------------------------------------------
.untuplify1b(MC ~> 'molecular_consequences)
.deserialize1b(MC ~> 'molecular_consequences)
// inconsistently using pipe here as tuple separator, eg "SO:0001583|missense_variant,SO:0001623|5_prime_UTR_variant"
.withSplitters(SemanticSeparators.Comma, SemanticSeparators.Pipe)
.asNewKeys(
/* Sequence Ontology ID */ 'term, // eg "SO:0001583"
/* molecular_consequence */ 'name) // eg "missense_variant"

// ---------------------------------------------------------------------------
.untuplify1b(CLNVI)
.deserialize1b(CLNVI)
.withSplitters(SemanticSeparators.Pipe, EntrySplitter)
.asNewKeys(name, id /* mostly internal IDs */)

// ---------------------------------------------------------------------------
.untuplify1b(CLNSIGCONF) // eg "Likely_pathogenic(3)%3BPathogenic(1)%3BUncertain_significance(2)"
.deserialize1b(CLNSIGCONF) // eg "Likely_pathogenic(3)%3BPathogenic(1)%3BUncertain_significance(2)"
.withSplitters(
arraySplitter = "%3B", // see br210112171706 for comma; TODO: figure out the (\d) part, only 5 distinct values if not for these
entriesSplitter = _.extractGroups(ConflictsRegex).get)
.asNewKeys(value, count)

// ---------------------------------------------------------------------------
.convert (SSR).toNonRequired
.convert (SSR).toOptional
.translate(SSR) // "Variant Suspect Reason Codes. One or more of the following values may be added">
.usingStrict(SsrMapping) // TODO: how come never actually summed, unlike ORIGIN? really only 1, 16 and 17 in just a handful of values...

Expand All @@ -91,9 +92,9 @@ object ClinvarVcf { // 210102155202
'AF_EXAC ~> EXAC , // allele frequencies from ExAC
'AF_TGP ~> `1KGP`) // allele frequencies from KGP
.under ('AF)
.convert ('AF).toNonRequired // won't be required after t210122162650 is addressed
.transformObject('AF).using {
_ .convert(ESP, EXAC, `1KGP`).toNonRequired // won't be required after t210122162650 is addressed
.convert ('AF).toOptional // won't be required after t210122162650 is addressed
.transformEntity('AF).using {
_ .convert(ESP, EXAC, `1KGP`).toOptional // won't be required after t210122162650 is addressed
.forLeafPaths(_.convert(_).toDouble) }

// ---------------------------------------------------------------------------
Expand All @@ -111,15 +112,15 @@ object ClinvarVcf { // 210102155202
.splitBy(SemanticSeparators.Pipe)
.underNewKey(newKey)

.convert(newKey.value).toNonRequired // eg disease_INCL...
.convert(newKey.value).toOptional // eg disease_INCL...

// ---------------------------------------------------------------------------
.transform(_.objz(newKey.value)).using {
.transform(_.entities(newKey.value)).using {
// eg "MedGen:C3808739,OMIM:615120" - "Tag-value pairs of disease database name and identifier, e.g. OMIM:NNNNNN"
_ .removeIfValueFor('terms).is(".")

// "untuplify" -> see https://github.com/galliaproject/gallia-core#why-does-the-terminology-sometimes-sound-funny-or-full-on-neological
.untuplify1b('terms)
// "deserialize" -> see https://github.com/galliaproject/gallia-core#why-does-the-terminology-sometimes-sound-funny-or-full-on-neological
.deserialize1b('terms)
.withSplitters(SemanticSeparators.Comma, EntrySplitter)
.asNewKeys(database, id) }

Expand Down
22 changes: 11 additions & 11 deletions src/main/scala/galliaexample/clinvar/ClinvarVcfDriver.scala
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package galliaexample
package clinvar

import aptus.Anything_ // for .thn()
import scala.util.chaining._ // for .pipe
import gallia._

// ===========================================================================
Expand Down Expand Up @@ -66,25 +66,25 @@ object ClinvarVcfDriver {

// ===========================================================================
def main(args: Array[String]) {
InputFile
apply(InputFile).write(OutputFile)

.stream(_.lines.iteratorMode)
()
}

// ---------------------------------------------------------------------------
def apply(path: String): HeadS =
path

.stream(_.lines)
.logProgress(1000)
.thn(vcf.Vcf.processLines(
.pipe(vcf.Vcf.processLines(
// INFO keys; could also extract them from VCF header if it is well-formed (not always the case...)
'ALLELEID, 'RS,
'CLNDN, 'CLNDNINCL, 'CLNDISDB, 'CLNDISDBINCL, 'CLNHGVS, 'CLNREVSTAT,
'CLNSIG, 'CLNVC, 'CLNVCSO, 'CLNSIGINCL, 'CLNVI, 'CLNSIGCONF,
'GENEINFO, 'DBVARID, 'ORIGIN, 'MC, 'SSR,
'AF_ESP, 'AF_EXAC, 'AF_TGP))
.map(ClinvarVcf.apply _)

.write(OutputFile)

()
}


}

// ===========================================================================
10 changes: 5 additions & 5 deletions src/main/scala/galliaexample/vcf/Vcf.scala
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,17 @@ object Vcf { // 210211093714
_ .filterBy(_line).matches(!_.startsWith("#"))

.fission(_.string(_line))
.as('CHROM, 'POS, 'ID, 'REF, 'ALT, 'QUAL, 'FILTER, 'INFO)
.as("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO")
.using {
_ .splitXsv('\t' /* accounts for escaping */)
.force.tuple8 }

.convert('POS).toInt // ID is also an integer but not one meant to be used as such
.removeIfValueFor('QUAL, 'FILTER).is(".")
.convert("POS").toInt // ID is also an integer but not one meant to be used as such
.removeIfValueFor("QUAL", "FILTER").is(".")

// ---------------------------------------------------------------------------
// "untuplify" -> see https://github.com/galliaproject/gallia-core#why-does-the-terminology-sometimes-sound-funny-or-full-on-neological
.untuplify2a('INFO)
// "deserialize" -> see https://github.com/galliaproject/gallia-core#why-does-the-terminology-sometimes-sound-funny-or-full-on-neological
.deserialize2a("INFO")
.withSplitters( // see VCF specification (eg "RS=1235;ALLELEID=...")
entriesSplitter = ";",
entrySplitter = "=")
Expand Down

0 comments on commit 51ce0fb

Please sign in to comment.