Updated to match latest gallia-core changes

galliaproject · Oct 8, 2022 · 51ce0fb · 51ce0fb
1 parent a8f539b
commit 51ce0fb
Show file tree

Hide file tree

Showing 4 changed files with 43 additions and 41 deletions.
diff --git a/src/main/scala/galliaexample/clinvar/ClinvarOriginDecoding.scala b/src/main/scala/galliaexample/clinvar/ClinvarOriginDecoding.scala
@@ -1,6 +1,7 @@
 package galliaexample.clinvar
 
-import aptus.Anything_
+import scala.util.chaining._
+import aptus.Anything_ // for .assert
 
 // ===========================================================================
 object ClinvarOriginDecoding {
@@ -42,13 +43,13 @@ object ClinvarOriginDecoding {
   // ===========================================================================
   def breakDownValue(value: Int): Seq[Int] =
      value
-       .toBinaryString            // eg for 11: 1011
-       .reverse                   // eg for 11: 1101
-       .zipWithIndex              // eg for 11: [(1, 0), (1, 1), (0, 2), (1, 3)]
-       .filter(_._1 == '1')       // eg for 11: [(1, 0), (1, 1),       , (1, 3)]
-       .map   (_._2       )       // eg for 11: [    0 ,     1 ,             3 ]
-       .map(math.pow(2, _).toInt) // eg for 11: [    1,      2 ,             8 ] -> the sum of which is indeed 11
-       .thn(subValues =>
+       .toBinaryString             // eg for 11: 1011
+       .reverse                    // eg for 11: 1101
+       .zipWithIndex               // eg for 11: [(1, 0), (1, 1), (0, 2), (1, 3)]
+       .filter(_._1 == '1')        // eg for 11: [(1, 0), (1, 1),       , (1, 3)]
+       .map   (_._2       )        // eg for 11: [    0 ,     1 ,             3 ]
+       .map (math.pow(2, _).toInt) // eg for 11: [    1,      2 ,             8 ] -> the sum of which is indeed 11
+       .pipe(subValues =>
          subValues
            .sorted
            .assert(_ == subValues) /* should already be sorted by design */)

diff --git a/src/main/scala/galliaexample/clinvar/ClinvarVcf.scala b/src/main/scala/galliaexample/clinvar/ClinvarVcf.scala
@@ -1,6 +1,7 @@
 package galliaexample.clinvar
 
-import aptus.{Anything_, String_}
+import scala.util.chaining._
+import aptus.String_ // for .extractGroups utility
 import gallia._
 
 // ===========================================================================
@@ -23,51 +24,51 @@ object ClinvarVcf { // 210102155202
       .remove(RS)
 
           // ---------------------------------------------------------------------------
-          .thn(processDiseaseFields(
+          .pipe(processDiseaseFields(
                 CLNDN   , /* disease name, eg: "Myasthenic_syndrome,_congenital,_8|not_specified"   */
                 CLNDISDB, /* disease db  , eg:        "MedGen:C3808739,OMIM:615120|MedGen:CN169374" */
               newKey = disease))
 
-          .thn(processDiseaseFields(
+          .pipe(processDiseaseFields(
                 CLNDNINCL   , /* disease name INCL - eg "Small_fiber_neuropathy";        "For included Variant: ClinVar's preferred disease name for the concept specified by disease identifiers in CLNDISDB"*/
                 CLNDISDBINCL, /* disease db   INCL - eg "MedGen:C0220754", "OMIM:253260"; "For included Variant: Tag-value pairs of disease database name and identifier, e.g. OMIM:NNNNNN" */
               newKey = disease_INCL))
 
           // ---------------------------------------------------------------------------
-          // "untuplify" -> see https://github.com/galliaproject/gallia-core#why-does-the-terminology-sometimes-sound-funny-or-full-on-neological
-          .untuplify1b(CLNSIGINCL ~> 'clinical_significance_for_including)
+          // "deserialize" -> see https://github.com/galliaproject/gallia-core#why-does-the-terminology-sometimes-sound-funny-or-full-on-neological
+          .deserialize1b(CLNSIGINCL ~> 'clinical_significance_for_including)
             .withSplitters(SemanticSeparators.Pipe, SemanticSeparators.Colon)
               .asNewKeys(
                   included_clinvar_variant_id  /* meaning: see 210118100341 */,
                   value                        /* clinical significance */)
 
           // ---------------------------------------------------------------------------
-          .untuplify1b(GENEINFO ~> 'genes)
+          .deserialize1b(GENEINFO ~> 'genes)
             .withSplitters(SemanticSeparators.Pipe, SemanticSeparators.Colon)
               .asNewKeys(symbol, entrez)
 
           // ---------------------------------------------------------------------------
-          .untuplify1b(MC ~> 'molecular_consequences)
+          .deserialize1b(MC ~> 'molecular_consequences)
             // inconsistently using pipe here as tuple separator, eg "SO:0001583|missense_variant,SO:0001623|5_prime_UTR_variant"
             .withSplitters(SemanticSeparators.Comma, SemanticSeparators.Pipe)
               .asNewKeys(
                   /* Sequence Ontology ID  */ 'term, // eg "SO:0001583"
                   /* molecular_consequence */ 'name) // eg "missense_variant"
 
           // ---------------------------------------------------------------------------
-          .untuplify1b(CLNVI)
+          .deserialize1b(CLNVI)
             .withSplitters(SemanticSeparators.Pipe, EntrySplitter)
               .asNewKeys(name, id /* mostly internal IDs */)
 
           // ---------------------------------------------------------------------------
-          .untuplify1b(CLNSIGCONF) // eg "Likely_pathogenic(3)%3BPathogenic(1)%3BUncertain_significance(2)"
+          .deserialize1b(CLNSIGCONF) // eg "Likely_pathogenic(3)%3BPathogenic(1)%3BUncertain_significance(2)"
             .withSplitters(
                 arraySplitter = "%3B", // see br210112171706 for comma; TODO: figure out the (\d) part, only 5 distinct values if not for these
                   entriesSplitter = _.extractGroups(ConflictsRegex).get)
               .asNewKeys(value, count)
 
           // ---------------------------------------------------------------------------
-          .convert  (SSR).toNonRequired
+          .convert  (SSR).toOptional
           .translate(SSR) // "Variant Suspect Reason Codes. One or more of the following values may be added"
             .usingStrict(SsrMapping) // TODO: how come never actually summed, unlike ORIGIN? really only 1, 16 and 17 in just a handful of values...
 
@@ -91,9 +92,9 @@ object ClinvarVcf { // 210102155202
                 'AF_EXAC ~>  EXAC , // allele frequencies from ExAC
                 'AF_TGP  ~> `1KGP`) // allele frequencies from KGP
               .under        ('AF)
-            .convert        ('AF).toNonRequired           // won't be required after t210122162650 is addressed
-            .transformObject('AF).using {
-              _ .convert(ESP, EXAC, `1KGP`).toNonRequired // won't be required after t210122162650 is addressed
+            .convert        ('AF).toOptional           // won't be required after t210122162650 is addressed
+            .transformEntity('AF).using {
+              _ .convert(ESP, EXAC, `1KGP`).toOptional // won't be required after t210122162650 is addressed
                 .forLeafPaths(_.convert(_).toDouble) }
 
           // ---------------------------------------------------------------------------
@@ -111,15 +112,15 @@ object ClinvarVcf { // 210102155202
           .splitBy(SemanticSeparators.Pipe)
             .underNewKey(newKey)
 
-      .convert(newKey.value).toNonRequired // eg disease_INCL...
+      .convert(newKey.value).toOptional // eg disease_INCL...
 
       // ---------------------------------------------------------------------------
-      .transform(_.objz(newKey.value)).using {
+      .transform(_.entities(newKey.value)).using {
           // eg "MedGen:C3808739,OMIM:615120" - "Tag-value pairs of disease database name and identifier, e.g. OMIM:NNNNNN"
           _ .removeIfValueFor('terms).is(".")
 
-            // "untuplify" -> see https://github.com/galliaproject/gallia-core#why-does-the-terminology-sometimes-sound-funny-or-full-on-neological
-            .untuplify1b('terms)
+            // "deserialize" -> see https://github.com/galliaproject/gallia-core#why-does-the-terminology-sometimes-sound-funny-or-full-on-neological
+            .deserialize1b('terms)
               .withSplitters(SemanticSeparators.Comma, EntrySplitter)
                 .asNewKeys(database, id) }
 

diff --git a/src/main/scala/galliaexample/clinvar/ClinvarVcfDriver.scala b/src/main/scala/galliaexample/clinvar/ClinvarVcfDriver.scala
@@ -1,7 +1,7 @@
 package galliaexample
 package clinvar
 
-import aptus.Anything_ // for .thn()
+import scala.util.chaining._ // for .pipe
 import gallia._
 
 // ===========================================================================
@@ -66,25 +66,25 @@ object ClinvarVcfDriver {
 
   // ===========================================================================
   def main(args: Array[String]) {
-    InputFile
+    apply(InputFile).write(OutputFile)
 
-        .stream(_.lines.iteratorMode)
+    ()
+  }
+
+  // ---------------------------------------------------------------------------
+  def apply(path: String): HeadS =
+    path
+
+        .stream(_.lines)
         .logProgress(1000)
-        .thn(vcf.Vcf.processLines(
+        .pipe(vcf.Vcf.processLines(
               // INFO keys; could also extract them from VCF header if it is well-formed (not always the case...)
               'ALLELEID, 'RS,
               'CLNDN, 'CLNDNINCL, 'CLNDISDB, 'CLNDISDBINCL, 'CLNHGVS, 'CLNREVSTAT,
               'CLNSIG, 'CLNVC, 'CLNVCSO, 'CLNSIGINCL, 'CLNVI, 'CLNSIGCONF,
               'GENEINFO, 'DBVARID, 'ORIGIN, 'MC, 'SSR,
               'AF_ESP, 'AF_EXAC, 'AF_TGP))
         .map(ClinvarVcf.apply _)
-
-      .write(OutputFile)
-
-    ()
-  }
-
-
 }
 
 // ===========================================================================
diff --git a/src/main/scala/galliaexample/vcf/Vcf.scala b/src/main/scala/galliaexample/vcf/Vcf.scala
@@ -11,17 +11,17 @@ object Vcf { // 210211093714
     _ .filterBy(_line).matches(!_.startsWith("#"))
 
       .fission(_.string(_line))
-          .as('CHROM, 'POS, 'ID, 'REF, 'ALT, 'QUAL, 'FILTER, 'INFO)
+          .as("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO")
             .using {
               _ .splitXsv('\t' /* accounts for escaping */)
                 .force.tuple8 }
 
-      .convert('POS).toInt // ID is also an integer but not one meant to be used as such
-      .removeIfValueFor('QUAL, 'FILTER).is(".")
+      .convert("POS").toInt // ID is also an integer but not one meant to be used as such
+      .removeIfValueFor("QUAL", "FILTER").is(".")
 
       // ---------------------------------------------------------------------------
-      // "untuplify" -> see https://github.com/galliaproject/gallia-core#why-does-the-terminology-sometimes-sound-funny-or-full-on-neological
-      .untuplify2a('INFO)
+      // "deserialize" -> see https://github.com/galliaproject/gallia-core#why-does-the-terminology-sometimes-sound-funny-or-full-on-neological
+      .deserialize2a("INFO")
         .withSplitters( // see VCF specification (eg "RS=1235;ALLELEID=...")
             entriesSplitter = ";",
               entrySplitter = "=")