From 089874917ad2eb5e95edac66b2f102f2d10e363a Mon Sep 17 00:00:00 2001 From: Oliver Maus Date: Fri, 15 Nov 2024 17:15:11 +0100 Subject: [PATCH] Add PRIDE package v1.0.3 --- StagingArea/pride/pride@1.0.3.fsx | 349 ++++++++++++++++++++++++++++++ 1 file changed, 349 insertions(+) create mode 100644 StagingArea/pride/pride@1.0.3.fsx diff --git a/StagingArea/pride/pride@1.0.3.fsx b/StagingArea/pride/pride@1.0.3.fsx new file mode 100644 index 0000000..ba5d602 --- /dev/null +++ b/StagingArea/pride/pride@1.0.3.fsx @@ -0,0 +1,349 @@ +let []PACKAGE_METADATA = """(* +--- +Name: pride +Summary: Validates if the ARC contains the necessary metadata to be publishable via PRIDE. +Description: | + Validates if the ARC contains the necessary metadata to be publishable via PRIDE. + The following metadata is required: + - Investigation has title and description + - Investigation has Keywords comment in correct format + - All persons in Investigation Contacts must have a first name, last name, affiliation and valid email + - Study has protocol, tissue & species in correct format + - Assay has technology type, instrument model, and fixed and/or variable modification in correct format +MajorVersion: 1 +MinorVersion: 0 +PatchVersion: 3 +Publish: true +Authors: + - FullName: Oliver Maus + Email: maus@nfdi4plants.org + Affiliation: DataPLANT + - FullName: Christopher Lux + Email: lux@csbiology.de + Affiliation: RPTU Kaiserslautern + AffiliationLink: http://rptu.de/startseite +Tags: + - Name: validation + - Name: pride + - Name: proteomics +ReleaseNotes: | + - process graph tokens correctly resolved +--- +*)""" + + +#r "nuget: ARCExpect, 4.0.1" + + +open ControlledVocabulary +open Expecto +open ARCExpect +open ARCTokenization +open ARCTokenization.StructuralOntology +open System.IO + + +// Input: +let arcDir = Directory.GetCurrentDirectory() + + +// Values: +let absoluteDirectoryPaths = FileSystem.parseARCFileSystem arcDir + +let investigationMetadata = + absoluteDirectoryPaths + |> Investigation.parseMetadataSheetsFromTokens() arcDir + |> List.concat + +let studyMetadata = + absoluteDirectoryPaths + |> Study.parseMetadataSheetsFromTokens() arcDir + |> List.concat + +let assayMetadata = + absoluteDirectoryPaths + |> Assay.parseMetadataSheetsFromTokens() arcDir + |> List.concat + +let studyProcessGraphTokens = + try + absoluteDirectoryPaths + |> Study.parseProcessGraphColumnsFromTokens arcDir + |> Seq.collect Map.values + |> List.concat + with + | _ -> List.empty + +let assayProcessGraphTokens = + try + absoluteDirectoryPaths + |> Assay.parseProcessGraphColumnsFromTokens arcDir + |> Seq.collect Map.values + |> List.concat + with + | _ -> List.empty + +let organismTokens = + studyProcessGraphTokens + |> List.tryFind (fun cvpList -> cvpList.Head |> Param.getValueAsTerm = (CvTerm.create("OBI:0100026","organism","OBI"))) + |> Option.defaultValue [] + +let tissueTokens = + studyProcessGraphTokens + |> List.tryFind (fun cvpList -> cvpList.Head |> Param.getValueAsTerm = (CvTerm.create("NCIT:C12801","Tissue","NCIT"))) + |> Option.defaultValue [] + +let instrumentTokens = + assayProcessGraphTokens + |> List.tryFind (fun cvpList -> cvpList.Head |> Param.getValueAsTerm = (CvTerm.create("MS:1000031","instrument model","MS"))) + |> Option.defaultValue [] + +let modTokens = + assayProcessGraphTokens + |> List.filter ( + fun cvpList -> + cvpList.Head |> Param.getValueAsTerm = (CvTerm.create("MS:1003021","Fixed modification","MS")) || + cvpList.Head |> Param.getValueAsTerm = (CvTerm.create("MS:1003022","Variable modification","MS")) + ) + |> List.concat + +let techTypeName = CvTerm.create("ASSMSO:00000011", "Assay Technology Type", "ASSMSO") +let techTypeTAN = CvTerm.create("ASSMSO:00000013", "Assay Technology Type Term Accession Number", "ASSMSO") +let techTypeTSR = CvTerm.create("ASSMSO:00000015", "Assay Technology Type Term Source REF", "ASSMSO") + + +// Helper functions (to deposit in ARCExpect later): + +open System.Text + + +let characterLimit (lowerLimit : int option) (upperLimit : int option) = + match lowerLimit, upperLimit with + | None, None -> System.Text.RegularExpressions.Regex(@"^.{0,}$") + | Some ll, None -> System.Text.RegularExpressions.Regex($"^.{{{ll},}}$") + | None, Some ul -> System.Text.RegularExpressions.Regex($"^.{{0,{ul}}}$") + | Some ll, Some ul -> System.Text.RegularExpressions.Regex($"^.{{{ll},{ul}}}$") + + +type ErrorMessage with + + static member ofIParamCollection error iParamCollection = + + let iParam = Seq.head iParamCollection + + let str = new StringBuilder() + str.AppendFormat("['{0}', ..] {1}\n", Param.getCvName iParam, error) |> ignore + + match Param.tryGetValueOfCvParamAttr "FilePath" iParam with + | Some path -> + str.AppendFormat(" > filePath '{0}'\n", path) |> ignore + | None -> () + + match Param.tryGetValueOfCvParamAttr "Worksheet" iParam with + | Some sheet -> + str.AppendFormat(" > sheet '{0}'", sheet) |> ignore + | None -> () + + match Param.tryGetValueOfCvParamAttr "Row" iParam with + | Some row -> + str.AppendFormat(" > row '{0}'", row) |> ignore + | None -> () + + match Param.tryGetValueOfCvParamAttr "Column" iParam with + | Some column -> + str.AppendFormat(" > column '{0}'", column) |> ignore + | None -> () + + match Param.tryGetValueOfCvParamAttr "Line" iParam with + | Some line -> + str.AppendFormat(" > line '{0}'", line) |> ignore + | None -> () + + match Param.tryGetValueOfCvParamAttr "Position" iParam with + | Some position -> + str.AppendFormat(" > position '{0}'", position) |> ignore + | None -> () + str.ToString() + + +type Validate.ParamCollection with + + static member AllTermsSatisfyPredicate (projection : #IParam -> bool) (paramCollection : #seq<#IParam>) = + match Seq.forall projection paramCollection with + | true -> () + | false -> + ErrorMessage.ofIParamCollection $"does not satisfy the requirements" paramCollection + |> Expecto.Tests.failtestNoStackf "%s" + + +// Validation Cases: +let investigationCases = + testList INVMSO.``Investigation Metadata``.INVESTIGATION.key.Name [ + // Investigation has title + ARCExpect.validationCase (TestID.Name INVMSO.``Investigation Metadata``.INVESTIGATION.``Investigation Title``.Name) { + investigationMetadata + |> Validate.ParamCollection.ContainsNonKeyParamWithTerm + INVMSO.``Investigation Metadata``.INVESTIGATION.``Investigation Title`` + } + + // Investigation has description + ARCExpect.validationCase (TestID.Name INVMSO.``Investigation Metadata``.INVESTIGATION.``Investigation Description``.Name) { + investigationMetadata + |> Validate.ParamCollection.ContainsNonKeyParamWithTerm + INVMSO.``Investigation Metadata``.INVESTIGATION.``Investigation Description`` + } + + // Investigation has contacts with name, last name, affiliation and email + // Investigation Person First Name + ARCExpect.validationCase (TestID.Name $"{INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person First Name``.Name} exists") { + investigationMetadata + |> Validate.ParamCollection.ContainsNonKeyParamWithTerm INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person First Name`` + } + ARCExpect.validationCase (TestID.Name $"{INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person First Name``.Name} is not empty") { + investigationMetadata + |> Seq.filter (Param.getTerm >> (=) INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person First Name``) + |> Seq.iter Validate.Param.ValueIsNotEmpty + } + + // Investigation Person Last Name + ARCExpect.validationCase (TestID.Name $"{INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Last Name``.Name} exists") { + investigationMetadata + |> Validate.ParamCollection.ContainsNonKeyParamWithTerm INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Last Name`` + } + ARCExpect.validationCase (TestID.Name $"{INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Last Name``.Name} is not empty") { + investigationMetadata + |> Seq.filter (Param.getTerm >> (=) INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Last Name``) + |> Seq.iter Validate.Param.ValueIsNotEmpty + } + + // Investigation Person Affiliation + ARCExpect.validationCase (TestID.Name $"{INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Affiliation``.Name} exists") { + investigationMetadata + |> Validate.ParamCollection.ContainsNonKeyParamWithTerm INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Affiliation`` + } + ARCExpect.validationCase (TestID.Name $"{INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Affiliation``.Name} is not empty") { + investigationMetadata + |> Seq.filter (Param.getTerm >> (=) INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Affiliation``) + |> Seq.filter (Param.getValueAsString >> (<>) "Metadata Section Key") + |> Seq.iter Validate.Param.ValueIsNotEmpty + } + + // Investigation Person Email + ARCExpect.validationCase (TestID.Name $"{INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Email``.Name} exists") { + investigationMetadata + |> Validate.ParamCollection.ContainsNonKeyParamWithTerm INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Email`` + } + ARCExpect.validationCase (TestID.Name INVMSO.``Investigation Metadata``. ``INVESTIGATION CONTACTS``.``Investigation Person Email``.Name) { + investigationMetadata + |> Seq.filter (Param.getTerm >> (=) INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Email``) + |> Seq.filter (Param.getValueAsString >> (<>) "Metadata Section Key") + |> Seq.iter (Validate.Param.ValueMatchesRegex StringValidationPattern.email) + } + ] + +let studyCases = + testList STDMSO.``Study Metadata``.STUDY.key.Name [ + // Study has protocol in correct format + ARCExpect.validationCase (TestID.Name STDMSO.``Study Metadata``.``STUDY PROTOCOLS``.key.Name) { + studyMetadata + |> Validate.ParamCollection.ContainsParamWithTerm STDMSO.``Study Metadata``.``STUDY PROTOCOLS``.key + } + + // Study protocol is in correct format + ARCExpect.validationCase (TestID.Name "STUDY PROTOCOLS description") { + studyMetadata|> Seq.filter (fun iparam -> Param.getTerm iparam = STDMSO.``Study Metadata``.``STUDY PROTOCOLS``.``Study Protocol Description``) + |> Seq.filter (Param.getValueAsString >> (<>) "Metadata Section Key") + |> Seq.iter (Validate.Param.ValueMatchesRegex (characterLimit (Some 50) (Some 500))) + } + + // Study has tissue header in process graph + ARCExpect.validationCase (TestID.Name "Tissue") { + tissueTokens + |> Validate.ParamCollection.SatisfiesPredicate ( + fun iparams -> + iparams + |> List.exists (fun iparam -> iparam.Value = ParamValue.CvValue (CvTerm.create("NCIT:C12801","Tissue","NCIT"))) + ) + } + + // Study has tissue values as terms (CvValues) + ARCExpect.validationCase (TestID.Name "Tissue terms") { + tissueTokens + |> Validate.ParamCollection.AllTermsSatisfyPredicate (fun ip -> match ip.Value with CvValue _ -> true | _ -> false) + } + + // Study has species in correct format + ARCExpect.validationCase (TestID.Name "Organism") { + organismTokens + |> Validate.ParamCollection.SatisfiesPredicate ( + fun iparams -> + iparams + |> List.exists (fun iparam -> iparam.Value = ParamValue.CvValue (CvTerm.create("OBI:0100026","organism","OBI"))) + ) + } + + // Study has species values as terms (CvValues) + ARCExpect.validationCase (TestID.Name "Organism terms") { + tissueTokens + |> Validate.ParamCollection.AllTermsSatisfyPredicate (fun ip -> match ip.Value with CvValue _ -> true | _ -> false) + } + ] + +let assayCases = + testList ASSMSO.``Assay Metadata``.ASSAY.key.Name [ + + // Assay has technology type in correct format + ARCExpect.validationCase (TestID.Name techTypeTAN.Name) { + assayMetadata + |> Validate.ParamCollection.ContainsParamWithTerm techTypeTAN + } + ARCExpect.validationCase (TestID.Name techTypeName.Name) { + assayMetadata + |> Validate.ParamCollection.ContainsParamWithTerm techTypeName + } + ARCExpect.validationCase (TestID.Name techTypeTSR.Name) { + assayMetadata + |> Validate.ParamCollection.ContainsParamWithTerm techTypeTSR + } + + // Assay has instrument model + ARCExpect.validationCase (TestID.Name "instrument model") { + instrumentTokens + |> Validate.ParamCollection.SatisfiesPredicate ( + fun iparams -> + iparams + |> List.exists (fun iparam -> iparam.Value = ParamValue.CvValue (CvTerm.create("MS:1000031","instrument model","MS"))) + ) + } + + // Assay has instrument model in correct format + ARCExpect.validationCase (TestID.Name "instrument model terms") { + instrumentTokens + |> Validate.ParamCollection.AllTermsSatisfyPredicate (fun ip -> match ip.Value with CvValue _ -> true | _ -> false) + } + + // Assay has fixed or variable modification in correct format + ARCExpect.validationCase (TestID.Name "modification") { + modTokens + |> Validate.ParamCollection.SatisfiesPredicate ( + fun iparams -> + iparams + |> List.exists ( + fun iparam -> + iparam.Value = ParamValue.CvValue (CvTerm.create("MS:1003021","Fixed modification","MS")) || + iparam.Value = ParamValue.CvValue (CvTerm.create("MS:1003022","Variable modification","MS")) + ) + ) + } + ] + + +// Execution: + +Setup.ValidationPackage( + metadata = Setup.Metadata(PACKAGE_METADATA), + CriticalValidationCases = [investigationCases; studyCases; assayCases] +) +|> Execute.ValidationPipeline( + basePath = arcDir +) \ No newline at end of file