From 366410790ad8c402b756e245530a54a89ec7fbc2 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 19 Mar 2024 14:32:47 +0900 Subject: [PATCH 01/12] Update set mzQC for validator --- docs/pages/examples.md | 4 +- .../{set-of-runs.mzQC.md => intro_set.md} | 6 +- .../{set-of-runs.mzQC => intro_set.mzQC} | 158 ++++++++++-------- 3 files changed, 92 insertions(+), 76 deletions(-) rename docs/pages/worked-examples/{set-of-runs.mzQC.md => intro_set.md} (99%) rename specification_documents/examples/{set-of-runs.mzQC => intro_set.mzQC} (61%) diff --git a/docs/pages/examples.md b/docs/pages/examples.md index bd09aeca..6883446c 100644 --- a/docs/pages/examples.md +++ b/docs/pages/examples.md @@ -6,8 +6,8 @@ permalink: /examples/ Here are a number of worked examples, that, each for its own use-case, go step-by-step through the different parts of a mzQC. -- [Single mass spectrometry run](intro_run/) -- [Sets of runs](set-of-runs/) +- [Representing QC data for an individual mass spectrometry run](intro_run/) +- [Deriving QC data from multiple related mass spectrometry runs](intro_set/) - [QC sample mzQC](QC2-sample-example/) - [in mzML](mzml-mzqc-example/) - [Using USI with mzQC](USI-example/) diff --git a/docs/pages/worked-examples/set-of-runs.mzQC.md b/docs/pages/worked-examples/intro_set.md similarity index 99% rename from docs/pages/worked-examples/set-of-runs.mzQC.md rename to docs/pages/worked-examples/intro_set.md index dfc12f61..88ec2351 100644 --- a/docs/pages/worked-examples/set-of-runs.mzQC.md +++ b/docs/pages/worked-examples/intro_set.md @@ -1,7 +1,7 @@ --- layout: page -title: "Multi-Run (i.e. sets) Example of mzQC" -permalink: /examples/set-of-runs/ +title: "Introduction to mzQC – Multiple Mass Spectrometry Runs" +permalink: /examples/intro_set/ --- Here, we describe an mzQC JSON document used to convey QC data which is computed on a set of runs, i.e. 
@@ -456,4 +456,4 @@ On the other hand, ommitting the `healthy`/`diseased` setQualities is not sensib } ``` ### This is the mzQC file once again, in full: -**[sets-of-runs.mzQC](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/draft_v1/examples/set-of-runs.mzQC)** \ No newline at end of file +**[sets-of-runs.mzQC](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/draft_v1/examples/set-of-runs.mzQC)** diff --git a/specification_documents/examples/set-of-runs.mzQC b/specification_documents/examples/intro_set.mzQC similarity index 61% rename from specification_documents/examples/set-of-runs.mzQC rename to specification_documents/examples/intro_set.mzQC index 003ece56..4903d5ae 100644 --- a/specification_documents/examples/set-of-runs.mzQC +++ b/specification_documents/examples/intro_set.mzQC @@ -4,15 +4,15 @@ "creationDate": "2020-12-01T14:19:09Z", "contactName": "Chris Bielow", "contactAddress": "chris.bielow@bsc.fu-berlin.de", - "description": "A simple mzQC file containing information for sets of runs.", + "description": "A simple mzQC file containing information for a set of multiple mass spectrometry runs.", "setQualities": [ { "metadata": { "label": "healthy", "inputFiles": [ { - "name": "tr1_healthy", - "location": "file:///C:/msdata/techRep1_healthy.mzML", + "name": "techRep1_healthy", + "location": "file://C:/msdata/techRep1_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -26,8 +26,8 @@ ] }, { - "name": "tr2_healthy", - "location": "file:///C:/msdata/techRep2_healthy.mzML", + "name": "techRep2_healthy", + "location": "file://C:/msdata/techRep2_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -41,8 +41,8 @@ ] }, { - "name": "tr3_healthy", - "location": "file:///C:/msdata/techRep3_healthy.mzML", + "name": "techRep3_healthy", + "location": "file://C:/msdata/techRep3_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -60,23 +60,29 
@@ { "accession": "MS:1001058", "name": "quality estimation by manual validation", + "description": "The quality estimation was done manually.", "version": "0", "uri": "https://dx.doi.org/10.1021/pr201071t" }, { "accession": "MS:1000799", "name": "custom unreleased software tool", - "value": "mzqc-pylib", + "description": "A software tool that has not yet been released. The value should describe the software. Please do not use this term for publicly available software - contact the PSI-MS working group in order to have another CV term added.", "version": "0", - "uri": "https://hupo-psi.github.io/mzQC/unknown.html" + "uri": "https://hupo-psi.github.io/mzQC/" } ] }, "qualityMetrics": [ { - "accession": "QC:4000270", + "accession": "MS:4000XXX", "name": "protein contaminant intensity ratio", - "value": "0.25" + "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "value": 0.25, + "unit": { + "accession": "UO:0000190", + "name": "ratio" + } } ] }, @@ -85,8 +91,8 @@ "label": "diseased", "inputFiles": [ { - "name": "tr1_diseased", - "location": "file:///C:/msdata/techRep1_diseased.mzML", + "name": "techRep1_diseased", + "location": "file://C:/msdata/techRep1_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -100,8 +106,8 @@ ] }, { - "name": "tr2_diseased", - "location": "file:///C:/msdata/techRep2_diseased.mzML", + "name": "techRep2_diseased", + "location": "file://C:/msdata/techRep2_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -115,8 +121,8 @@ ] }, { - "name": "tr3_diseased", - "location": "file:///C:/msdata/techRep3_diseased.mzML", + "name": "techRep3_diseased", + "location": "file://C:/msdata/techRep3_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -134,16 +140,29 @@ { "accession": "MS:1001058", "name": "quality estimation by manual validation", + "description": "The quality 
estimation was done manually.", "version": "0", "uri": "https://dx.doi.org/10.1021/pr201071t" + }, + { + "accession": "MS:1000799", + "name": "custom unreleased software tool", + "description": "A software tool that has not yet been released. The value should describe the software. Please do not use this term for publicly available software - contact the PSI-MS working group in order to have another CV term added.", + "version": "0", + "uri": "https://hupo-psi.github.io/mzQC/" } ] }, "qualityMetrics": [ { - "accession": "QC:4000270", + "accession": "MS:4000XXX", "name": "protein contaminant intensity ratio", - "value": "0.31" + "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "value": 0.31, + "unit": { + "accession": "UO:0000190", + "name": "ratio" + } } ] }, @@ -152,8 +171,8 @@ "label": "all", "inputFiles": [ { - "name": "tr1_healthy", - "location": "file:///C:/msdata/techRep1_healthy.mzML", + "name": "techRep1_healthy", + "location": "file://C:/msdata/techRep1_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -167,8 +186,8 @@ ] }, { - "name": "tr2_healthy", - "location": "file:///C:/msdata/techRep2_healthy.mzML", + "name": "techRep2_healthy", + "location": "file://C:/msdata/techRep2_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -182,8 +201,8 @@ ] }, { - "name": "tr3_healthy", - "location": "file:///C:/msdata/techRep3_healthy.mzML", + "name": "techRep3_healthy", + "location": "file://C:/msdata/techRep3_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -197,8 +216,8 @@ ] }, { - "name": "tr1_diseased", - "location": "file:///C:/msdata/techRep1_diseased.mzML", + "name": "techRep1_diseased", + "location": "file://C:/msdata/techRep1_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -212,8 +231,8 @@ ] }, { - "name": "tr2_diseased", - "location": 
"file:///C:/msdata/techRep2_diseased.mzML", + "name": "techRep2_diseased", + "location": "file://C:/msdata/techRep2_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -227,8 +246,8 @@ ] }, { - "name": "tr3_diseased", - "location": "file:///C:/msdata/techRep3_diseased.mzML", + "name": "techRep3_diseased", + "location": "file://C:/msdata/techRep3_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -240,65 +259,67 @@ "value": "2012-02-03 15:00:41" } ] + }, + { + "name": "proteinGroups", + "location": "file://C:/msdata/proteinGroups.txt", + "fileFormat": { + "accession": "MS:1002130", + "name": "identification file format" + }, + "fileProperties": [ + { + "accession": "MS:1000747", + "name": "completion time", + "value": "2012-02-03 18:00:41" + } + ] } ], "analysisSoftware": [ { "accession": "MS:1001058", "name": "quality estimation by manual validation", + "description": "The quality estimation was done manually.", "version": "0", "uri": "https://dx.doi.org/10.1021/pr201071t" + }, + { + "accession": "MS:1000799", + "name": "custom unreleased software tool", + "description": "A software tool that has not yet been released. The value should describe the software. 
Please do not use this term for publicly available software - contact the PSI-MS working group in order to have another CV term added.", + "version": "0", + "uri": "https://hupo-psi.github.io/mzQC/" } ] }, "qualityMetrics": [ { - "accession": "QC:4000264", - "name": "group of runs", - "value": { - "inputfile_name": [ - "tr1_healthy", - "tr2_healthy", - "tr3_healthy", - "tr1_diseased", - "tr2_diseased", - "tr3_diseased" - ], - "group-label": [ - "healthy", - "healthy", - "healthy", - "diseased", - "diseased", - "diseased" - ] - } - }, - { - "accession": "QC:4000267", - "name": "PCA table", + "accession": "MS:4000091", + "name": "principal component analysis of MaxQuant's protein group lfq intensities", + "description": "A table with the PCA results of MaxQuant's protein group lfq intensities.", "value": { - "group-label": [ + "MS:4000086": [ "healthy", "diseased" ], - "PCA Dimension 1": [ - 47.22, - -30.22 + "MS:4000081": [ + 47.2, + -30.2 ], - "PCA Dimension 2": [ + "MS:4000082": [ 29.1, -36.5 ], - "PCA Dimension 3": [ + "MS:4000083": [ 3.8, -7.3 ], - "PCA Dimension 4": [ + "MS:4000084": [ -7.7, - 5.55 + 5.6 ], - "PCA Dimension 5": [ + "MS:4000085": [ 140.6, -64.1 ] @@ -308,15 +329,10 @@ } ], "controlledVocabularies": [ - { - "name": "Proteomics Standards Initiative Quality Control Ontology", - "uri": "https://github.com/HUPO-PSI/mzQC/blob/main/cv/qc-cv.obo", - "version": "1.0.0" - }, { "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.71/psi-ms.obo", - "version": "4.1.71" + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.130/psi-ms.obo", + "version": "4.1.130" } ] } From 49f4133fe50d02d7d0f9d4387a41aa1a367e4a8b Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 19 Mar 2024 15:30:45 +0900 Subject: [PATCH 02/12] Update set description --- .../{MultiSet_PCA.png => intro_set_pca.png} | Bin docs/pages/worked-examples/intro_run.md | 10 +- 
docs/pages/worked-examples/intro_set.md | 535 ++++-------------- .../examples/intro_set.mzQC | 6 +- 4 files changed, 134 insertions(+), 417 deletions(-) rename docs/pages/figures/{MultiSet_PCA.png => intro_set_pca.png} (100%) diff --git a/docs/pages/figures/MultiSet_PCA.png b/docs/pages/figures/intro_set_pca.png similarity index 100% rename from docs/pages/figures/MultiSet_PCA.png rename to docs/pages/figures/intro_set_pca.png diff --git a/docs/pages/worked-examples/intro_run.md b/docs/pages/worked-examples/intro_run.md index cfc92104..76c19d93 100644 --- a/docs/pages/worked-examples/intro_run.md +++ b/docs/pages/worked-examples/intro_run.md @@ -12,6 +12,7 @@ Here, we'll walk through the key components of an mzQC file, which uses a JSON-b You can explore the complete mzQC file [here](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/examples/intro_run.mzQC), to see all of the elements in their context. An mzQC file starts with the root element `mzQC`: + ``` { "mzQC": { @@ -23,6 +24,7 @@ An mzQC file starts with the root element `mzQC`: Within `mzQC`, there are three main sections: 1. **General file information:** These attributes provide essential details about the mzQC file itself. + ``` "version": "1.0.0", "creationDate": "2020-12-01T11:56:34Z", @@ -33,6 +35,7 @@ Within `mzQC`, there are three main sections: 2. **Controlled vocabulary (CV) references:** This section points to standardized vocabularies used to ensure consistent metric definitions across files. It is typically included at the end of the mzQC file. + ``` "controlledVocabularies": [ { @@ -44,6 +47,7 @@ It is typically included at the end of the mzQC file. ``` 3. **Quality metrics for the run:** This core part of the file captures the QC metrics specific to the run being described. + ``` "runQualities": [ { @@ -55,6 +59,7 @@ It is typically included at the end of the mzQC file. 
In the `runQualities` section, you may find multiple `runQuality` elements, each corresponding to a unique mass spectrometry run. For simplicity, this example only includes a single run in the mzQC file. First, this includes a `metadata` part detailing the run specifics, such as the source files and software used in analysis: + ``` "metadata": { "inputFiles": [ @@ -67,6 +72,7 @@ First, this includes a `metadata` part detailing the run specifics, such as the ``` Digging a bit deeper, for example, the `inputFiles` array describes each file contributing to the run, including details like file name, location (URI), format, and properties—all standardized using CV terms. + ``` "inputFiles": [ { @@ -101,6 +107,7 @@ Finally, the `qualityMetrics` array lists the metrics derived from the run, each Metrics can take various forms, such as single values, tuples (arrays of values), or more complex structures like matrices or tables, depending on the information being conveyed. For example, a single valued metric: + ``` { "accession": "MS:4000059", @@ -111,10 +118,11 @@ For example, a single valued metric: "accession": "UO:0000189", "name": "count unit" } -} +}, ``` And a tuple metric: + ``` { "accession": "MS:4000069", diff --git a/docs/pages/worked-examples/intro_set.md b/docs/pages/worked-examples/intro_set.md index 88ec2351..b9be2d4e 100644 --- a/docs/pages/worked-examples/intro_set.md +++ b/docs/pages/worked-examples/intro_set.md @@ -4,456 +4,165 @@ title: "Introduction to mzQC – Multiple Mass Spectrometry Runs" permalink: /examples/intro_set/ --- -Here, we describe an mzQC JSON document used to convey QC data which is computed on a set of runs, i.e. -is **only interpretable in the context of this set** (group). -Of course, QC metrics which refer to each run individually can also be stored, also in the same mzQC file -(see our example `individual-runs.mzQC.md` on how to do that), but this example is about group/set metrics. 
+This page describes how to use mzQC for analyzing groups, or "sets," of mass spectrometry runs. +This builds upon our understanding of [using mzQC for individual runs](https://hupo-psi.github.io/mzQC/examples/intro_run/), extending it to how you can analyze and represent data from multiple runs together. +Think of a "set" as a bundle of experiments that you want to examine collectively. -Find the complete example file at the bottom of this document or in the example folder. +> [!TIP] +> Sets are versatile! +> You can group runs together, but you can also group sets within other sets. +> This allows for a structured hierarchy in your analysis, like grouping technical replicates under biological ones and then comparing across conditions. + +Discover the full example of an mzQC file for a set [here](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/examples/intro_set.mzQC). + +The structure of an mzQC file for a set mirrors that for a single run, starting with the root element `mzQC`: -The basic structure of our mzQC file is identical to the `individual-runs.mzQC` example, i.e. -the documents main anchor is between the outer curly brackets: ``` -{ "mzQC": - { +{ + "mzQC": { ... 
} } ``` -Within this main anchor, there are usually the following sections: -a) general information about the file, -``` - "version": "1.0.0", - "creationDate": "2020-12-21T11:56:34", - "contactName": "Chris Bielow", - "contactAddress": "chris.bielow@bsc.fu-berlin.de", - "description": "A simple mzQC file containing information for sets of runs.", -``` +Within `mzQC`, there are three main sections: -b) reference information for controlled vocabularies (cv) at the bottom, -``` - "controlledVocabularies": [ - { - "name": "Proteomics Standards Initiative Quality Control Ontology", - "uri": "https://github.com/HUPO-PSI/qcML-development/blob/master/cv/v0_1_0/qc-cv.obo", - "version": "0.1.0" - }, - { - "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/blob/master/psi-ms.obo", - "version": "4.1.7" - } - ] -``` -and (now in addition or as replacement) to the `runQualities` of the `individual-runs.mzQC` we have -c) information about the QC metrics computed on **a set of runs**. -``` - "setQualities": [ - { - ... - } - ] -``` -In fact, `setQualities` can contain one or more `setQuality` objects, each defining a different set of runs. -E.g. if you have three technical replicates for two conditions for at total of six runs, you might want to subsume three runs into a set, one for each condition and report the total number of proteins you identified, or the percentage of total intensity attributable to contaminants). Each `setQuality` object is an element of a JSON array, thus it is not explicitly named (i.e. there is no "setQuality" key in the mzQC file). -For the purpose of this example, we will use **three** `setQuality` objects (there could be none, only one or more than two though): +1. **General file information:** These attributes provide essential details about the mzQC file itself. 
``` - the **healthy** set: tr1_healthy, tr2_healthy, tr3_healthy - the **diseased** set: tr1_diseased, tr2_diseased, tr3_diseased - the **all** set: tr1_healthy, tr2_healthy, tr3_healthy, tr1_diseased, tr2_diseased, tr3_diseased +"version": "1.0.0", +"creationDate": "2020-12-01T14:19:09Z", +"contactName": "Chris Bielow", +"contactAddress": "chris.bielow@bsc.fu-berlin.de", +"description": "A simple mzQC file containing information for a set of multiple mass spectrometry runs.", ``` -How you define (and name) each set, is up to you and depends on your experimental design and the kind of comparisons you want to make. +2. **Controlled vocabulary (CV) references:** This section points to standardized vocabularies used to ensure consistent metric definitions across files. +It is typically included at the end of the mzQC file. -A `setQuality` represents QC data that must be viewed in the context of all the runs of this set/group. I.e. the data is only valid within the context of the runs it comprises. E.g. it would be invalid to define a set of three runs and report their individual MS1 scan counts as a 3-tuple -- because this information can clearly be attributed to individual runs and thus belongs in three separate `runQuality` objects, rather than a single `setQuality`. -Similar to `runQuality`, a `setQuality` also contains `metadata` about the set of runs (its input file**s**, the software used, etc). -You can give the set a unique name using the `label` attribute. Here is how a `setQuality` object looks like: -``` - { - "metadata": { - "label": "healthy" - "inputFiles": - ... - }, - "qualityMetrics": [ - ... - ] - } -``` -The `inputFiles` consist of an array of `inputFile` objects, describing the source files with structured information about the file's name, format, location and other properties, defined via cv terms. ``` - "inputFiles": [ - { - "name": "tr1_healthy", - "location": "c:\msdata\techRep1_healthy.mzML", - ... 
- }, - { - "name": "tr2_healthy", - "location": "c:\msdata\techRep2_healthy.mzML", - ... - }, - { - "name": "tr3_healthy", - "location": "c:\msdata\techRep3_healthy.mzML", - ... - } - ] +"controlledVocabularies": [ + { + "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.130/psi-ms.obo", + "version": "4.1.130" + } +] ``` -The `inputFile` object is only sketched here. It can contain a lot more information, such as file format and further properties. See the full example below or `individual-runs.mzQC` for details. -In `qualityMetrics`, we will store the actual QC information for a particular `setQuality`. Each `qualityMetric` has an `accession` and the corresponding `name` as defined by the QC controlled vocabulary (see `qc-cv.obo`). They should be represented exactly as stated in the .obo file. The `value` carries the actual information and can be either a single value, a tuple of values, a matrix or table. Below, we will look at single values and tables. +3. **Quality metrics for the set:** This core part of the file captures the QC metrics specific to the set being described. -Lets start with our first metric `Protein contaminant intensity ratio`. It describes the relative intensity (in [0, 1]) of all contaminant proteins (from all runs in the set) -- the higher the value the more contaminants are present in the runs of the set. ``` - "accession": "QC:0000000", - "name": "Protein contaminant intensity ratio", - "value": 0.25 +"setQualities": [ + { + ... + } +] ``` -We compute this metric for each set, in our case for the `healthy` as well as the `diseased` set, but not for the `all` set (because we want to keep the example small). But in general, what metrics you compute is up to you. +Each element within `setQualities` defines a distinct set, enabling the comparison of, say, different experimental conditions or replicate groups. 
+ +A set's QC data is contextual—it makes sense within the bounds of the group. +For instance, it wouldn't be right to lump individual run metrics like MS1 scan counts for several runs into a single set metric; those belong to individual run analyses. + +Imagine you have several technical replicates from an experiment with two conditions, and you're interested in grouping these by technical replicates. +You might end up with sets for "healthy" and "diseased" conditions, plus a combined "all" set for overarching analyses. +As an example, we'll use three different groupings: +1. The "healthy" set, consisting of technical replicates "techRep1_healthy", "techRep2_healthy", "techRep3_healthy". +2. The "diseased" set, consisting of technical replicates "techRep1_diseased", "techRep2_diseased", "techRep3_diseased". +3. The "all" set, combining both the "healthy" and "diseased" set. -Our second example is a principal component analysis (PCA) result matrix. -The `setQuality` where this PCA metric will be stored, references **all** runs as input files. -The input table for a PCA computation can be found, for example, in MaxQuant's proteinGroups.txt output file. To stick with this example, the table in proteinGroups.txt has rows (proteins) and columns (groups, e.g. `healthy` or `diseased`), and the values in the table are protein abundances. Thus, MaxQuant has already aggregated the data from rawfiles(=runs) belonging to a certain group for us (e.g. by averaging the protein abundances). Now your QC software can derive a new table using PCA, where each group is represented by PCA coordinates. +These labels are important, acting as descriptive tags for each set, guiding your analysis. +Therefore, it is recommended to use a descriptive label, for example based on the experimental design or the kind of comparisons you want to make. 
-First, let's see what the PCA plot would look like: -![ Typically, the first two PCA dimensions are plotted, as shown here: Each data point in the plot represents one set(group), e.g. `diseased` or `healthy`.](../../pages/figures/MultiSet_PCA.png) -Now, let's look at the mzQC data which allows to create this plot: We use two separate metrics. One named `group of runs` to associate runs to groups, and secondly a `PCA table` metric to store the PCA data (the first 5 principal components for each group). ``` - "setQualities": [ - ..., - { - ..., - - "qualityMetrics": [ - { - "accession": "QC:4000264", - "name": "group of runs", - "value": { - "inputfile_name": ["tr1_healthy", "tr2_healthy", "tr3_healthy" , "tr1_diseased", "tr2_diseased", "tr3_diseased"], - "group-label": ["healthy" , "healthy" , "healthy" , "diseased" , "diseased" , "diseased"] - } - }, - { - "accession": "QC:4000267", - "name": "PCA table", - "value": { - "group-label": ["healthy", "diseased"], - "PCA Dimension 1": [47.22, -30.22], - "PCA Dimension 2": [29.1, -36.5], - "PCA Dimension 3": [3.8, -7.3], - "PCA Dimension 4": [-7.7, 5.55], - "PCA Dimension 5": [140.6, -64.1] - } - } - } - ] - +"metadata": { + "label": "healthy", + "inputFiles": [ + ... + ] +}, +"qualityMetrics": [ + ... ] ``` -Note: the `group of runs` metric can be defined only once per `setQuality`, but can be referenced in many metrics (here, for our `PCA table`) in that context. +`inputFiles` lists the specific files contributing to a set, with all the technical details neatly described using CV terms. -If you look closely, we somewhat defined the group `healthy` twice. Once as an individual `setQuality` and once via the `group of runs` qualityMetric in the `all` set. -There is no easy way around this. If we were to omit the `all` set, we'd need to distribute the columns of the PCA table metric into separate `setQuality` objects (and whoever wants to plot it, needs to puzzle it back together; not ideal). 
-On the other hand, ommitting the `healthy`/`diseased` setQualities is not sensible either, because then there would be only the `all` setQuality where all data for different subsets would need to reside. +``` +"inputFiles": [ + { + "name": "techRep1_healthy", + "location": "file://C:/msdata/techRep1_healthy.mzML", + ... + }, + { + "name": "techRep2_healthy", + "location": "file://C:/msdata/techRep2_healthy.mzML", + ... + }, + { + "name": "techRep3_healthy", + "location": "file://C:/msdata/techRep3_healthy.mzML", + ... + } +], +``` +Let's dive into an example metric, like the "protein contaminant intensity ratio," indicating how much of your sample is taken up by known contaminants. A higher value suggests more contamination: +``` +{ + "accession": "MS:4000XXX", + "name": "protein contaminant intensity ratio", + "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "value": 0.25, + "unit": { + "accession": "UO:0000190", + "name": "ratio" + } +} +``` + +For complex analyses, such as comparing protein abundances between healthy and diseased states, we might look at a PCA (principal component analysis). +mzQC can store PCA results, capturing the variation between these two states. + +First, let's have a look at what the PCA plot would look like, plotting the first two principal components: +![PCA plot of the healthy vs diseased samples.](../../pages/figures/intro_set_pca.png) +Next, we'll look at how mzQC can encapsulate such analysis, storing the first five principal components as a table metric, referenced by the previously defined set labels. 
-### This is the mzQC file once again, in full: ``` { - "mzQC": { - "version": "1.0.0", - "creationDate": "2020-12-01T14:19:09", - "contactName": "Chris Bielow", - "contactAddress": "chris.bielow@bsc.fu-berlin.de", - "description": "A simple mzQC file containing information for sets of runs.", - "setQualities": [ - { - "metadata": { - "label": "healthy", - "inputFiles": [ - { - "name": "tr1_healthy", - "location": "c:\\msdata\\techRep1_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 11:00:41" - } - ] - }, - { - "name": "tr2_healthy", - "location": "c:\\msdata\\techRep2_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 13:00:41" - } - ] - }, - { - "name": "tr3_healthy", - "location": "c:\\msdata\\techRep3_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 14:00:41" - } - ] - } - ], - "analysisSoftware": [ - { - "accession": "MS:1001058", - "name": "quality estimation by manual validation", - "version": "0", - "uri": "https://dx.doi.org/10.1021/pr201071t" - } - ] - }, - "qualityMetrics": [ - { - "accession": "QC:0000000", - "name": "Protein contaminant intensity ratio", - "value": "0.25" - } - ] - }, - - { - "metadata": { - "label": "diseased", - "inputFiles": [ - { - "name": "tr1_diseased", - "location": "c:\\msdata\\techRep1_diseased.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 12:00:41" - } - ] - }, - { - "name": "tr2_diseased", - "location": "c:\\msdata\\techRep2_diseased.mzML", - 
"fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 14:00:41" - } - ] - }, - { - "name": "tr3_diseased", - "location": "c:\\msdata\\techRep3_diseased.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 15:00:41" - } - ] - } - ], - "analysisSoftware": [ - { - "accession": "MS:1001058", - "name": "quality estimation by manual validation", - "version": "0", - "uri": "https://dx.doi.org/10.1021/pr201071t" - } - ] - }, - "qualityMetrics": [ - { - "accession": "QC:0000000", - "name": "Protein contaminant intensity ratio", - "value": "0.31" - } - ] - }, - - { - "metadata": { - "label": "all", - "inputFiles": [ - { - "name": "tr1_healthy", - "location": "c:\\msdata\\techRep1_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 11:00:41" - } - ] - }, - { - "name": "tr2_healthy", - "location": "c:\\msdata\\techRep2_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 13:00:41" - } - ] - }, - { - "name": "tr3_healthy", - "location": "c:\\msdata\\techRep3_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 14:00:41" - } - ] - }, - { - "name": "tr1_diseased", - "location": "c:\\msdata\\techRep1_diseased.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": 
"2012-02-03 12:00:41" - } - ] - }, - { - "name": "tr2_diseased", - "location": "c:\\msdata\\techRep2_diseased.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 14:00:41" - } - ] - }, - { - "name": "tr3_diseased", - "location": "c:\\msdata\\techRep3_diseased.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 15:00:41" - } - ] - } - ], - "analysisSoftware": [ - { - "accession": "MS:1001058", - "name": "quality estimation by manual validation", - "version": "0", - "uri": "https://dx.doi.org/10.1021/pr201071t" - } - ] - }, - "qualityMetrics": [ - { - "accession": "QC:4000264", - "name": "group of runs", - "value": { - "inputfile_name": ["tr1_healthy", "tr2_healthy", "tr3_healthy" , "tr1_diseased", "tr2_diseased", "tr3_diseased"], - "group-label": ["healthy" , "healthy" , "healthy" , "diseased" , "diseased" , "diseased"] - } - }, - { - "accession": "QC:4000267", - "name": "PCA table", - "value": { - "group-label": ["healthy", "diseased"], - "PCA Dimension 1": [47.22, -30.22], - "PCA Dimension 2": [29.1, -36.5], - "PCA Dimension 3": [3.8, -7.3], - "PCA Dimension 4": [-7.7, 5.55], - "PCA Dimension 5": [140.6, -64.1] - } - } - ] - } - + "accession": "MS:4000090", + "name": "principal component analysis of MaxQuant's protein group raw intensities", + "description": "A table with the PCA results of MaxQuant's protein group raw intensities.", + "value": { + "MS:4000086": [ + "healthy", + "diseased" + ], + "MS:4000081": [ + 47.2, + -30.2 + ], + "MS:4000082": [ + 29.1, + -36.5 + ], + "MS:4000083": [ + 3.8, + -7.3 + ], + "MS:4000084": [ + -7.7, + 5.6 ], - "controlledVocabularies": [ - { - "name": "Proteomics Standards Initiative Quality Control Ontology", - "uri": 
"https://github.com/HUPO-PSI/qcML-development/blob/master/cv/v0_1_0/qc-cv.obo", - "version": "0.1.0" - }, - { - "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/blob/master/psi-ms.obo", - "version": "4.1.7" - } + "MS:4000085": [ + 140.6, + -64.1 ] } } ``` -### This is the mzQC file once again, in full: -**[sets-of-runs.mzQC](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/draft_v1/examples/set-of-runs.mzQC)** diff --git a/specification_documents/examples/intro_set.mzQC b/specification_documents/examples/intro_set.mzQC index 4903d5ae..799a7039 100644 --- a/specification_documents/examples/intro_set.mzQC +++ b/specification_documents/examples/intro_set.mzQC @@ -295,9 +295,9 @@ }, "qualityMetrics": [ { - "accession": "MS:4000091", - "name": "principal component analysis of MaxQuant's protein group lfq intensities", - "description": "A table with the PCA results of MaxQuant's protein group lfq intensities.", + "accession": "MS:4000090", + "name": "principal component analysis of MaxQuant's protein group raw intensities", + "description": "A table with the PCA results of MaxQuant's protein group raw intensities.", "value": { "MS:4000086": [ "healthy", From 509f9c0261347b0204b830bfe8e192c5f7e8c0dc Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Wed, 20 Mar 2024 09:33:00 +0900 Subject: [PATCH 03/12] Add temporary accession number --- docs/pages/worked-examples/intro_set.md | 12 +++++------ .../examples/intro_set.mzQC | 20 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/pages/worked-examples/intro_set.md b/docs/pages/worked-examples/intro_set.md index b9be2d4e..b117a61e 100644 --- a/docs/pages/worked-examples/intro_set.md +++ b/docs/pages/worked-examples/intro_set.md @@ -44,8 +44,8 @@ It is typically included at the end of the mzQC file. 
"controlledVocabularies": [ { "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.130/psi-ms.obo", - "version": "4.1.130" + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.144/psi-ms.obo", + "version": "4.1.144" } ] ``` @@ -113,13 +113,13 @@ Let's dive into an example metric, like the "protein contaminant intensity ratio ``` { - "accession": "MS:4000XXX", + "accession": "MS:4000177", "name": "protein contaminant intensity ratio", - "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", "value": 0.25, "unit": { - "accession": "UO:0000190", - "name": "ratio" + "accession": "UO:0000191", + "name": "fraction" } } ``` diff --git a/specification_documents/examples/intro_set.mzQC b/specification_documents/examples/intro_set.mzQC index 799a7039..90920a98 100644 --- a/specification_documents/examples/intro_set.mzQC +++ b/specification_documents/examples/intro_set.mzQC @@ -75,13 +75,13 @@ }, "qualityMetrics": [ { - "accession": "MS:4000XXX", + "accession": "MS:4000177", "name": "protein contaminant intensity ratio", - "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. 
using the cRAP contaminant database).", "value": 0.25, "unit": { - "accession": "UO:0000190", - "name": "ratio" + "accession": "UO:0000191", + "name": "fraction" } } ] @@ -155,13 +155,13 @@ }, "qualityMetrics": [ { - "accession": "MS:4000XXX", + "accession": "MS:4000177", "name": "protein contaminant intensity ratio", - "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", "value": 0.31, "unit": { - "accession": "UO:0000190", - "name": "ratio" + "accession": "UO:0000191", + "name": "fraction" } } ] @@ -331,8 +331,8 @@ "controlledVocabularies": [ { "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.130/psi-ms.obo", - "version": "4.1.130" + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.144/psi-ms.obo", + "version": "4.1.144" } ] } From 7d646a594d16815ee51c41580b42b3189f013bf4 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 9 Jul 2024 10:33:58 +0200 Subject: [PATCH 04/12] Update example --- docs/pages/worked-examples/intro_set.md | 72 +++++++++++++++---- .../examples/intro_set.mzQC | 8 +-- 2 files changed, 63 insertions(+), 17 deletions(-) diff --git a/docs/pages/worked-examples/intro_set.md b/docs/pages/worked-examples/intro_set.md index b117a61e..70be2920 100644 --- a/docs/pages/worked-examples/intro_set.md +++ b/docs/pages/worked-examples/intro_set.md @@ -4,9 +4,9 @@ title: "Introduction to mzQC – Multiple Mass Spectrometry Runs" permalink: /examples/intro_set/ --- -This page describes how to use mzQC for analyzing groups, or "sets," of mass spectrometry runs. +In mzQC, collections of mass spectrometry runs are grouped into what are termed "sets." 
This builds upon our understanding of [using mzQC for individual runs](https://hupo-psi.github.io/mzQC/examples/intro_run/), extending it to how you can analyze and represent data from multiple runs together. -Think of a "set" as a bundle of experiments that you want to examine collectively. +Think of a "set" as a bundle of runs that you want to examine collectively, such as technical and biological replicates. > [!TIP] > Sets are versatile! @@ -64,15 +64,17 @@ Each element within `setQualities` defines a distinct set, enabling the comparis A set's QC data is contextual—it makes sense within the bounds of the group. For instance, it wouldn't be right to lump individual run metrics like MS1 scan counts for several runs into a single set metric; those belong to individual run analyses. +Instead, set metrics reflect the collective characteristics of all runs within the set, offering insights into the overall experimental quality. Imagine you have several technical replicates from an experiment with two conditions, and you're interested in grouping these by technical replicates. You might end up with sets for "healthy" and "diseased" conditions, plus a combined "all" set for overarching analyses. As an example, we'll use three different groupings: + 1. The "healthy" set, consisting of technical replicates "techRep1_healthy", "techRep2_healthy", "techRep3_healthy". 2. The "diseased" set, consisting of technical replicates "techRep1_diseased", "techRep2_diseased", "techRep3_diseased". 3. The "all" set, combining both the "healthy" and "diseased" set. -These labels are important, acting as descriptive tags for each set, guiding your analysis. +These labels are important, acting as tags for each set, guiding your analysis. Therefore, it is recommended to use a descriptive label, for example based on the experimental design or the kind of comparisons you want to make. 
``` @@ -109,24 +111,58 @@ Therefore, it is recommended to use a descriptive label, for example based on th ], ``` -Let's dive into an example metric, like the "protein contaminant intensity ratio," indicating how much of your sample is taken up by known contaminants. A higher value suggests more contamination: +Let's dive into an example metric, like the "protein contaminant intensity ratio." +This metric quantifies the abundance arising from known contaminant proteins (like keratins from skin or BSA from sample buffers) compared to the total abundance across all proteins in the sample. +High levels of contaminants can indicate issues with sample preparation or handling, leading to potential biases in the data analysis. ``` { - "accession": "MS:4000177", - "name": "protein contaminant intensity ratio", - "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", - "value": 0.25, - "unit": { - "accession": "UO:0000191", - "name": "fraction" - } + "metadata": { + "label": "healthy", + ... + }, + "qualityMetrics": [ + { + "accession": "MS:4000177", + "name": "protein contaminant intensity ratio", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", + "value": 0.25, + "unit": { + "accession": "UO:0000191", + "name": "fraction" + } + } + ] +}, +{ + "metadata": { + "label": "diseased", + ... + }, + "qualityMetrics": [ + { + "accession": "MS:4000177", + "name": "protein contaminant intensity ratio", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. 
using the cRAP contaminant database).", + "value": 0.31, + "unit": { + "accession": "UO:0000191", + "name": "fraction" + } + } + ] } ``` -For complex analyses, such as comparing protein abundances between healthy and diseased states, we might look at a PCA (principal component analysis). +While this metric can be calculated for each run individually, here we have aggregated that information across both the "healthy" and "diseased" sets instead. + +For our second example, we'll use the "all" set that combines the previous "healthy" and "diseased" sets. +To compare protein abundances between healthy and diseased states, we might look at a PCA (principal component analysis). mzQC can store PCA results, capturing the variation between these two states. +For this we extracted protein abundances from the `proteinGroups.txt` file specified as an input file to the "all" set. +This file is produced by MaxQuant and contains quantified protein intensities along with other identification information for each protein group detected in the experiment. + First, let's have a look at what the PCA plot would look like, plotting the first two principal components: ![PCA plot of the healthy vs diseased samples.](../../pages/figures/intro_set_pca.png) @@ -166,3 +202,13 @@ Next, we'll look at how mzQC can encapsulate such analysis, storing the the firs } } ``` + +Note how the principal components are represented as columns in a table, with each column defined by a CV term. +Additionally, the label is represented by CV term `MS:4000086`, in this case referring to the previous "healthy" and "diseased" sets. +This label can refer to any input files or metadata labels defined in the same mzQC file. +Consequently, we could also have performed the PCA analysis on each input file separately, in which case the labels would have been the names of the individual input files ("techRep1_healthy", "techRep2_healthy", ..., "techRep3_diseased"). 
+Thus, the table metric can have a flexible number of rows, based on the input of this set and the grouping level used. + +> [!WARNING] +> It would not have been valid to perform a PCA on only the three healthy samples or only the three diseased samples. +> As mentioned previously, QC metrics in sets need to relate to _all_ elements in the set, and the current set includes both the healthy and diseased subsets. diff --git a/specification_documents/examples/intro_set.mzQC b/specification_documents/examples/intro_set.mzQC index 90920a98..aafc985b 100644 --- a/specification_documents/examples/intro_set.mzQC +++ b/specification_documents/examples/intro_set.mzQC @@ -76,7 +76,7 @@ "qualityMetrics": [ { "accession": "MS:4000177", - "name": "protein contaminant intensity ratio", + "name": "contaminant protein abundance fraction", "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", "value": 0.25, "unit": { @@ -156,7 +156,7 @@ "qualityMetrics": [ { "accession": "MS:4000177", - "name": "protein contaminant intensity ratio", + "name": "contaminant protein abundance fraction", "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. 
using the cRAP contaminant database).", "value": 0.31, "unit": { @@ -331,8 +331,8 @@ "controlledVocabularies": [ { "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.144/psi-ms.obo", - "version": "4.1.144" + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.157/psi-ms.obo", + "version": "4.1.157" } ] } From 0944fe17edc7056ba397dc550f220c6e898ac16a Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 9 Jul 2024 11:53:26 +0200 Subject: [PATCH 05/12] Update mzML example --- docs/pages/examples.md | 9 +- .../pages/worked-examples/adv_mzqc_in_mzml.md | 73 ++++++++++++++++ .../worked-examples/mzml-mzqc-example.md | 87 ------------------- ...zqc-example.mzML => adv_mzqc_in_mzml.mzML} | 18 ++-- 4 files changed, 88 insertions(+), 99 deletions(-) create mode 100644 docs/pages/worked-examples/adv_mzqc_in_mzml.md delete mode 100644 docs/pages/worked-examples/mzml-mzqc-example.md rename specification_documents/examples/{mzml-mzqc-example.mzML => adv_mzqc_in_mzml.mzML} (99%) diff --git a/docs/pages/examples.md b/docs/pages/examples.md index 6883446c..0b4d73b5 100644 --- a/docs/pages/examples.md +++ b/docs/pages/examples.md @@ -4,11 +4,14 @@ title: "mzQC Examples" permalink: /examples/ --- -Here are a number of worked examples, that, each for its own use-case, go step-by-step through the different parts of a mzQC. 
+The following use cases provide several hands-on examples of how mzQC files are structured and can be used: - [Representing QC data for an individual mass spectrometry run](intro_run/) - [Deriving QC data from multiple related mass spectrometry runs](intro_set/) - [QC sample mzQC](QC2-sample-example/) -- [in mzML](mzml-mzqc-example/) -- [Using USI with mzQC](USI-example/) - [Batch correction](metabo-batches/) + +Additionally, for more advanced usage, mzQC can closely interoperate with several other file formats developed by the Proteomics Standards Initiative: + +- [Using USI with mzQC](USI-example/) +- [Incorporating QC metrics in mzML files](adv_mzqc_in_mzml/) diff --git a/docs/pages/worked-examples/adv_mzqc_in_mzml.md b/docs/pages/worked-examples/adv_mzqc_in_mzml.md new file mode 100644 index 00000000..d8da875c --- /dev/null +++ b/docs/pages/worked-examples/adv_mzqc_in_mzml.md @@ -0,0 +1,73 @@ +--- +layout: page +title: "Incorporating QC Metrics in mzML Files" +permalink: /examples/adv_mzqc_in_mzml/ +--- + +While QC metrics in the PSI-MS controlled vocabulary are primarily intended for use in mzQC files, they can also be embedded directly within other file formats developed by the Proteomics Standards Initiative, such as [mzML](https://github.com/HUPO-PSI/mzML) and [mzIdentML](https://github.com/HUPO-PSI/mzIdentML) files. +This integration is particularly useful when it's preferred to store a limited set of QC metrics alongside the data they describe, thereby enhancing data integrity and accessibility. + +You can view a comprehensive example of an mzML file incorporating QC metrics [here](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/examples/adv_mzqc_in_mzml.mzML). +Below, we detail the steps and elements involved in this process. + +1. **Source file specification** + +Define the source of the QC metrics using a `sourceFile` element. 
+This specifies the mzQC file as an input file, similarly to how other input files are handled within mzML: + +``` + + + +``` + +2. **Software and data processing** + +Document the software and data processing steps utilized to generate the mzQC file and compute the QC metrics: + +``` + + + +``` + +And: + +``` + + + + + +``` + +3. **Inclusion of QC metrics** + +Include the QC metrics at appropriate levels within the mzML structure: + +- **Run-level metrics** + +Metrics that relate to all spectra in the file are embedded at the `run` level using a `cvParam`: + +``` + + + ... + +``` + +- **Individual spectrum metrics** + +For metrics that relate to individual spectra, include these metrics at the `spectrum` level using a `cvParam`: + +``` + + ... + + ... + +``` + +Repeat for each spectrum as necessary, adjusting the spectrum ID and corresponding values. + +Thus, the key insight for embedding QC metrics in alternative file formats is that because they are backed by terms in the PSI-MS controlled vocabulary, they can be directly included using the respective functionalities for CV terms, such as `cvParam`. diff --git a/docs/pages/worked-examples/mzml-mzqc-example.md b/docs/pages/worked-examples/mzml-mzqc-example.md deleted file mode 100644 index 6a6ba29f..00000000 --- a/docs/pages/worked-examples/mzml-mzqc-example.md +++ /dev/null @@ -1,87 +0,0 @@ ---- -layout: page -title: "Example of mzQC metrics in mzML" -permalink: /examples/mzml-mzqc-example/ ---- - -QC metrics from _mzQC_ can also be incorporated into other PSI formats, like _mzML_ and _mzIdentML_, if it is preferable to keep all information in one file. -The following describes a tiny _mzML_, for which a QC tool calculated id-free QC metrics for all individual spectra of the run and a set of charge distribution metrics. -Due to the shared design properties of _mzQC_, metrics can be directly represented as XML `` elements, however their location bound to mzML schema constraints. 
- -The first addition to the _mzML_ is the source from which the metrics came from, for reference to more QC information on the run and documentation. -``` - - - -``` -Next, the metrics computed for the whole run can be be deposited as a child of the `` element itself. Metric objects from mzQC can be directly translated into `` elements. -``` - - - -``` -Please note that for example brevity purposes the _mzML_ was truncated to 3 spectra, however the metrics were calculated on the whole to reflect realistic values. Also, the NativeID format was truncated to the index of `spectrum=` bit. - -Next, the spectra metrics can be put as children of `` elements, in the same fashion as for the run metrics. -``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - WCz3rG/BckDqpo+35sJyQE1J4Cc+w3JAzDyhysXEckCitDODWsVyQOu042tD0nJAoiDOt3fTckDdtIVDYtRyQNSyBfjM33JAss+e6yTgckBGwpA0uOByQLX5rnNO4XJAUhwGHlHickAA/uSNj+JyQOp/TXok43JAfnKJ7snxckBdMMTdffJyQInw7FO383JA942OrqH0ckDkroJ5wf9yQOGVLX5kAHNAyBdr3PoAc0AsP5cpzwJzQH+W4Nn6A3NA4KDR50wNc0BBJnBrCRJzQMhy89O8EnNAA4HnMPkTc0DMyXmnThRzQIjL8eF5H3NAs1lC6KMgc0DkNauWOSFzQC+RxY/KInNAnuTtGjskc0CSoTZhzyRzQBzsWZxGMXNATQIOz6Ezc0BQR1bMOTRzQMVRKDDbNHNABej+Abk/c0BgO7Hz40BzQPDBIlJ5QXNAx93i9XpEc0DKA4MRFktzQC4GikqkU3NAalcXnoFUc0ADETkZDVVzQKp4LDA7W3NAzXDVjIlgc0CylY18RWNzQPN7YCNRcnNArDyBouVyc0Akr1XQt3RzQLxed6ZMdXNAJ2zwrmp2c0AQbl5JeYBzQEqNPR+lgHNArW6Vrg2Bc0AzZ11fhpBzQLv575hNknNADMD2lc6Tc0AaHVYFY5RzQGYaar/3lHNAu1AmQ7mgc0A+yY+eT6FzQIhdJOByoXNAqJtmpFuic0CqlhXN5qJzQEJQa2MGpXNAOeX1aTSxc0CKWJUYgbFzQGTOTE64s3NAb2k1wQ20c0CbfOcq+cBzQPwWEC89wnNA3J+h4tDCc0B2c5zBJsNzQHeOg0zX0XNAVpkkrDTTc0B8bAYo99NzQLB4EZji1HNA7kcidqLgc0Be+mq/OOFzQDyqDkFk4XNAQlhFlzXjc0AGrX3FNfRzQATXfM+O9HNA0q+4ouQAdEBzoM8meQF0QOoq6kOkAXRA3fqSKuQDdECEpScBiBF0QFfd2rZvEnRAolR4AU0TdEBeU/tJeRR0QGLL01F7HXRAp2G4v0sgdEDCY20mjSB0QBTrB7a5IXRArLUW67okdEBqOQru2DJ0QHz1aSRTQHRAbmBIjXpDdEBcyUeP9lR0QFQT27iLVXRAfNzuGrhgdECyf1ZtCWJ0QGM3bnK+YnRAYtAy3k1kdEB79QdQjWh0QLzrZRsjcHRAekkegUFxdEDFrm9QOXN0QOjpEDY3dXRAbO7dwPiAdE
A+VFt2JYF0QF79wNhOgXRAUvpvq5KCdEA6LCK/FpB0QBYyhcrPoHRApQzDPDehdEBISWquu6J0QOZZyejRonRAkC1TXw+jdEC/UnslCbB0QGyerZbcsnRABhOAdeXAdEDFoEKsecF0QCPstVXswXRAltEjt3nEdEDyijcZuMR0QMqpzeidyHRAwAMSuv7OdEAU+QAU49N0QBK6z0TO1HRA2nD7diThdEBdjaP9t+F0QNL8885O5XRAzG68Po3zdEAuzHjbePR0QILWIcGkAHVAQkOz384AdUBwJBBGZAF1QF7oxYUFEnVA8C6UUjwTdUBxagGCJRR1QDpHsVq6I3VAF8gujs01dUDOCvK3gFF1QIaqgOxNVHVAJU0UOHdVdUD2o6BlOGF1QBdQYlQAcHVAtUiAFHhydUCXCguajXR1QPG3igx6gXVAxvPmBq6BdUBypc61qIN1QABtU+SahHVA6rnDl7yRdUBuqToLb5J1QMa9kUU9k3VAhzv6DiOhdUDq0v8luaF1QNbIUsViwXVAVoHaSPjBdUB8HQPMzdN1QJ6jQu/k33VAoHzVngjidUCkpRlWSAJ2QHhOjIXSA3ZAanuIHlUSdkANQR2lTRR2QNUlN90LFnZAU17cXIgidkDFcA4unCN2QMhuDDyRJHZAbKizD0ordkC5aB+7HzF2QGbMDSckM3ZAPn3xDlIzdkA2JciglzN2QC3GU5udNHZA3svMrx1BdkAXeXqQeUR2QCDPuSn8TnZA7AgXCMtQdkC0hYgAElF2QBynKYHNVHZADlmLCRBhdkCFXuPSG2F2QNcgd1HaZHZAo5A9qo6CdkAR0lpFvJN2QHNDNGG3sHZAiXUR+MOydkBWTnOWDbR2QO71RS6wtHZAzqkj1L/AdkBE6Ixz5MF2QLvVQjwaxHZAcH/Av87OdkBSAr8s9OF2QFQWpZqR83ZAAFjo90/0dkAFafXzNAJ3QPA6SncBEndAil+IUT0Sd0B+XCNBDiJ3QH5cm1W6JHdAAJCrGg01d0AgSCatnUF3QLg7yWQbRXdAUEBHi0tRd0B47pUck1F3QFg57YUpVXdAQbM+4Udhd0Dbhh6Jj2F3QMZ7NjQob3dAvIL77490d0BY75HNeIJ3QHB03L8JkndAO/3WIevCd0BgCJO2i9Z3QGG241dQ8ndATq7mFU4VeEAoAtuobiJ4QDkGphOFI3hAuK1deg1CeEBMhMa4ekV4QCg9YY7LUXhArBErPIlVeEBnXklU+mB4QFAs7Uy3YXhA9HvKv7dieECyF52FOnR4QPZubY9ndHhA8DGUr4t0eEBuLMkC03R4QGDbNxT+dHhAYtCVNaWBeEBizxsHmoR4QMAwayBQj3hAbEs/NZiReED6xcoXp5R4QM9m1w+6pHhASEQ6Dc7AeEDIRpwh9sF4QAypoWq11HhAANHLOQL0eEB/U9+xjPV4QMFSWRYRBHlA4gDNKpoFeUC7Wy3xwCN5QMCOVz27M3lACdKf84tCeUBUGSmX4VB5QJS/9ec3VHlATpxzLM1UeUAuAJW76mB5QHKJqxPbZHlAH2qjVUNzeUAS+c7s9YF5QFhB9ta/snlAcmzZYULUeUDqXfSizNV5QLWK2IRR5HlATVvnaPgFekD+kLnN3CZ6QEpkSFkMNXpALlpbjBpFekDag3dZLVB6QP1oUFaCUnpAml12YStVekAWKlYWOG16QIzUWRmLcnpAVmlNgQmdekA6+8yKYqJ6QCdaF0eqp3pArv2hXTOsekDAgrUZBq16QBI+g1UNtnpA/VRVWCzDekDGwb0F2NN6QGmoGV155HpA8oL4WjXuekAq6YJkKg57QKFOscFMFXtAgpnq2lole0BWgSx+Hy57QG6i6qJUMHtA4inJgMAze0D8joS9JJV7QJLh+616xntAgtrrHK7Te0BGJ8nWt9t7QL7cwylF4ntALrpHbrnje0Ak35E2/uN7QF7noQWo53tAnpNappfxe0BO3Bg9jPV7QFAv4i
yPAXxAarAQqdkBfEDUCC7j8wF8QIjFdRWZBXxABHLuttERfECQ/b5jyyF8QOoLaVHMQXxAaoAd39pRfEAi/sZ4DmJ8QDahp/aTm3xAEm140sOyfEDCbzSrD7Z8QGWbdMjM1XxA3qsV0VfifEClN6NtFvJ8QK7G+Z5V8nxAanGjYAACfUASHp+ASwJ9QJIHu6ZFEn1AtoxqsDwifUCXXx7YoTF9QPxCiKSgU31A/FSO+SzDfUAWNHd51+J9QLK8T2h6831AaXD2y4/1fUBT9Ek/jlV+QFsxQ6KZZX5Avlhs6dPvfkCIFNDxAxt/QFeQmTARM39A37p88FA4f0BWOGmdqj1/QH/YCjUDQ39AN03r11pIf0D+bujCt3F/QEA+vl6xgX9Avl0drqODf0CgRmjfYpF/QN7yq/DdkX9AdzLI5paTf0DSpP6v5Zl/QDzflyBeoX9AtFzgU86hf0BLKH6pVLF/QEeBq+mQs39AcwnRDJbdf0B2ESVQ7+J/QJK5jnhC6H9Aw3lPI57tf0C6+7u6gxGAQDrzMpEwFIBAvoqhRNUWgECyX/l95S+AQAJAdSXXNoBAot5+k+s3gED4EUGEgjmAQPIJiKwpPIBAjMKVO9Y+gEAMpBgQ90iAQEPHhRnnUIBAxtjeAhRRgEDhVu/k5ViAQMijus0OWYBASPYvLylvgEBUdfaTDnqAQBQWG40TfoBAFuZz9hyPgECsULnubJGAQCfO4HoZlIBAwRq4uMOWgEAW4xmwbJmAQMq3HbQYnIBAI3Uu14apgEDiaaNgUsGAQCIKm1NTyYBAsMeJVkzRgEBCRgKYYNGAQJz1AbxK2YBAMlgZ55jZgEC299d5RdyAQAhwZRry3oBAAZgqPkXhgECm/z7EmeGAQPcV10H26IBARmaiWUrvgECfcfm9PPOAQJp9qhhA/4BAArtdoT0HgUCcgt1BPw+BQByecoIeHoFAFtU6CzQfgUCAjONlISKBQB5c0iE2J4FANe7kV7wxgUDq7FRvVY+BQGcZ/flVl4FASNqJU+ehgUBwup5abqiBQI7J/z3sqYFAFHl6c0qvgUBMSDjFfdOBQHDJd3TYGIJADo16tNYggkCUIHrdzyiCQNHIkAekMIJAi/x8bsh5gkCOOJXvy32CQJjG0+3AgYJASDI8XdKBgkBwfZmfxoWCQCCWfT5DiYJAJJI4v0ORgkA6DOBVPJmCQOLnNrJUmYJABofqBw+hgkBj+hfVOqGCQNfiL7cNqYJAyI5w4TOpgkA3OhXJB7GCQIpazjSu+YJA6l8/cHkRg0Bg9A0mehmDQAZAQZByIYNAGJ95wnEpg0A6P7CcaTGDQDy+W6fCVYNAruu6HkDSg0C9P2+W/2iEQDmxQSv+cIRAPMT6U/h4hEBRt8eCatmEQD0cEFFq4YRAoBE0MmPphEDmIN68NvGEQGJKjqZk8YRAL+GAkTT5hEAMm5RKWvmEQAps2RIxAYVAtHDac6BhhUDF7I8FoWmFQCZfYMiYcYVAWLxLiZh5hUBaDlTYk4GFQLKM98cXeoZAssMQWye5hkDg04zVkCmHQNR0nmmRMYdAaKdVzIo5h0A0NQoLXUGHQHuNvPeKQYdAMjFGQxpeh0B8ZnwSHmKHQMxGunAhZodAVP4Fcjlqh0AQPlIDZa6HQNBmFrrHsYdAz+Am22iyh0AQZofhx7mHQPRRB/bAwYdAuppRtT5OiEA0v5eWQlKIQPaySh91rYhAvtv7nCmwiECHo5X4G9aIQA== - - - - - - 
a3BWReO5k0SWhb1EWvvWRE9oi0RWsB5HY6+0RNFygESwlQJFDlp/RJLrH0UkS4dEx59sRQuju0RV8pBEKkqYRfgjiERoJs5EuoiWRAKoD0X1cJdEs5XlRPOcJUWSxR1HCAqcRPKLnUSKg0lGVwuaRaObmkQ3gtlEdnUbRb/8+UaEGM9EIHjERFjTq0V05UBFWLm8RA0ieET9oapEp4x6RLhUFkXLV49FdPcYRsmFRkX8SLlF3STLRH0En0RTcftF6xCGRAHnLUWie51EB9V6RPMAoEQ9SR5F1AiLRNRQAUfD3uBEIhkFRfd7PUVYEllHCWyPRMqQG0Xgb5tGeR3xRUtYwUR8vJVEf1brRSkooUSh3RxFFmOvROFNlESrR6JFFRaIRWDLBUWEnp1E0b2mRaSHnkibowJFfZ8XRwOEEEXD5KRFt62JRJRCBUaGjLZE7xf+RDVoCUU03qhEC/3yRFUsmUarcK1E79FjRKEwykSg3SlGqfZXRQ+nlESY14BETTaQRKzBhkQ7ZBJFSPebRMkeWUSUFoRE1SqcRLV31kTrZxFF1xS9RWJl30Tf75FEwwHFRIPOc0QOkKhFDMNSRwhA3UT8t85EbcQ0RYcdfUR1MgFG/Z2KRLCaykVyY9BE68O+RFAh10Q29MBFKEYZRcoDgES+XIhEakOwRFf820SkerZEQBOoRAinAUUzIyJFV8DARAUdhkSL/qpEObaQRMm3JUU4XLFEEnfDRWYRl0TLuZdEvoLjRKiLAEXE5dxEZe6PRC+FAEXvpodEfD8/RTgP3UR/aK5EtfPvRP6GVEVIXoVEgltPRaWLSEY7xxdFc//0RexFwUQGnhFFp9KaRNprt0SeNZVE4R7iRBsZl0SiWJZEF0YaRRnf8EQyUZ5E6oOHRc35jEW4891EfMi2ROhTjkSO5NFE55TKRDCiv0R0ixZGYmDYRboD8EbxVYlEl3sYRSOvlkQStsFE2jKgRT63jUXI26ZEidW+RA8pj0UIRWNGjvmJREj6hkTfXBJFOjGNRCY5t0SWK7NE26G+RLpqmkXGJJ1EPYzoRPmqf0SOvL9EGvJBRcw32kTwfy9FU7WvRGu5/ET6d8BGf4eBRNsfjkUV/n1EARkNSBQR10YP0/FG7QrCRQ7BoEbrUzdF09a2RAAASEX/RZxFN72ERAbzw0TLJKREW7aJRLGMqkRE+fxEaKKQRMgvmUXUBqFETlQeRe43hUYSLbBFlyZHRT0cI0UYK3FFngyvRHxzR0V84OZFf+5iScRXV0VZmK9E7Mo6RTm6ZEjD5aBEXBPYRDgEvUbAJ6VE4/V4RQXly0SfZhdFd4q6RcpuIEZEkJVEcnnkREIqVEVV5WxEJ52nRC+T1kT9la1E9Rm8RdnGukTUhsdEzAG6RL/mokQpG3dFSvYERps9B0Wo68ZEjJa5RD6QtkQifOVHQRflRsFvuETOD59E2iopRT8xlkRvq4pE7xniRQLp4kViLyRFP4mLRCTRzkRTnc5FgD+dRMbusETkPpBEK7Y5RZTmBEUEAWxGRC9DRWe4hETORs5FaK6NRGNoAEUg66tE9+6QRexek0SozqNEimSKRIOcs0QJuvFEycDFRRD+qkaKL69E4opIRtx4/USoxLFF5rSfRaDFnkQSj+xF1jCWRPRRzESM3rxEd8rTRGNMiURz/qlEXBP7R0S/AUWuU/5G5oNLRdxszUZk4J9FMu3NRHXQkkRj8qBEIUrkRC+H80Rs85FEw6UqRfOBDUWtJ4NEbJaZRMehhUQMwJVFjV/nRpvPd0ZRZNJFFqv4RPDku0UlwaxE+3iRROBFR0YbXgRGYm+WRFUbwEQfBIdFTK6qRGUGLUWCXlpF0lSkRpVaJEbqeUxFb3IKRZ6/jUXGqmdFQhS0REBhi0Tm4WpGXPuxRF/HLEYtLrNFg4qBRfFPAEWaPVFFs+SGRvLQ6ESAu+FFLjDERFIj2kRUKXJEDMs7RWCtOEcKEyhHADCQRtVT20U3RjxF
1htFRVa8gEhKwtpH+ud/R1huOkVUhmNGjpC4RlbsoUYRartFxHepRQj98ESircVE8+SRRNo2u0TWjzJGfSsaRfCKx0SNS+hFdx/gRflbHEWW45xEdUHBRKXL7UWhB+lErOosRTt840QpzoREaTk7Re5eykTitWNG6FWmRTV7LEVfP+pE4qjuRtqQnUYOtIVFFcpURVPvDEVpE9xHl3hKRy647UZ2+yFFhNrcRQE7EkaRz6FErvMyRU5yqkSG9c1EYV3fR890NkeaONxGq1YARrVJDkUfmRtFONqGRNS1/0Wz1FdFVXkSRQqDHEfFgY9GEmRMRhEakUVEnmxFxREKRYIGnUTz6ulEGWMGR3pmc0Y95h9GvveDRdMC6kSi5bBEhhiNRJiIVkY+cvBFRy+6Rci0gESVLLpEZ+oTRuKtyUWTpyBFOUeURGa2HUW4xApGshXURHdyg0V4D1hFY7SrRGjFrkQwDf9E/4+zRHfdzEQ= - - - - - ... -``` -As you can seen in the `` elements, it contains extra information in `cvRef`. -This attribute is a reference to a data entry registering the controlled vocabularies used in the file much like in _mzQC_. -This has to be added to finalise the _mzML_. -The `` elements are again very similar to the respective entries in _mzQC_, making translation easy. -``` - - - - - - - -``` -Note that you should also add your software to `` and ``. - -### This is the mzQC file once again, in full: -**[mzml-mzqc-example.mzML](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/draft_v1/examples/mzml-mzqc-example.mzML)** \ No newline at end of file diff --git a/specification_documents/examples/mzml-mzqc-example.mzML b/specification_documents/examples/adv_mzqc_in_mzml.mzML similarity index 99% rename from specification_documents/examples/mzml-mzqc-example.mzML rename to specification_documents/examples/adv_mzqc_in_mzml.mzML index 62dc41a5..8414cbd7 100644 --- a/specification_documents/examples/mzml-mzqc-example.mzML +++ b/specification_documents/examples/adv_mzqc_in_mzml.mzML @@ -1,7 +1,7 @@ - + @@ -11,7 +11,7 @@ - + @@ -29,7 +29,7 @@ - + @@ -69,8 +69,8 @@ - - + + @@ -320,7 +320,7 @@ - + @@ -338,7 +338,7 @@ - + @@ -378,7 +378,7 @@ - + @@ -418,7 +418,7 @@ - + From 18613ab268333349b61d7bd7f9e6b7713a70fb2f Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 9 Jul 2024 12:04:59 +0200 Subject: [PATCH 06/12] Add clarification about table metrics --- 
docs/pages/worked-examples/adv_mzqc_in_mzml.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/pages/worked-examples/adv_mzqc_in_mzml.md b/docs/pages/worked-examples/adv_mzqc_in_mzml.md index d8da875c..1a136143 100644 --- a/docs/pages/worked-examples/adv_mzqc_in_mzml.md +++ b/docs/pages/worked-examples/adv_mzqc_in_mzml.md @@ -70,4 +70,7 @@ For metrics that relate to individual spectra, include these metrics at the `spe Repeat for each spectrum as necessary, adjusting the spectrum ID and corresponding values. -Thus, the key insight for embedding QC metrics in alternative file formats is that because they are backed by terms in the PSI-MS controlled vocabulary, they can be directly included using the respective functionalities for CV terms, such as `cvParam`. +Note that because QC metrics in mzQC files are typically encoded at the level of runs rather than individual spectra, most spectrum-level QC metrics are defined in the PSI-MS controlled vocabulary as tabular metrics with rows for all spectra. +Therefore, when directly associating these metrics with a specific spectrum, the tables should contain a single entry only for this spectrum. + +The key insight for embedding QC metrics in alternative file formats is that because they are backed by terms in the PSI-MS controlled vocabulary, they can be directly included using the respective functionalities for CV terms, such as `cvParam`. 
From 61e17ba4f4148f045ca6f2dbec012e3549c64971 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 19 Mar 2024 14:32:47 +0900 Subject: [PATCH 07/12] Update set mzQC for validator --- docs/pages/examples.md | 4 +- .../{set-of-runs.mzQC.md => intro_set.md} | 4 +- .../{set-of-runs.mzQC => intro_set.mzQC} | 158 ++++++++++-------- 3 files changed, 91 insertions(+), 75 deletions(-) rename docs/pages/worked-examples/{set-of-runs.mzQC.md => intro_set.md} (99%) rename specification_documents/examples/{set-of-runs.mzQC => intro_set.mzQC} (61%) diff --git a/docs/pages/examples.md b/docs/pages/examples.md index bd09aeca..6883446c 100644 --- a/docs/pages/examples.md +++ b/docs/pages/examples.md @@ -6,8 +6,8 @@ permalink: /examples/ Here are a number of worked examples, that, each for its own use-case, go step-by-step through the different parts of a mzQC. -- [Single mass spectrometry run](intro_run/) -- [Sets of runs](set-of-runs/) +- [Representing QC data for an individual mass spectrometry run](intro_run/) +- [Deriving QC data from multiple related mass spectrometry runs](intro_set/) - [QC sample mzQC](QC2-sample-example/) - [in mzML](mzml-mzqc-example/) - [Using USI with mzQC](USI-example/) diff --git a/docs/pages/worked-examples/set-of-runs.mzQC.md b/docs/pages/worked-examples/intro_set.md similarity index 99% rename from docs/pages/worked-examples/set-of-runs.mzQC.md rename to docs/pages/worked-examples/intro_set.md index e692cf6a..208dd956 100644 --- a/docs/pages/worked-examples/set-of-runs.mzQC.md +++ b/docs/pages/worked-examples/intro_set.md @@ -1,7 +1,7 @@ --- layout: page -title: "Multi-Run (i.e. sets) Example of mzQC" -permalink: /examples/set-of-runs/ +title: "Introduction to mzQC – Multiple Mass Spectrometry Runs" +permalink: /examples/intro_set/ --- Here, we describe an mzQC JSON document used to convey QC data which is computed on a set of runs, i.e. 
diff --git a/specification_documents/examples/set-of-runs.mzQC b/specification_documents/examples/intro_set.mzQC similarity index 61% rename from specification_documents/examples/set-of-runs.mzQC rename to specification_documents/examples/intro_set.mzQC index 003ece56..4903d5ae 100644 --- a/specification_documents/examples/set-of-runs.mzQC +++ b/specification_documents/examples/intro_set.mzQC @@ -4,15 +4,15 @@ "creationDate": "2020-12-01T14:19:09Z", "contactName": "Chris Bielow", "contactAddress": "chris.bielow@bsc.fu-berlin.de", - "description": "A simple mzQC file containing information for sets of runs.", + "description": "A simple mzQC file containing information for a set of multiple mass spectrometry runs.", "setQualities": [ { "metadata": { "label": "healthy", "inputFiles": [ { - "name": "tr1_healthy", - "location": "file:///C:/msdata/techRep1_healthy.mzML", + "name": "techRep1_healthy", + "location": "file://C:/msdata/techRep1_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -26,8 +26,8 @@ ] }, { - "name": "tr2_healthy", - "location": "file:///C:/msdata/techRep2_healthy.mzML", + "name": "techRep2_healthy", + "location": "file://C:/msdata/techRep2_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -41,8 +41,8 @@ ] }, { - "name": "tr3_healthy", - "location": "file:///C:/msdata/techRep3_healthy.mzML", + "name": "techRep3_healthy", + "location": "file://C:/msdata/techRep3_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -60,23 +60,29 @@ { "accession": "MS:1001058", "name": "quality estimation by manual validation", + "description": "The quality estimation was done manually.", "version": "0", "uri": "https://dx.doi.org/10.1021/pr201071t" }, { "accession": "MS:1000799", "name": "custom unreleased software tool", - "value": "mzqc-pylib", + "description": "A software tool that has not yet been released. The value should describe the software. 
Please do not use this term for publicly available software - contact the PSI-MS working group in order to have another CV term added.", "version": "0", - "uri": "https://hupo-psi.github.io/mzQC/unknown.html" + "uri": "https://hupo-psi.github.io/mzQC/" } ] }, "qualityMetrics": [ { - "accession": "QC:4000270", + "accession": "MS:4000XXX", "name": "protein contaminant intensity ratio", - "value": "0.25" + "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "value": 0.25, + "unit": { + "accession": "UO:0000190", + "name": "ratio" + } } ] }, @@ -85,8 +91,8 @@ "label": "diseased", "inputFiles": [ { - "name": "tr1_diseased", - "location": "file:///C:/msdata/techRep1_diseased.mzML", + "name": "techRep1_diseased", + "location": "file://C:/msdata/techRep1_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -100,8 +106,8 @@ ] }, { - "name": "tr2_diseased", - "location": "file:///C:/msdata/techRep2_diseased.mzML", + "name": "techRep2_diseased", + "location": "file://C:/msdata/techRep2_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -115,8 +121,8 @@ ] }, { - "name": "tr3_diseased", - "location": "file:///C:/msdata/techRep3_diseased.mzML", + "name": "techRep3_diseased", + "location": "file://C:/msdata/techRep3_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -134,16 +140,29 @@ { "accession": "MS:1001058", "name": "quality estimation by manual validation", + "description": "The quality estimation was done manually.", "version": "0", "uri": "https://dx.doi.org/10.1021/pr201071t" + }, + { + "accession": "MS:1000799", + "name": "custom unreleased software tool", + "description": "A software tool that has not yet been released. The value should describe the software. 
Please do not use this term for publicly available software - contact the PSI-MS working group in order to have another CV term added.", + "version": "0", + "uri": "https://hupo-psi.github.io/mzQC/" } ] }, "qualityMetrics": [ { - "accession": "QC:4000270", + "accession": "MS:4000XXX", "name": "protein contaminant intensity ratio", - "value": "0.31" + "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "value": 0.31, + "unit": { + "accession": "UO:0000190", + "name": "ratio" + } } ] }, @@ -152,8 +171,8 @@ "label": "all", "inputFiles": [ { - "name": "tr1_healthy", - "location": "file:///C:/msdata/techRep1_healthy.mzML", + "name": "techRep1_healthy", + "location": "file://C:/msdata/techRep1_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -167,8 +186,8 @@ ] }, { - "name": "tr2_healthy", - "location": "file:///C:/msdata/techRep2_healthy.mzML", + "name": "techRep2_healthy", + "location": "file://C:/msdata/techRep2_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -182,8 +201,8 @@ ] }, { - "name": "tr3_healthy", - "location": "file:///C:/msdata/techRep3_healthy.mzML", + "name": "techRep3_healthy", + "location": "file://C:/msdata/techRep3_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -197,8 +216,8 @@ ] }, { - "name": "tr1_diseased", - "location": "file:///C:/msdata/techRep1_diseased.mzML", + "name": "techRep1_diseased", + "location": "file://C:/msdata/techRep1_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -212,8 +231,8 @@ ] }, { - "name": "tr2_diseased", - "location": "file:///C:/msdata/techRep2_diseased.mzML", + "name": "techRep2_diseased", + "location": "file://C:/msdata/techRep2_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -227,8 +246,8 @@ ] }, { - "name": "tr3_diseased", - "location": 
"file:///C:/msdata/techRep3_diseased.mzML", + "name": "techRep3_diseased", + "location": "file://C:/msdata/techRep3_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -240,65 +259,67 @@ "value": "2012-02-03 15:00:41" } ] + }, + { + "name": "proteinGroups", + "location": "file://C:/msdata/proteinGroups.txt", + "fileFormat": { + "accession": "MS:1002130", + "name": "identification file format" + }, + "fileProperties": [ + { + "accession": "MS:1000747", + "name": "completion time", + "value": "2012-02-03 18:00:41" + } + ] } ], "analysisSoftware": [ { "accession": "MS:1001058", "name": "quality estimation by manual validation", + "description": "The quality estimation was done manually.", "version": "0", "uri": "https://dx.doi.org/10.1021/pr201071t" + }, + { + "accession": "MS:1000799", + "name": "custom unreleased software tool", + "description": "A software tool that has not yet been released. The value should describe the software. Please do not use this term for publicly available software - contact the PSI-MS working group in order to have another CV term added.", + "version": "0", + "uri": "https://hupo-psi.github.io/mzQC/" } ] }, "qualityMetrics": [ { - "accession": "QC:4000264", - "name": "group of runs", - "value": { - "inputfile_name": [ - "tr1_healthy", - "tr2_healthy", - "tr3_healthy", - "tr1_diseased", - "tr2_diseased", - "tr3_diseased" - ], - "group-label": [ - "healthy", - "healthy", - "healthy", - "diseased", - "diseased", - "diseased" - ] - } - }, - { - "accession": "QC:4000267", - "name": "PCA table", + "accession": "MS:4000091", + "name": "principal component analysis of MaxQuant's protein group lfq intensities", + "description": "A table with the PCA results of MaxQuant's protein group lfq intensities.", "value": { - "group-label": [ + "MS:4000086": [ "healthy", "diseased" ], - "PCA Dimension 1": [ - 47.22, - -30.22 + "MS:4000081": [ + 47.2, + -30.2 ], - "PCA Dimension 2": [ + "MS:4000082": [ 29.1, -36.5 ], - "PCA 
Dimension 3": [ + "MS:4000083": [ 3.8, -7.3 ], - "PCA Dimension 4": [ + "MS:4000084": [ -7.7, - 5.55 + 5.6 ], - "PCA Dimension 5": [ + "MS:4000085": [ 140.6, -64.1 ] @@ -308,15 +329,10 @@ } ], "controlledVocabularies": [ - { - "name": "Proteomics Standards Initiative Quality Control Ontology", - "uri": "https://github.com/HUPO-PSI/mzQC/blob/main/cv/qc-cv.obo", - "version": "1.0.0" - }, { "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.71/psi-ms.obo", - "version": "4.1.71" + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.130/psi-ms.obo", + "version": "4.1.130" } ] } From c1c0f3704f1e7b487b62c5e68fc93a310004b713 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 19 Mar 2024 15:30:45 +0900 Subject: [PATCH 08/12] Update set description --- .../{MultiSet_PCA.png => intro_set_pca.png} | Bin docs/pages/worked-examples/intro_run.md | 10 +- docs/pages/worked-examples/intro_set.md | 538 ++++-------------- .../examples/intro_set.mzQC | 6 +- 4 files changed, 138 insertions(+), 416 deletions(-) rename docs/pages/figures/{MultiSet_PCA.png => intro_set_pca.png} (100%) diff --git a/docs/pages/figures/MultiSet_PCA.png b/docs/pages/figures/intro_set_pca.png similarity index 100% rename from docs/pages/figures/MultiSet_PCA.png rename to docs/pages/figures/intro_set_pca.png diff --git a/docs/pages/worked-examples/intro_run.md b/docs/pages/worked-examples/intro_run.md index cfc92104..76c19d93 100644 --- a/docs/pages/worked-examples/intro_run.md +++ b/docs/pages/worked-examples/intro_run.md @@ -12,6 +12,7 @@ Here, we'll walk through the key components of an mzQC file, which uses a JSON-b You can explore the complete mzQC file [here](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/examples/intro_run.mzQC), to see all of the elements in their context. 
An mzQC file starts with the root element `mzQC`: + ``` { "mzQC": { @@ -23,6 +24,7 @@ An mzQC file starts with the root element `mzQC`: Within `mzQC`, there are three main sections: 1. **General file information:** These attributes provide essential details about the mzQC file itself. + ``` "version": "1.0.0", "creationDate": "2020-12-01T11:56:34Z", @@ -33,6 +35,7 @@ Within `mzQC`, there are three main sections: 2. **Controlled vocabulary (CV) references:** This section points to standardized vocabularies used to ensure consistent metric definitions across files. It is typically included at the end of the mzQC file. + ``` "controlledVocabularies": [ { @@ -44,6 +47,7 @@ It is typically included at the end of the mzQC file. ``` 3. **Quality metrics for the run:** This core part of the file captures the QC metrics specific to the run being described. + ``` "runQualities": [ { @@ -55,6 +59,7 @@ It is typically included at the end of the mzQC file. In the `runQualities` section, you may find multiple `runQuality` elements, each corresponding to a unique mass spectrometry run. For simplicity, this example only includes a single run in the mzQC file. First, this includes a `metadata` part detailing the run specifics, such as the source files and software used in analysis: + ``` "metadata": { "inputFiles": [ @@ -67,6 +72,7 @@ First, this includes a `metadata` part detailing the run specifics, such as the ``` Digging a bit deeper, for example, the `inputFiles` array describes each file contributing to the run, including details like file name, location (URI), format, and properties—all standardized using CV terms. + ``` "inputFiles": [ { @@ -101,6 +107,7 @@ Finally, the `qualityMetrics` array lists the metrics derived from the run, each Metrics can take various forms, such as single values, tuples (arrays of values), or more complex structures like matrices or tables, depending on the information being conveyed. 
For example, a single valued metric: + ``` { "accession": "MS:4000059", @@ -111,10 +118,11 @@ For example, a single valued metric: "accession": "UO:0000189", "name": "count unit" } -} +}, ``` And a tuple metric: + ``` { "accession": "MS:4000069", diff --git a/docs/pages/worked-examples/intro_set.md b/docs/pages/worked-examples/intro_set.md index 208dd956..c9bd0630 100644 --- a/docs/pages/worked-examples/intro_set.md +++ b/docs/pages/worked-examples/intro_set.md @@ -4,456 +4,170 @@ title: "Introduction to mzQC – Multiple Mass Spectrometry Runs" permalink: /examples/intro_set/ --- -Here, we describe an mzQC JSON document used to convey QC data which is computed on a set of runs, i.e. -is **only interpretable in the context of this set** (group). -Of course, QC metrics which refer to each run individually can also be stored, also in the same mzQC file -(see our example `individual-runs.mzQC.md` on how to do that), but this example is about group/set metrics. +This page describes how to use mzQC for analyzing groups, or "sets," of mass spectrometry runs. +This builds upon our understanding of [using mzQC for individual runs](https://hupo-psi.github.io/mzQC/examples/intro_run/), extending it to how you can analyze and represent data from multiple runs together. +Think of a "set" as a bundle of experiments that you want to examine collectively. -Find the complete example file at the bottom of this document or in the example folder. +> [!TIP] +> Sets are versatile! +> You can group runs together, but you can also group sets within other sets. +> This allows for a structured hierarchy in your analysis, like grouping technical replicates under biological ones and then comparing across conditions. + +Discover the full example of an mzQC file for a set [here](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/examples/intro_set.mzQC). 
+ +The structure of an mzQC file for a set mirrors that for a single run, starting with the root element `mzQC`: -The basic structure of our mzQC file is identical to the `individual-runs.mzQC` example, i.e. -the documents main anchor is between the outer curly brackets: ``` -{ "mzQC": - { +{ + "mzQC": { ... } } ``` -Within this main anchor, there are usually the following sections: -a) general information about the file, -``` - "version": "1.0.0", - "creationDate": "2020-12-21T11:56:34", - "contactName": "Chris Bielow", - "contactAddress": "chris.bielow@bsc.fu-berlin.de", - "description": "A simple mzQC file containing information for sets of runs.", -``` +Within `mzQC`, there are three main sections: -b) reference information for controlled vocabularies (cv) at the bottom, -``` - "controlledVocabularies": [ - { - "name": "Proteomics Standards Initiative Quality Control Ontology", - "uri": "https://github.com/HUPO-PSI/qcML-development/blob/master/cv/v0_1_0/qc-cv.obo", - "version": "0.1.0" - }, - { - "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/blob/master/psi-ms.obo", - "version": "4.1.7" - } - ] -``` -and (now in addition or as replacement) to the `runQualities` of the `individual-runs.mzQC` we have -c) information about the QC metrics computed on **a set of runs**. -``` - "setQualities": [ - { - ... - } - ] -``` -In fact, `setQualities` can contain one or more `setQuality` objects, each defining a different set of runs. -E.g. if you have three technical replicates for two conditions for at total of six runs, you might want to subsume three runs into a set, one for each condition and report the total number of proteins you identified, or the percentage of total intensity attributable to contaminants). Each `setQuality` object is an element of a JSON array, thus it is not explicitly named (i.e. there is no "setQuality" key in the mzQC file). 
-For the purpose of this example, we will use **three** `setQuality` objects (there could be none, only one or more than two though): +1. **General file information:** These attributes provide essential details about the mzQC file itself. ``` - the **healthy** set: tr1_healthy, tr2_healthy, tr3_healthy - the **diseased** set: tr1_diseased, tr2_diseased, tr3_diseased - the **all** set: tr1_healthy, tr2_healthy, tr3_healthy, tr1_diseased, tr2_diseased, tr3_diseased +"version": "1.0.0", +"creationDate": "2020-12-01T14:19:09Z", +"contactName": "Chris Bielow", +"contactAddress": "chris.bielow@bsc.fu-berlin.de", +"description": "A simple mzQC file containing information for a set of multiple mass spectrometry runs.", ``` -How you define (and name) each set, is up to you and depends on your experimental design and the kind of comparisons you want to make. +2. **Controlled vocabulary (CV) references:** This section points to standardized vocabularies used to ensure consistent metric definitions across files. +It is typically included at the end of the mzQC file. -A `setQuality` represents QC data that must be viewed in the context of all the runs of this set/group. I.e. the data is only valid within the context of the runs it comprises. E.g. it would be invalid to define a set of three runs and report their individual MS1 scan counts as a 3-tuple -- because this information can clearly be attributed to individual runs and thus belongs in three separate `runQuality` objects, rather than a single `setQuality`. -Similar to `runQuality`, a `setQuality` also contains `metadata` about the set of runs (its input file**s**, the software used, etc). -You can give the set a unique name using the `label` attribute. Here is how a `setQuality` object looks like: -``` - { - "metadata": { - "label": "healthy" - "inputFiles": - ... - }, - "qualityMetrics": [ - ... 
- ] - } -``` -The `inputFiles` consist of an array of `inputFile` objects, describing the source files with structured information about the file's name, format, location and other properties, defined via cv terms. ``` - "inputFiles": [ - { - "name": "tr1_healthy", - "location": "c:\msdata\techRep1_healthy.mzML", - ... - }, - { - "name": "tr2_healthy", - "location": "c:\msdata\techRep2_healthy.mzML", - ... - }, - { - "name": "tr3_healthy", - "location": "c:\msdata\techRep3_healthy.mzML", - ... - } - ] +"controlledVocabularies": [ + { + "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.130/psi-ms.obo", + "version": "4.1.130" + } +] ``` -The `inputFile` object is only sketched here. It can contain a lot more information, such as file format and further properties. See the full example below or `individual-runs.mzQC` for details. -In `qualityMetrics`, we will store the actual QC information for a particular `setQuality`. Each `qualityMetric` has an `accession` and the corresponding `name` as defined by the QC controlled vocabulary (see `qc-cv.obo`). They should be represented exactly as stated in the .obo file. The `value` carries the actual information and can be either a single value, a tuple of values, a matrix or table. Below, we will look at single values and tables. +3. **Quality metrics for the set:** This core part of the file captures the QC metrics specific to the set being described. -Lets start with our first metric `Protein contaminant intensity ratio`. It describes the relative intensity (in [0, 1]) of all contaminant proteins (from all runs in the set) -- the higher the value the more contaminants are present in the runs of the set. ``` - "accession": "QC:0000000", - "name": "Protein contaminant intensity ratio", - "value": 0.25 +"setQualities": [ + { + ... 
+ } +] ``` -We compute this metric for each set, in our case for the `healthy` as well as the `diseased` set, but not for the `all` set (because we want to keep the example small). But in general, what metrics you compute is up to you. +Each element within `setQualities` defines a distinct set, enabling the comparison of, say, different experimental conditions or replicate groups. + +A set's QC data is contextual—it makes sense within the bounds of the group. +For instance, it wouldn't be right to lump individual run metrics like MS1 scan counts for several runs into a single set metric; those belong to individual run analyses. + +Imagine you have several technical replicates from an experiment with two conditions, and you're interested in grouping these by technical replicates. +You might end up with sets for "healthy" and "diseased" conditions, plus a combined "all" set for overarching analyses. +As an example, we'll use three different groupings: +1. The "healthy" set, consisting of technical replicates "techRep1_healthy", "techRep2_healthy", "techRep3_healthy". +2. The "diseased" set, consisting of technical replicates "techRep1_diseased", "techRep2_diseased", "techRep3_diseased". +3. The "all" set, combining both the "healthy" and "diseased" set. -Our second example is a principal component analysis (PCA) result matrix. -The `setQuality` where this PCA metric will be stored, references **all** runs as input files. -The input table for a PCA computation can be found, for example, in MaxQuant's proteinGroups.txt output file. To stick with this example, the table in proteinGroups.txt has rows (proteins) and columns (groups, e.g. `healthy` or `diseased`), and the values in the table are protein abundances. Thus, MaxQuant has already aggregated the data from rawfiles(=runs) belonging to a certain group for us (e.g. by averaging the protein abundances). Now your QC software can derive a new table using PCA, where each group is represented by PCA coordinates. 
+These labels are important, acting as descriptive tags for each set, guiding your analysis. +Therefore, it is recommended to use a descriptive label, for example based on the experimental design or the kind of comparisons you want to make. -First, let's see what the PCA plot would look like: -![ Typically, the first two PCA dimensions are plotted, as shown here: Each data point in the plot represents one set(group), e.g. `diseased` or `healthy`.](../../pages/figures/MultiSet_PCA.png) -Now, let's look at the mzQC data which allows to create this plot: We use two separate metrics. One named `group of runs` to associate runs to groups, and secondly a `PCA table` metric to store the PCA data (the first 5 principal components for each group). ``` - "setQualities": [ - ..., - { - ..., - - "qualityMetrics": [ - { - "accession": "QC:4000264", - "name": "group of runs", - "value": { - "inputfile_name": ["tr1_healthy", "tr2_healthy", "tr3_healthy" , "tr1_diseased", "tr2_diseased", "tr3_diseased"], - "group-label": ["healthy" , "healthy" , "healthy" , "diseased" , "diseased" , "diseased"] - } - }, - { - "accession": "QC:4000267", - "name": "PCA table", - "value": { - "group-label": ["healthy", "diseased"], - "PCA Dimension 1": [47.22, -30.22], - "PCA Dimension 2": [29.1, -36.5], - "PCA Dimension 3": [3.8, -7.3], - "PCA Dimension 4": [-7.7, 5.55], - "PCA Dimension 5": [140.6, -64.1] - } - } - } - ] - +"metadata": { + "label": "healthy", + "inputFiles": [ + ... + ] +}, +"qualityMetrics": [ + ... ] ``` -Note: the `group of runs` metric can be defined only once per `setQuality`, but can be referenced in many metrics (here, for our `PCA table`) in that context. +`inputFiles` lists the specific files contributing to a set, with all the technical details neatly described using CV terms. + +``` +"inputFiles": [ + { + "name": "techRep1_healthy", + "location": "file://C:/msdata/techRep1_healthy.mzML", + ... 
+ }, + { + "name": "techRep2_healthy", + "location": "file://C:/msdata/techRep2_healthy.mzML", + ... + }, + { + "name": "techRep3_healthy", + "location": "file://C:/msdata/techRep3_healthy.mzML", + ... + } +], +``` + +Let's dive into an example metric, like the "protein contaminant intensity ratio," indicating how much of your sample is taken up by known contaminants. A higher value suggests more contamination: -If you look closely, we somewhat defined the group `healthy` twice. Once as an individual `setQuality` and once via the `group of runs` qualityMetric in the `all` set. -There is no easy way around this. If we were to omit the `all` set, we'd need to distribute the columns of the PCA table metric into separate `setQuality` objects (and whoever wants to plot it, needs to puzzle it back together; not ideal). -On the other hand, ommitting the `healthy`/`diseased` setQualities is not sensible either, because then there would be only the `all` setQuality where all data for different subsets would need to reside. +``` +{ + "accession": "MS:4000XXX", + "name": "protein contaminant intensity ratio", + "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "value": 0.25, + "unit": { + "accession": "UO:0000190", + "name": "ratio" + } +} +``` +For complex analyses, such as comparing protein abundances between healthy and diseased states, we might look at a PCA (principal component analysis). +mzQC can store PCA results, capturing the variation between these two states. +First, let's have a look at what the PCA plot would look like, plotting the first two principal components: +![PCA plot of the healthy vs diseased samples.](../../pages/figures/intro_set_pca.png) +Next, we'll look at how mzQC can encapsulate such analysis, storing the first five principal components as a table metric, referenced by the previously defined set labels.
-### This is the mzQC file once again, in full: ``` { - "mzQC": { - "version": "1.0.0", - "creationDate": "2020-12-01T14:19:09", - "contactName": "Chris Bielow", - "contactAddress": "chris.bielow@bsc.fu-berlin.de", - "description": "A simple mzQC file containing information for sets of runs.", - "setQualities": [ - { - "metadata": { - "label": "healthy", - "inputFiles": [ - { - "name": "tr1_healthy", - "location": "c:\\msdata\\techRep1_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 11:00:41" - } - ] - }, - { - "name": "tr2_healthy", - "location": "c:\\msdata\\techRep2_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 13:00:41" - } - ] - }, - { - "name": "tr3_healthy", - "location": "c:\\msdata\\techRep3_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 14:00:41" - } - ] - } - ], - "analysisSoftware": [ - { - "accession": "MS:1001058", - "name": "quality estimation by manual validation", - "version": "0", - "uri": "https://dx.doi.org/10.1021/pr201071t" - } - ] - }, - "qualityMetrics": [ - { - "accession": "QC:0000000", - "name": "Protein contaminant intensity ratio", - "value": "0.25" - } - ] - }, - - { - "metadata": { - "label": "diseased", - "inputFiles": [ - { - "name": "tr1_diseased", - "location": "c:\\msdata\\techRep1_diseased.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 12:00:41" - } - ] - }, - { - "name": "tr2_diseased", - "location": "c:\\msdata\\techRep2_diseased.mzML", - 
"fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 14:00:41" - } - ] - }, - { - "name": "tr3_diseased", - "location": "c:\\msdata\\techRep3_diseased.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 15:00:41" - } - ] - } - ], - "analysisSoftware": [ - { - "accession": "MS:1001058", - "name": "quality estimation by manual validation", - "version": "0", - "uri": "https://dx.doi.org/10.1021/pr201071t" - } - ] - }, - "qualityMetrics": [ - { - "accession": "QC:0000000", - "name": "Protein contaminant intensity ratio", - "value": "0.31" - } - ] - }, - - { - "metadata": { - "label": "all", - "inputFiles": [ - { - "name": "tr1_healthy", - "location": "c:\\msdata\\techRep1_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 11:00:41" - } - ] - }, - { - "name": "tr2_healthy", - "location": "c:\\msdata\\techRep2_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 13:00:41" - } - ] - }, - { - "name": "tr3_healthy", - "location": "c:\\msdata\\techRep3_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 14:00:41" - } - ] - }, - { - "name": "tr1_diseased", - "location": "c:\\msdata\\techRep1_diseased.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": 
"2012-02-03 12:00:41" - } - ] - }, - { - "name": "tr2_diseased", - "location": "c:\\msdata\\techRep2_diseased.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 14:00:41" - } - ] - }, - { - "name": "tr3_diseased", - "location": "c:\\msdata\\techRep3_diseased.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 15:00:41" - } - ] - } - ], - "analysisSoftware": [ - { - "accession": "MS:1001058", - "name": "quality estimation by manual validation", - "version": "0", - "uri": "https://dx.doi.org/10.1021/pr201071t" - } - ] - }, - "qualityMetrics": [ - { - "accession": "QC:4000264", - "name": "group of runs", - "value": { - "inputfile_name": ["tr1_healthy", "tr2_healthy", "tr3_healthy" , "tr1_diseased", "tr2_diseased", "tr3_diseased"], - "group-label": ["healthy" , "healthy" , "healthy" , "diseased" , "diseased" , "diseased"] - } - }, - { - "accession": "QC:4000267", - "name": "PCA table", - "value": { - "group-label": ["healthy", "diseased"], - "PCA Dimension 1": [47.22, -30.22], - "PCA Dimension 2": [29.1, -36.5], - "PCA Dimension 3": [3.8, -7.3], - "PCA Dimension 4": [-7.7, 5.55], - "PCA Dimension 5": [140.6, -64.1] - } - } - ] - } - + "accession": "MS:4000090", + "name": "principal component analysis of MaxQuant's protein group raw intensities", + "description": "A table with the PCA results of MaxQuant's protein group raw intensities.", + "value": { + "MS:4000086": [ + "healthy", + "diseased" + ], + "MS:4000081": [ + 47.2, + -30.2 + ], + "MS:4000082": [ + 29.1, + -36.5 + ], + "MS:4000083": [ + 3.8, + -7.3 + ], + "MS:4000084": [ + -7.7, + 5.6 ], - "controlledVocabularies": [ - { - "name": "Proteomics Standards Initiative Quality Control Ontology", - "uri": 
"https://github.com/HUPO-PSI/qcML-development/blob/master/cv/v0_1_0/qc-cv.obo", - "version": "0.1.0" - }, - { - "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/blob/master/psi-ms.obo", - "version": "4.1.7" - } + "MS:4000085": [ + 140.6, + -64.1 ] } } ``` +<<<<<<< HEAD ### This is the mzQC file once again, in full: -**[sets-of-runs.mzQC](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/examples/set-of-runs.mzQC)** \ No newline at end of file +**[sets-of-runs.mzQC](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/examples/set-of-runs.mzQC)** +======= +>>>>>>> 49f4133 (Update set description) diff --git a/specification_documents/examples/intro_set.mzQC b/specification_documents/examples/intro_set.mzQC index 4903d5ae..799a7039 100644 --- a/specification_documents/examples/intro_set.mzQC +++ b/specification_documents/examples/intro_set.mzQC @@ -295,9 +295,9 @@ }, "qualityMetrics": [ { - "accession": "MS:4000091", - "name": "principal component analysis of MaxQuant's protein group lfq intensities", - "description": "A table with the PCA results of MaxQuant's protein group lfq intensities.", + "accession": "MS:4000090", + "name": "principal component analysis of MaxQuant's protein group raw intensities", + "description": "A table with the PCA results of MaxQuant's protein group raw intensities.", "value": { "MS:4000086": [ "healthy", From 3047bef528ac930e42492d100b12e05368c611ff Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Wed, 20 Mar 2024 09:33:00 +0900 Subject: [PATCH 09/12] Add temporary accession number --- docs/pages/worked-examples/intro_set.md | 12 +++++------ .../examples/intro_set.mzQC | 20 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/pages/worked-examples/intro_set.md b/docs/pages/worked-examples/intro_set.md index c9bd0630..597fac70 100644 --- a/docs/pages/worked-examples/intro_set.md +++ 
b/docs/pages/worked-examples/intro_set.md @@ -44,8 +44,8 @@ It is typically included at the end of the mzQC file. "controlledVocabularies": [ { "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.130/psi-ms.obo", - "version": "4.1.130" + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.144/psi-ms.obo", + "version": "4.1.144" } ] ``` @@ -113,13 +113,13 @@ Let's dive into an example metric, like the "protein contaminant intensity ratio ``` { - "accession": "MS:4000XXX", + "accession": "MS:4000177", "name": "protein contaminant intensity ratio", - "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", "value": 0.25, "unit": { - "accession": "UO:0000190", - "name": "ratio" + "accession": "UO:0000191", + "name": "fraction" } } ``` diff --git a/specification_documents/examples/intro_set.mzQC b/specification_documents/examples/intro_set.mzQC index 799a7039..90920a98 100644 --- a/specification_documents/examples/intro_set.mzQC +++ b/specification_documents/examples/intro_set.mzQC @@ -75,13 +75,13 @@ }, "qualityMetrics": [ { - "accession": "MS:4000XXX", + "accession": "MS:4000177", "name": "protein contaminant intensity ratio", - "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. 
using the cRAP contaminant database).", "value": 0.25, "unit": { - "accession": "UO:0000190", - "name": "ratio" + "accession": "UO:0000191", + "name": "fraction" } } ] @@ -155,13 +155,13 @@ }, "qualityMetrics": [ { - "accession": "MS:4000XXX", + "accession": "MS:4000177", "name": "protein contaminant intensity ratio", - "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", "value": 0.31, "unit": { - "accession": "UO:0000190", - "name": "ratio" + "accession": "UO:0000191", + "name": "fraction" } } ] @@ -331,8 +331,8 @@ "controlledVocabularies": [ { "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.130/psi-ms.obo", - "version": "4.1.130" + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.144/psi-ms.obo", + "version": "4.1.144" } ] } From 7469484f531ddee642f0f08e5f1f15c0ab98258c Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 9 Jul 2024 10:33:58 +0200 Subject: [PATCH 10/12] Update example --- docs/pages/worked-examples/intro_set.md | 72 +++++++++++++++---- .../examples/intro_set.mzQC | 8 +-- 2 files changed, 63 insertions(+), 17 deletions(-) diff --git a/docs/pages/worked-examples/intro_set.md b/docs/pages/worked-examples/intro_set.md index 597fac70..d48dbce7 100644 --- a/docs/pages/worked-examples/intro_set.md +++ b/docs/pages/worked-examples/intro_set.md @@ -4,9 +4,9 @@ title: "Introduction to mzQC – Multiple Mass Spectrometry Runs" permalink: /examples/intro_set/ --- -This page describes how to use mzQC for analyzing groups, or "sets," of mass spectrometry runs. +In mzQC, collections of mass spectrometry runs are grouped into what are termed "sets." 
This builds upon our understanding of [using mzQC for individual runs](https://hupo-psi.github.io/mzQC/examples/intro_run/), extending it to how you can analyze and represent data from multiple runs together. -Think of a "set" as a bundle of experiments that you want to examine collectively. +Think of a "set" as a bundle of runs that you want to examine collectively, such as technical and biological replicates. > [!TIP] > Sets are versatile! @@ -64,15 +64,17 @@ Each element within `setQualities` defines a distinct set, enabling the comparis A set's QC data is contextual—it makes sense within the bounds of the group. For instance, it wouldn't be right to lump individual run metrics like MS1 scan counts for several runs into a single set metric; those belong to individual run analyses. +Instead, set metrics reflect the collective characteristics of all runs within the set, offering insights into the overall experimental quality. Imagine you have several technical replicates from an experiment with two conditions, and you're interested in grouping these by technical replicates. You might end up with sets for "healthy" and "diseased" conditions, plus a combined "all" set for overarching analyses. As an example, we'll use three different groupings: + 1. The "healthy" set, consisting of technical replicates "techRep1_healthy", "techRep2_healthy", "techRep3_healthy". 2. The "diseased" set, consisting of technical replicates "techRep1_diseased", "techRep2_diseased", "techRep3_diseased". 3. The "all" set, combining both the "healthy" and "diseased" set. -These labels are important, acting as descriptive tags for each set, guiding your analysis. +These labels are important, acting as tags for each set, guiding your analysis. Therefore, it is recommended to use a descriptive label, for example based on the experimental design or the kind of comparisons you want to make. 
``` @@ -109,24 +111,58 @@ Therefore, it is recommended to use a descriptive label, for example based on th ], ``` -Let's dive into an example metric, like the "protein contaminant intensity ratio," indicating how much of your sample is taken up by known contaminants. A higher value suggests more contamination: +Let's dive into an example metric, like the "contaminant protein abundance fraction." +This metric quantifies the abundance arising from known contaminant proteins (like keratins from skin or BSA from sample buffers) compared to the total abundance across all proteins in the sample. +High levels of contaminants can indicate issues with sample preparation or handling, leading to potential biases in the data analysis. ``` { - "accession": "MS:4000177", - "name": "protein contaminant intensity ratio", - "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", - "value": 0.25, - "unit": { - "accession": "UO:0000191", - "name": "fraction" - } + "metadata": { + "label": "healthy", + ... + }, + "qualityMetrics": [ + { + "accession": "MS:4000177", + "name": "contaminant protein abundance fraction", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", + "value": 0.25, + "unit": { + "accession": "UO:0000191", + "name": "fraction" + } + } + ] +}, +{ + "metadata": { + "label": "diseased", + ... + }, + "qualityMetrics": [ + { + "accession": "MS:4000177", + "name": "contaminant protein abundance fraction", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g.
using the cRAP contaminant database).", + "value": 0.31, + "unit": { + "accession": "UO:0000191", + "name": "fraction" + } + } + ] } ``` -For complex analyses, such as comparing protein abundances between healthy and diseased states, we might look at a PCA (principal component analysis). +While this metric can be calculated for each run individually, here we have aggregated that information across both the "healthy" and "diseased" sets instead. + +For our second example, we'll use the "all" set that combines the previous "healthy" and "diseased" sets. +To compare protein abundances between healthy and diseased states, we might look at a PCA (principal component analysis). mzQC can store PCA results, capturing the variation between these two states. +For this we extracted protein abundances from the `proteinGroups.txt` file specified as an input file to the "all" set. +This file is produced by MaxQuant and contains quantified protein intensities along with other identification information for each protein group detected in the experiment. + First, let's have a look at what the PCA plot would look like, plotting the first two principal components: ![PCA plot of the healthy vs diseased samples.](../../pages/figures/intro_set_pca.png) @@ -171,3 +207,13 @@ Next, we'll look at how mzQC can encapsulate such analysis, storing the the firs **[sets-of-runs.mzQC](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/examples/set-of-runs.mzQC)** ======= >>>>>>> 49f4133 (Update set description) + +Note how the principal components are represented as columns in a table, with each column defined by a CV term. +Additionally, the label is represented by CV term `MS:4000086`, in this case referring to the previous "healthy" and "diseased" sets. +This label can refer to any input files or metadata labels defined in the same mzQC file. 
+Consequently, we could also have performed the PCA on each input file separately, in which case the labels would have been the names of the individual input files ("techRep1_healthy", "techRep2_healthy", ..., "techRep3_diseased"). +Thus, the table metric can have a flexible number of rows, based on the input of this set and the grouping level used. + +> [!WARNING] +> It would not have been valid to perform a PCA on only the three healthy samples or only the three diseased samples. +> As mentioned previously, QC metrics in sets need to relate to _all_ elements in the set, and the current set includes both the healthy and diseased subsets. diff --git a/specification_documents/examples/intro_set.mzQC b/specification_documents/examples/intro_set.mzQC index 90920a98..aafc985b 100644 --- a/specification_documents/examples/intro_set.mzQC +++ b/specification_documents/examples/intro_set.mzQC @@ -76,7 +76,7 @@ "qualityMetrics": [ { "accession": "MS:4000177", - "name": "protein contaminant intensity ratio", + "name": "contaminant protein abundance fraction", "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", "value": 0.25, "unit": { @@ -156,7 +156,7 @@ "qualityMetrics": [ { "accession": "MS:4000177", - "name": "protein contaminant intensity ratio", + "name": "contaminant protein abundance fraction", "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g.
using the cRAP contaminant database).", "value": 0.31, "unit": { @@ -331,8 +331,8 @@ "controlledVocabularies": [ { "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.144/psi-ms.obo", - "version": "4.1.144" + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.157/psi-ms.obo", + "version": "4.1.157" } ] } From fe2776295bf0132ba3fa2542fb32754995dc49a9 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 9 Jul 2024 11:53:26 +0200 Subject: [PATCH 11/12] Update mzML example --- docs/pages/examples.md | 9 +- .../pages/worked-examples/adv_mzqc_in_mzml.md | 73 ++++++++++++++++ .../worked-examples/mzml-mzqc-example.md | 87 ------------------- ...zqc-example.mzML => adv_mzqc_in_mzml.mzML} | 18 ++-- 4 files changed, 88 insertions(+), 99 deletions(-) create mode 100644 docs/pages/worked-examples/adv_mzqc_in_mzml.md delete mode 100644 docs/pages/worked-examples/mzml-mzqc-example.md rename specification_documents/examples/{mzml-mzqc-example.mzML => adv_mzqc_in_mzml.mzML} (99%) diff --git a/docs/pages/examples.md b/docs/pages/examples.md index 6883446c..0b4d73b5 100644 --- a/docs/pages/examples.md +++ b/docs/pages/examples.md @@ -4,11 +4,14 @@ title: "mzQC Examples" permalink: /examples/ --- -Here are a number of worked examples, that, each for its own use-case, go step-by-step through the different parts of a mzQC. 
+The following use cases provide several hands-on examples of how mzQC files are structured and can be used: - [Representing QC data for an individual mass spectrometry run](intro_run/) - [Deriving QC data from multiple related mass spectrometry runs](intro_set/) - [QC sample mzQC](QC2-sample-example/) -- [in mzML](mzml-mzqc-example/) -- [Using USI with mzQC](USI-example/) - [Batch correction](metabo-batches/) + +Additionally, for more advanced usage, mzQC can closely interoperate with several other file formats developed by the Proteomics Standards Initiative: + +- [Using USI with mzQC](USI-example/) +- [Incorporating QC metrics in mzML files](adv_mzqc_in_mzml/) diff --git a/docs/pages/worked-examples/adv_mzqc_in_mzml.md b/docs/pages/worked-examples/adv_mzqc_in_mzml.md new file mode 100644 index 00000000..d8da875c --- /dev/null +++ b/docs/pages/worked-examples/adv_mzqc_in_mzml.md @@ -0,0 +1,73 @@ +--- +layout: page +title: "Incorporating QC Metrics in mzML Files" +permalink: /examples/adv_mzqc_in_mzml/ +--- + +While QC metrics in the PSI-MS controlled vocabulary are primarily intended for use in mzQC files, they can also be embedded directly within other file formats developed by the Proteomics Standards Initiative, such as [mzML](https://github.com/HUPO-PSI/mzML) and [mzIdentML](https://github.com/HUPO-PSI/mzIdentML) files. +This integration is particularly useful when it's preferred to store a limited set of QC metrics alongside the data they describe, thereby enhancing data integrity and accessibility. + +You can view a comprehensive example of an mzML file incorporating QC metrics [here](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/examples/adv_mzqc_in_mzml.mzML). +Below, we detail the steps and elements involved in this process. + +1. **Source file specification** + +Define the source of the QC metrics using a `sourceFile` element.
+This specifies the mzQC file as an input file, similarly to how other input files are handled within mzML: + +``` + + + +``` + +2. **Software and data processing** + +Document the software and data processing steps utilized to generate the mzQC file and compute the QC metrics: + +``` + + + +``` + +And: + +``` + + + + + +``` + +3. **Inclusion of QC metrics** + +Include the QC metrics at appropriate levels within the mzML structure: + +- **Run-level metrics** + +Metrics that relate to all spectra in the file are embedded at the `run` level using a `cvParam`: + +``` + + + ... + +``` + +- **Individual spectrum metrics** + +For metrics that relate to individual spectra, include these metrics at the `spectrum` level using a `cvParam`: + +``` + + ... + + ... + +``` + +Repeat for each spectrum as necessary, adjusting the spectrum ID and corresponding values. + +Thus, the key insight for embedding QC metrics in alternative file formats is that because they are backed by terms in the PSI-MS controlled vocabulary, they can be directly included using the respective functionalities for CV terms, such as `cvParam`. diff --git a/docs/pages/worked-examples/mzml-mzqc-example.md b/docs/pages/worked-examples/mzml-mzqc-example.md deleted file mode 100644 index a88869ec..00000000 --- a/docs/pages/worked-examples/mzml-mzqc-example.md +++ /dev/null @@ -1,87 +0,0 @@ ---- -layout: page -title: "Example of mzQC metrics in mzML" -permalink: /examples/mzml-mzqc-example/ ---- - -QC metrics from _mzQC_ can also be incorporated into other PSI formats, like _mzML_ and _mzIdentML_, if it is preferable to keep all information in one file. -The following describes a tiny _mzML_, for which a QC tool calculated id-free QC metrics for all individual spectra of the run and a set of charge distribution metrics. -Due to the shared design properties of _mzQC_, metrics can be directly represented as XML `` elements, however their location bound to mzML schema constraints. 
- -The first addition to the _mzML_ is the source from which the metrics came from, for reference to more QC information on the run and documentation. -``` - - - -``` -Next, the metrics computed for the whole run can be be deposited as a child of the `` element itself. Metric objects from mzQC can be directly translated into `` elements. -``` - - - -``` -Please note that for example brevity purposes the _mzML_ was truncated to 3 spectra, however the metrics were calculated on the whole to reflect realistic values. Also, the NativeID format was truncated to the index of `spectrum=` bit. - -Next, the spectra metrics can be put as children of `` elements, in the same fashion as for the run metrics. -``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - WCz3rG/BckDqpo+35sJyQE1J4Cc+w3JAzDyhysXEckCitDODWsVyQOu042tD0nJAoiDOt3fTckDdtIVDYtRyQNSyBfjM33JAss+e6yTgckBGwpA0uOByQLX5rnNO4XJAUhwGHlHickAA/uSNj+JyQOp/TXok43JAfnKJ7snxckBdMMTdffJyQInw7FO383JA942OrqH0ckDkroJ5wf9yQOGVLX5kAHNAyBdr3PoAc0AsP5cpzwJzQH+W4Nn6A3NA4KDR50wNc0BBJnBrCRJzQMhy89O8EnNAA4HnMPkTc0DMyXmnThRzQIjL8eF5H3NAs1lC6KMgc0DkNauWOSFzQC+RxY/KInNAnuTtGjskc0CSoTZhzyRzQBzsWZxGMXNATQIOz6Ezc0BQR1bMOTRzQMVRKDDbNHNABej+Abk/c0BgO7Hz40BzQPDBIlJ5QXNAx93i9XpEc0DKA4MRFktzQC4GikqkU3NAalcXnoFUc0ADETkZDVVzQKp4LDA7W3NAzXDVjIlgc0CylY18RWNzQPN7YCNRcnNArDyBouVyc0Akr1XQt3RzQLxed6ZMdXNAJ2zwrmp2c0AQbl5JeYBzQEqNPR+lgHNArW6Vrg2Bc0AzZ11fhpBzQLv575hNknNADMD2lc6Tc0AaHVYFY5RzQGYaar/3lHNAu1AmQ7mgc0A+yY+eT6FzQIhdJOByoXNAqJtmpFuic0CqlhXN5qJzQEJQa2MGpXNAOeX1aTSxc0CKWJUYgbFzQGTOTE64s3NAb2k1wQ20c0CbfOcq+cBzQPwWEC89wnNA3J+h4tDCc0B2c5zBJsNzQHeOg0zX0XNAVpkkrDTTc0B8bAYo99NzQLB4EZji1HNA7kcidqLgc0Be+mq/OOFzQDyqDkFk4XNAQlhFlzXjc0AGrX3FNfRzQATXfM+O9HNA0q+4ouQAdEBzoM8meQF0QOoq6kOkAXRA3fqSKuQDdECEpScBiBF0QFfd2rZvEnRAolR4AU0TdEBeU/tJeRR0QGLL01F7HXRAp2G4v0sgdEDCY20mjSB0QBTrB7a5IXRArLUW67okdEBqOQru2DJ0QHz1aSRTQHRAbmBIjXpDdEBcyUeP9lR0QFQT27iLVXRAfNzuGrhgdECyf1ZtCWJ0QGM3bnK+YnRAYtAy3k1kdEB79QdQjWh0QLzrZRsjcHRAekkegUFxdEDFrm9QOXN0QOjpEDY3dXRAbO7dwPiAdE
A+VFt2JYF0QF79wNhOgXRAUvpvq5KCdEA6LCK/FpB0QBYyhcrPoHRApQzDPDehdEBISWquu6J0QOZZyejRonRAkC1TXw+jdEC/UnslCbB0QGyerZbcsnRABhOAdeXAdEDFoEKsecF0QCPstVXswXRAltEjt3nEdEDyijcZuMR0QMqpzeidyHRAwAMSuv7OdEAU+QAU49N0QBK6z0TO1HRA2nD7diThdEBdjaP9t+F0QNL8885O5XRAzG68Po3zdEAuzHjbePR0QILWIcGkAHVAQkOz384AdUBwJBBGZAF1QF7oxYUFEnVA8C6UUjwTdUBxagGCJRR1QDpHsVq6I3VAF8gujs01dUDOCvK3gFF1QIaqgOxNVHVAJU0UOHdVdUD2o6BlOGF1QBdQYlQAcHVAtUiAFHhydUCXCguajXR1QPG3igx6gXVAxvPmBq6BdUBypc61qIN1QABtU+SahHVA6rnDl7yRdUBuqToLb5J1QMa9kUU9k3VAhzv6DiOhdUDq0v8luaF1QNbIUsViwXVAVoHaSPjBdUB8HQPMzdN1QJ6jQu/k33VAoHzVngjidUCkpRlWSAJ2QHhOjIXSA3ZAanuIHlUSdkANQR2lTRR2QNUlN90LFnZAU17cXIgidkDFcA4unCN2QMhuDDyRJHZAbKizD0ordkC5aB+7HzF2QGbMDSckM3ZAPn3xDlIzdkA2JciglzN2QC3GU5udNHZA3svMrx1BdkAXeXqQeUR2QCDPuSn8TnZA7AgXCMtQdkC0hYgAElF2QBynKYHNVHZADlmLCRBhdkCFXuPSG2F2QNcgd1HaZHZAo5A9qo6CdkAR0lpFvJN2QHNDNGG3sHZAiXUR+MOydkBWTnOWDbR2QO71RS6wtHZAzqkj1L/AdkBE6Ixz5MF2QLvVQjwaxHZAcH/Av87OdkBSAr8s9OF2QFQWpZqR83ZAAFjo90/0dkAFafXzNAJ3QPA6SncBEndAil+IUT0Sd0B+XCNBDiJ3QH5cm1W6JHdAAJCrGg01d0AgSCatnUF3QLg7yWQbRXdAUEBHi0tRd0B47pUck1F3QFg57YUpVXdAQbM+4Udhd0Dbhh6Jj2F3QMZ7NjQob3dAvIL77490d0BY75HNeIJ3QHB03L8JkndAO/3WIevCd0BgCJO2i9Z3QGG241dQ8ndATq7mFU4VeEAoAtuobiJ4QDkGphOFI3hAuK1deg1CeEBMhMa4ekV4QCg9YY7LUXhArBErPIlVeEBnXklU+mB4QFAs7Uy3YXhA9HvKv7dieECyF52FOnR4QPZubY9ndHhA8DGUr4t0eEBuLMkC03R4QGDbNxT+dHhAYtCVNaWBeEBizxsHmoR4QMAwayBQj3hAbEs/NZiReED6xcoXp5R4QM9m1w+6pHhASEQ6Dc7AeEDIRpwh9sF4QAypoWq11HhAANHLOQL0eEB/U9+xjPV4QMFSWRYRBHlA4gDNKpoFeUC7Wy3xwCN5QMCOVz27M3lACdKf84tCeUBUGSmX4VB5QJS/9ec3VHlATpxzLM1UeUAuAJW76mB5QHKJqxPbZHlAH2qjVUNzeUAS+c7s9YF5QFhB9ta/snlAcmzZYULUeUDqXfSizNV5QLWK2IRR5HlATVvnaPgFekD+kLnN3CZ6QEpkSFkMNXpALlpbjBpFekDag3dZLVB6QP1oUFaCUnpAml12YStVekAWKlYWOG16QIzUWRmLcnpAVmlNgQmdekA6+8yKYqJ6QCdaF0eqp3pArv2hXTOsekDAgrUZBq16QBI+g1UNtnpA/VRVWCzDekDGwb0F2NN6QGmoGV155HpA8oL4WjXuekAq6YJkKg57QKFOscFMFXtAgpnq2lole0BWgSx+Hy57QG6i6qJUMHtA4inJgMAze0D8joS9JJV7QJLh+616xntAgtrrHK7Te0BGJ8nWt9t7QL7cwylF4ntALrpHbrnje0Ak35E2/uN7QF7noQWo53tAnpNappfxe0BO3Bg9jPV7QFAv4i
yPAXxAarAQqdkBfEDUCC7j8wF8QIjFdRWZBXxABHLuttERfECQ/b5jyyF8QOoLaVHMQXxAaoAd39pRfEAi/sZ4DmJ8QDahp/aTm3xAEm140sOyfEDCbzSrD7Z8QGWbdMjM1XxA3qsV0VfifEClN6NtFvJ8QK7G+Z5V8nxAanGjYAACfUASHp+ASwJ9QJIHu6ZFEn1AtoxqsDwifUCXXx7YoTF9QPxCiKSgU31A/FSO+SzDfUAWNHd51+J9QLK8T2h6831AaXD2y4/1fUBT9Ek/jlV+QFsxQ6KZZX5Avlhs6dPvfkCIFNDxAxt/QFeQmTARM39A37p88FA4f0BWOGmdqj1/QH/YCjUDQ39AN03r11pIf0D+bujCt3F/QEA+vl6xgX9Avl0drqODf0CgRmjfYpF/QN7yq/DdkX9AdzLI5paTf0DSpP6v5Zl/QDzflyBeoX9AtFzgU86hf0BLKH6pVLF/QEeBq+mQs39AcwnRDJbdf0B2ESVQ7+J/QJK5jnhC6H9Aw3lPI57tf0C6+7u6gxGAQDrzMpEwFIBAvoqhRNUWgECyX/l95S+AQAJAdSXXNoBAot5+k+s3gED4EUGEgjmAQPIJiKwpPIBAjMKVO9Y+gEAMpBgQ90iAQEPHhRnnUIBAxtjeAhRRgEDhVu/k5ViAQMijus0OWYBASPYvLylvgEBUdfaTDnqAQBQWG40TfoBAFuZz9hyPgECsULnubJGAQCfO4HoZlIBAwRq4uMOWgEAW4xmwbJmAQMq3HbQYnIBAI3Uu14apgEDiaaNgUsGAQCIKm1NTyYBAsMeJVkzRgEBCRgKYYNGAQJz1AbxK2YBAMlgZ55jZgEC299d5RdyAQAhwZRry3oBAAZgqPkXhgECm/z7EmeGAQPcV10H26IBARmaiWUrvgECfcfm9PPOAQJp9qhhA/4BAArtdoT0HgUCcgt1BPw+BQByecoIeHoFAFtU6CzQfgUCAjONlISKBQB5c0iE2J4FANe7kV7wxgUDq7FRvVY+BQGcZ/flVl4FASNqJU+ehgUBwup5abqiBQI7J/z3sqYFAFHl6c0qvgUBMSDjFfdOBQHDJd3TYGIJADo16tNYggkCUIHrdzyiCQNHIkAekMIJAi/x8bsh5gkCOOJXvy32CQJjG0+3AgYJASDI8XdKBgkBwfZmfxoWCQCCWfT5DiYJAJJI4v0ORgkA6DOBVPJmCQOLnNrJUmYJABofqBw+hgkBj+hfVOqGCQNfiL7cNqYJAyI5w4TOpgkA3OhXJB7GCQIpazjSu+YJA6l8/cHkRg0Bg9A0mehmDQAZAQZByIYNAGJ95wnEpg0A6P7CcaTGDQDy+W6fCVYNAruu6HkDSg0C9P2+W/2iEQDmxQSv+cIRAPMT6U/h4hEBRt8eCatmEQD0cEFFq4YRAoBE0MmPphEDmIN68NvGEQGJKjqZk8YRAL+GAkTT5hEAMm5RKWvmEQAps2RIxAYVAtHDac6BhhUDF7I8FoWmFQCZfYMiYcYVAWLxLiZh5hUBaDlTYk4GFQLKM98cXeoZAssMQWye5hkDg04zVkCmHQNR0nmmRMYdAaKdVzIo5h0A0NQoLXUGHQHuNvPeKQYdAMjFGQxpeh0B8ZnwSHmKHQMxGunAhZodAVP4Fcjlqh0AQPlIDZa6HQNBmFrrHsYdAz+Am22iyh0AQZofhx7mHQPRRB/bAwYdAuppRtT5OiEA0v5eWQlKIQPaySh91rYhAvtv7nCmwiECHo5X4G9aIQA== - - - - - - 
a3BWReO5k0SWhb1EWvvWRE9oi0RWsB5HY6+0RNFygESwlQJFDlp/RJLrH0UkS4dEx59sRQuju0RV8pBEKkqYRfgjiERoJs5EuoiWRAKoD0X1cJdEs5XlRPOcJUWSxR1HCAqcRPKLnUSKg0lGVwuaRaObmkQ3gtlEdnUbRb/8+UaEGM9EIHjERFjTq0V05UBFWLm8RA0ieET9oapEp4x6RLhUFkXLV49FdPcYRsmFRkX8SLlF3STLRH0En0RTcftF6xCGRAHnLUWie51EB9V6RPMAoEQ9SR5F1AiLRNRQAUfD3uBEIhkFRfd7PUVYEllHCWyPRMqQG0Xgb5tGeR3xRUtYwUR8vJVEf1brRSkooUSh3RxFFmOvROFNlESrR6JFFRaIRWDLBUWEnp1E0b2mRaSHnkibowJFfZ8XRwOEEEXD5KRFt62JRJRCBUaGjLZE7xf+RDVoCUU03qhEC/3yRFUsmUarcK1E79FjRKEwykSg3SlGqfZXRQ+nlESY14BETTaQRKzBhkQ7ZBJFSPebRMkeWUSUFoRE1SqcRLV31kTrZxFF1xS9RWJl30Tf75FEwwHFRIPOc0QOkKhFDMNSRwhA3UT8t85EbcQ0RYcdfUR1MgFG/Z2KRLCaykVyY9BE68O+RFAh10Q29MBFKEYZRcoDgES+XIhEakOwRFf820SkerZEQBOoRAinAUUzIyJFV8DARAUdhkSL/qpEObaQRMm3JUU4XLFEEnfDRWYRl0TLuZdEvoLjRKiLAEXE5dxEZe6PRC+FAEXvpodEfD8/RTgP3UR/aK5EtfPvRP6GVEVIXoVEgltPRaWLSEY7xxdFc//0RexFwUQGnhFFp9KaRNprt0SeNZVE4R7iRBsZl0SiWJZEF0YaRRnf8EQyUZ5E6oOHRc35jEW4891EfMi2ROhTjkSO5NFE55TKRDCiv0R0ixZGYmDYRboD8EbxVYlEl3sYRSOvlkQStsFE2jKgRT63jUXI26ZEidW+RA8pj0UIRWNGjvmJREj6hkTfXBJFOjGNRCY5t0SWK7NE26G+RLpqmkXGJJ1EPYzoRPmqf0SOvL9EGvJBRcw32kTwfy9FU7WvRGu5/ET6d8BGf4eBRNsfjkUV/n1EARkNSBQR10YP0/FG7QrCRQ7BoEbrUzdF09a2RAAASEX/RZxFN72ERAbzw0TLJKREW7aJRLGMqkRE+fxEaKKQRMgvmUXUBqFETlQeRe43hUYSLbBFlyZHRT0cI0UYK3FFngyvRHxzR0V84OZFf+5iScRXV0VZmK9E7Mo6RTm6ZEjD5aBEXBPYRDgEvUbAJ6VE4/V4RQXly0SfZhdFd4q6RcpuIEZEkJVEcnnkREIqVEVV5WxEJ52nRC+T1kT9la1E9Rm8RdnGukTUhsdEzAG6RL/mokQpG3dFSvYERps9B0Wo68ZEjJa5RD6QtkQifOVHQRflRsFvuETOD59E2iopRT8xlkRvq4pE7xniRQLp4kViLyRFP4mLRCTRzkRTnc5FgD+dRMbusETkPpBEK7Y5RZTmBEUEAWxGRC9DRWe4hETORs5FaK6NRGNoAEUg66tE9+6QRexek0SozqNEimSKRIOcs0QJuvFEycDFRRD+qkaKL69E4opIRtx4/USoxLFF5rSfRaDFnkQSj+xF1jCWRPRRzESM3rxEd8rTRGNMiURz/qlEXBP7R0S/AUWuU/5G5oNLRdxszUZk4J9FMu3NRHXQkkRj8qBEIUrkRC+H80Rs85FEw6UqRfOBDUWtJ4NEbJaZRMehhUQMwJVFjV/nRpvPd0ZRZNJFFqv4RPDku0UlwaxE+3iRROBFR0YbXgRGYm+WRFUbwEQfBIdFTK6qRGUGLUWCXlpF0lSkRpVaJEbqeUxFb3IKRZ6/jUXGqmdFQhS0REBhi0Tm4WpGXPuxRF/HLEYtLrNFg4qBRfFPAEWaPVFFs+SGRvLQ6ESAu+FFLjDERFIj2kRUKXJEDMs7RWCtOEcKEyhHADCQRtVT20U3RjxF
1htFRVa8gEhKwtpH+ud/R1huOkVUhmNGjpC4RlbsoUYRartFxHepRQj98ESircVE8+SRRNo2u0TWjzJGfSsaRfCKx0SNS+hFdx/gRflbHEWW45xEdUHBRKXL7UWhB+lErOosRTt840QpzoREaTk7Re5eykTitWNG6FWmRTV7LEVfP+pE4qjuRtqQnUYOtIVFFcpURVPvDEVpE9xHl3hKRy647UZ2+yFFhNrcRQE7EkaRz6FErvMyRU5yqkSG9c1EYV3fR890NkeaONxGq1YARrVJDkUfmRtFONqGRNS1/0Wz1FdFVXkSRQqDHEfFgY9GEmRMRhEakUVEnmxFxREKRYIGnUTz6ulEGWMGR3pmc0Y95h9GvveDRdMC6kSi5bBEhhiNRJiIVkY+cvBFRy+6Rci0gESVLLpEZ+oTRuKtyUWTpyBFOUeURGa2HUW4xApGshXURHdyg0V4D1hFY7SrRGjFrkQwDf9E/4+zRHfdzEQ= - - - - - ... -``` -As you can seen in the `` elements, it contains extra information in `cvRef`. -This attribute is a reference to a data entry registering the controlled vocabularies used in the file much like in _mzQC_. -This has to be added to finalise the _mzML_. -The `` elements are again very similar to the respective entries in _mzQC_, making translation easy. -``` - - - - - - - -``` -Note that you should also add your software to `` and ``. - -### This is the mzQC file once again, in full: -**[mzml-mzqc-example.mzML](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/examples/mzml-mzqc-example.mzML)** \ No newline at end of file diff --git a/specification_documents/examples/mzml-mzqc-example.mzML b/specification_documents/examples/adv_mzqc_in_mzml.mzML similarity index 99% rename from specification_documents/examples/mzml-mzqc-example.mzML rename to specification_documents/examples/adv_mzqc_in_mzml.mzML index 62dc41a5..8414cbd7 100644 --- a/specification_documents/examples/mzml-mzqc-example.mzML +++ b/specification_documents/examples/adv_mzqc_in_mzml.mzML @@ -1,7 +1,7 @@ - + @@ -11,7 +11,7 @@ - + @@ -29,7 +29,7 @@ - + @@ -69,8 +69,8 @@ - - + + @@ -320,7 +320,7 @@ - + @@ -338,7 +338,7 @@ - + @@ -378,7 +378,7 @@ - + @@ -418,7 +418,7 @@ - + From d0481fb388bc4319d4400ec34caf8454b2154c7f Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 9 Jul 2024 12:04:59 +0200 Subject: [PATCH 12/12] Add clarification about table metrics --- 
docs/pages/worked-examples/adv_mzqc_in_mzml.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/pages/worked-examples/adv_mzqc_in_mzml.md b/docs/pages/worked-examples/adv_mzqc_in_mzml.md index d8da875c..1a136143 100644 --- a/docs/pages/worked-examples/adv_mzqc_in_mzml.md +++ b/docs/pages/worked-examples/adv_mzqc_in_mzml.md @@ -70,4 +70,7 @@ For metrics that relate to individual spectra, include these metrics at the `spe Repeat for each spectrum as necessary, adjusting the spectrum ID and corresponding values. -Thus, the key insight for embedding QC metrics in alternative file formats is that because they are backed by terms in the PSI-MS controlled vocabulary, they can be directly included using the respective functionalities for CV terms, such as `cvParam`. +Note that because QC metrics in mzQC files are typically encoded at the level of runs rather than individual spectra, most spectrum-level QC metrics are defined in the PSI-MS controlled vocabulary as tabular metrics with rows for all spectra. +Therefore, when directly associating these metrics with a specific spectrum, each table should contain only a single entry, for that spectrum. + +The key insight for embedding QC metrics in alternative file formats is that because they are backed by terms in the PSI-MS controlled vocabulary, they can be directly included using the respective functionalities for CV terms, such as `cvParam`.