From 4ff7700088495e40b37dd1ea6bc2af0dec9bef32 Mon Sep 17 00:00:00 2001 From: Mathias Walzer Date: Wed, 21 Aug 2024 09:33:24 +0100 Subject: [PATCH] Update QC2 sample example (#231) * updated QC2 sample examples, file validation fixes, explanation corrections * QC2 sample example markdown formatting * Update example * Update file names * Spelling * Fix duplicate file names * Update CV version --------- Co-authored-by: Wout Bittremieux --- docs/pages/examples.md | 2 +- .../figures/{LJCC.png => intro_qc2_ljcc.png} | Bin .../QC2-sample-example.mzQC.md | 56 ----- docs/pages/worked-examples/intro_qc2.md | 222 ++++++++++++++++++ ...QC2-sample-example.mzQC => intro_qc2.mzQC} | 48 +++- 5 files changed, 263 insertions(+), 65 deletions(-) rename docs/pages/figures/{LJCC.png => intro_qc2_ljcc.png} (100%) delete mode 100644 docs/pages/worked-examples/QC2-sample-example.mzQC.md create mode 100644 docs/pages/worked-examples/intro_qc2.md rename specification_documents/examples/{QC2-sample-example.mzQC => intro_qc2.mzQC} (66%) diff --git a/docs/pages/examples.md b/docs/pages/examples.md index 2f1f11ca..9488d869 100644 --- a/docs/pages/examples.md +++ b/docs/pages/examples.md @@ -8,7 +8,7 @@ The following use cases provide several hands-on examples of how mzQC files are - [Representing QC data for an individual mass spectrometry run](intro_run/) - [Deriving QC data from multiple related mass spectrometry runs](intro_set/) -- [QC sample mzQC](QC2-sample-example/) +- [Tracking instrument performance using controlled QC samples](intro_qc2/) - [Batch correction](metabo-batches/) Additionally, for more advanced usage, mzQC can closely interoperate with several other file formats developed by the Proteomics Standards Initiative: diff --git a/docs/pages/figures/LJCC.png b/docs/pages/figures/intro_qc2_ljcc.png similarity index 100% rename from docs/pages/figures/LJCC.png rename to docs/pages/figures/intro_qc2_ljcc.png diff --git a/docs/pages/worked-examples/QC2-sample-example.mzQC.md 
b/docs/pages/worked-examples/QC2-sample-example.mzQC.md deleted file mode 100644 index afae5f15..00000000 --- a/docs/pages/worked-examples/QC2-sample-example.mzQC.md +++ /dev/null @@ -1,56 +0,0 @@ ---- -layout: page -title: "QC Sample-Run Example of mzQC" -permalink: /examples/QC2-sample-example/ ---- - -Here, we describe details of a mzQC JSON document used for a QC sample mass spectrometry run. -For description of the general structure of mzQC, see the Single-Run Example of mzQC. -Find the complete file at the bottom of this document or in the example folder. -The mzQC file is made from the acquision of a QC2 sample as described in [QCloud: A cloud-based quality control system for mass spectrometry-based proteomics laboratories](https://doi.org/10.1371/journal.pone.0189209). -Optional (detailed) descriptions about the file can be placed into mzQC next to the general information about the file. -``` - "description": "This is an example of an mzQC file produced from a proteomics QC2 sample. 20 ug dried Pierce HeLa protein digest standard from Thermo Fisher Scientific (Part number: 88329) are dissolved in 200 uL of 0.1% formic acid in water to a final concentration of 100 ng/uL. A total amount of 1 uL (100ng) is injected per analysis.", -``` -The metrics describe simple values like the cornerstone numbers of the acquisition and identification process, but also information specific to the QC method employed. Here, it is the mass accuracies and MS1 feature areas of selected peptides. With regular runs like this, the instrument's performance can be reliably monitored and maintenance interventions scheduled before valuable samples are wasted on an instrument running at sub-par performance. 
-``` - { - "accession":"MS:4000078", - "name":"QC2 sample mass accuracies", - "value":{ - "MS:1003169":["YAEAVTR","STLTDSLVC(Carbamidomethyl)K","SLADELALVDVLEDK","NPDDITNEEYGEFYK","LAVDEEENADNNTK","FEELNMDLFR","EAALSTALSEK","DDVAQTDLLQIDPNFGSK","RFPGYDSESK","EATTEFSVDAR","EQFLDGDGWTSR"], - "MS:4000072":[-0.2346854518740762,-0.08024023890884578,-0.1322012562867409,-0.2259441806378488,-0.10596535779273217,0.28345130855013684,-0.08600783742175504,-0.3683484942567654,-0.03348194493295555,-0.41789282666789496,-0.12794363836212685] - } - }, - { - "accession":"MS:4000079", - "name":"QC2 sample intensities", - "value":{ - "MS:1003169":["YAEAVTR","STLTDSLVC(Carbamidomethyl)K","SLADELALVDVLEDK","NPDDITNEEYGEFYK","LAVDEEENADNNTK","FEELNMDLFR","EAALSTALSEK","DDVAQTDLLQIDPNFGSK","RFPGYDSESK","EATTEFSVDAR","EQFLDGDGWTSR"], - "MS:1001844":[1234940000.0,922790000.0,80819100.0,478714000.0,254935000.0,52841200.0,243597000.0,24581800.0,707504000.0,129063000.0,205583000.0] - } - } -``` -The individual peptides' values are stored in a table, that is defined by the respective metric cv term. In case of the feature areas, there is a column indicating the peptide and another column for the respective feature area. -``` -[Term] -id: MS:4000079 -name: QC2 sample intensities -def: "Observed intensities for the peptides of a QC2 sample measurement within 5 ppm and +/- 240 s RT tolerance. Different metrics of observed intensities are possible, at least one must be present. The table should contain the peptides as defined in the parent QC2 sample metric term, missing are interpreted as not detected." [PSI:MS] -is_a: MS:4000005 ! table -relationship: has_metric_category MS:4000076 ! QC2 sample metric -relationship: has_metric_category MS:4000008 ! ID based metric -relationship: has_column MS:1003169 ! proforma peptidoform sequence -relationship: has_optional_column MS:1001858 ! XIC area -relationship: has_optional_column MS:1001859 ! 
normalized XIC area -relationship: has_optional_column MS:1001844 ! MS1 feature area -relationship: has_optional_column MS:1001843 ! MS1 feature maximum intensity -relationship: has_optional_column MS:1003085 ! previous MSn-1 scan precursor intensity -``` -Since each column is in turn defined by a cv term, the column can also be assigned an expected value type and unit. In this case the feature area column is expected to contain values of `MS:1001844 - MS1 feature area`s. This concept allows for easier automated metric consumption and even generic plotting of graphs. With a collection consecutive QC2 sample mzQC files, a plot like a Levey-Jennings Control Chart are easily achieved. - -![Levey-Jennings Control Chart](../../pages/figures/LJCC.png) - - -### This is the mzQC file once again, in full: -**[QC2-sample-example.mzQC](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/examples/QC2-sample-example.mzQC)** \ No newline at end of file diff --git a/docs/pages/worked-examples/intro_qc2.md b/docs/pages/worked-examples/intro_qc2.md new file mode 100644 index 00000000..157a0069 --- /dev/null +++ b/docs/pages/worked-examples/intro_qc2.md @@ -0,0 +1,222 @@ +--- +layout: page +title: "Introduction to mzQC – Tracking Instrument Performance" +permalink: /examples/intro_qc2/ +--- + +This document outlines the utilization of an mzQC file for quality control (QC) of a mass spectrometry proteomics experiment. +The mzQC file discussed here is derived from a QC2 sample, following protocols established in the publication, [QCloud: A cloud-based quality control system for mass spectrometry-based proteomics laboratories](https://doi.org/10.1371/journal.pone.0189209). +A QC2 sample is defined as a high complexity sample that mimics real samples analyzed in a proteomics laboratory, and is meant to be injected 1–5 times per week as a sample to test system suitability. 
+ +Here we demonstrate how real-life QC metrics are calculated for a single mass spectrometry run using tools such as QCloud. +You can view the complete structure of this mzQC example [here](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/examples/intro_qc2.mzQC). + +## File description + +Our mzQC file example provides a detailed description of its contents and purpose, which allows users to understand the context and relevance of the QC metrics: + +```json +"description": "This is an example of an mzQC file produced from a proteomics QC2 sample. 20 ug dried Pierce HeLa protein digest standard from Thermo Fisher Scientific (Part number: 88329) are dissolved in 200 uL of 0.1% formic acid in water to a final concentration of 100 ng/uL. A total amount of 1 uL (100ng) is injected per analysis.", +``` + +## Input files specification + +The mzQC file lists necessary input files including both the raw mass spectrometry data file (mzML) and the peptide identification data file (mzIdentML). +The latter is required for deriving ID-based QC metrics later on. 
+ +```json +"inputFiles": [ + { + "location": "file://tmp/QC2_18052020.mzML", + "name": "QC2_18052020_mzML", + "fileFormat": { + "accession": "MS:1000584", + "name": "mzML format" + }, + "fileProperties": [ + { + "accession": "MS:1000747", + "name": "completion time", + "value": "2020-05-18 09:20:48" + }, + { + "accession": "MS:1000569", + "name": "SHA-1", + "value": "fbe692c887404179518089abc670484c" + }, + { + "accession": "MS:1000031", + "name": "instrument model", + "value": "LTQ Orbitrap Velos" + } + ] + }, + { + "location": "file://tmp/QC2_18052020.mzid", + "name": "QC2_18052020_mzId", + "fileFormat": { + "accession": "MS:1002073", + "name": "mzIdentML format" + } + } +], +``` + +## Metrics calculation + +First, the mzQC file includes single-value metrics that provide quantifiable data on the MS data acquisition process, such as the number of MS2 spectra, identified spectra, peptides, and proteins: + +```json +{ + "accession": "MS:4000060", + "name": "number of MS2 spectra", + "description": "The number of MS2 events in the run.", + "value": 62299, + "unit": { + "accession": "UO:0000189", + "name": "count unit" + } +}, +{ + "accession": "MS:1003251", + "name": "count of identified spectra", + "description": "The number of spectra that pass the threshold to be considered identified with sufficient confidence.", + "value": 24765, + "unit": { + "accession": "UO:0000189", + "name": "count unit" + } +}, +{ + "accession": "MS:1003250", + "name": "count of identified peptidoforms", + "description": "The number of peptidoforms that pass the threshold to be considered identified with sufficient confidence.", + "value": 22241, + "unit": { + "accession": "UO:0000189", + "name": "count unit" + } +}, +{ + "accession": "MS:1002404", + "name": "count of identified proteins", + "description": "The number of proteins that have been identified, which must match the number of groups that pass the threshold in the file.", + "value": 5504, + "unit": { + "accession": 
"UO:0000189", + "name": "count unit" + } +}, +``` + +Next, the file includes metrics on precursor mass accuracies and sample intensities for selected peptides from the QC2 sample: + +```json +{ + "accession": "MS:4000078", + "name": "QC2 sample mass accuracies", + "description": "Observed mass accuracy for the peptides of a QC2 sample measurement. The table should contain the peptides as described in the QC2 sample metric term, missing are interpreted as not detected.", + "value": { + "MS:1003169": [ + "YAEAVTR", + "STLTDSLVC[Carbamidomethyl]K", + "SLADELALVDVLEDK", + "NPDDITNEEYGEFYK", + "LAVDEEENADNNTK", + "FEELNMDLFR", + "EAALSTALSEK", + "DDVAQTDLLQIDPNFGSK", + "RFPGYDSESK", + "EATTEFSVDAR", + "EQFLDGDGWTSR" + ], + "MS:4000072": [ + -0.2346854518740762, + -0.08024023890884578, + -0.1322012562867409, + -0.2259441806378488, + -0.10596535779273217, + 0.28345130855013684, + -0.08600783742175504, + -0.3683484942567654, + -0.03348194493295555, + -0.41789282666789496, + -0.12794363836212685 + ] + } +}, +{ + "accession": "MS:4000079", + "name": "QC2 sample intensities", + "description": "Observed intensities for the peptides of a QC2 sample measurement within 5 ppm and +/- 240 s RT tolerance. Different metrics of observed intensities are possible, at least one must be present. 
The table should contain the peptides as defined in the parent QC2 sample metric term, missing are interpreted as not detected.", + "value": { + "MS:1003169": [ + "YAEAVTR", + "STLTDSLVC[Carbamidomethyl]K", + "SLADELALVDVLEDK", + "NPDDITNEEYGEFYK", + "LAVDEEENADNNTK", + "FEELNMDLFR", + "EAALSTALSEK", + "DDVAQTDLLQIDPNFGSK", + "RFPGYDSESK", + "EATTEFSVDAR", + "EQFLDGDGWTSR" + ], + "MS:1001844": [ + 1234940000, + 922790000, + 80819100, + 478714000, + 254935000, + 52841200, + 243597000, + 24581800, + 707504000, + 129063000, + 205583000 + ] + } +} +``` + +These metrics are structured as tables within the mzQC document, with each row representing a peptide and columns detailing the specific metric values. + +## Controlled vocabulary definition + +All QC metrics in an mzQC file should be backed by a formal definition in a controlled vocabulary (CV) or ontology. +By default, mzQC sources its metrics from the [PSI-MS CV](https://github.com/HUPO-PSI/psi-ms-CV/). +For example, the "QC2 sample intensities" metric is formally defined in the PSI-MS CV as follows: + +``` +[Term] +id: MS:4000079 +name: QC2 sample intensities +def: "Observed intensities for the peptides of a QC2 sample measurement within 5 ppm and +/- 240 s RT tolerance. Different metrics of observed intensities are possible, at least one must be present. The table should contain the peptides as defined in the parent QC2 sample metric term, missing are interpreted as not detected." [PSI:MS] +is_a: MS:4000005 ! table +relationship: has_metric_category MS:4000076 ! QC2 sample metric +relationship: has_metric_category MS:4000008 ! ID based metric +relationship: has_column MS:1003169 ! proforma peptidoform sequence +relationship: has_optional_column MS:1001858 ! XIC area +relationship: has_optional_column MS:1001859 ! normalized XIC area +relationship: has_optional_column MS:1001844 ! MS1 feature area +relationship: has_optional_column MS:1001843 ! 
MS1 feature maximum intensity +relationship: has_optional_column MS:1003085 ! previous MSn-1 scan precursor intensity +``` + +This CV term is structured to capture multiple aspects of peptide detection and quantification, which include: + +- [ProForma](https://github.com/HUPO-PSI/ProForma) peptidoform sequence: Mandatory column denoting the peptides that were detected. +- Abundance measurements: Various optional columns can be used to record the peptide abundances using different strategies. At least one of these optional columns must be present; typically only one is used. + +The example above records the peptide intensities based on the MS1 feature areas. Thus, the second column of the metric table is keyed by `MS:1001844`, the accession of the optional "MS1 feature area" column listed in the term definition. + +## Visualization and data analysis + +The structured data in mzQC allows for effective visualization and analysis, such as plotting trends across multiple peptides, samples, or experiments. +This can help identify any deviations or potential issues with the mass spectrometry process, prompting timely maintenance and calibration actions to maintain optimal performance. +For example, Levey-Jennings charts can be used to enable quick visual assessment of instrument stability or drift, critical for high-stakes or high-throughput proteomics workflows: + +![Levey-Jennings control chart](../../pages/figures/intro_qc2_ljcc.png) + +This example demonstrates how QC information in mzQC files helps in monitoring instrument performance, ensuring that maintenance is proactive and timely, thereby preserving the integrity and effectiveness of subsequent analyses. 
diff --git a/specification_documents/examples/QC2-sample-example.mzQC b/specification_documents/examples/intro_qc2.mzQC similarity index 66% rename from specification_documents/examples/QC2-sample-example.mzQC rename to specification_documents/examples/intro_qc2.mzQC index 21e8b2c3..27617724 100644 --- a/specification_documents/examples/QC2-sample-example.mzQC +++ b/specification_documents/examples/intro_qc2.mzQC @@ -11,7 +11,7 @@ "inputFiles": [ { "location": "file://tmp/QC2_18052020.mzML", - "name": "QC type 2 sample", + "name": "QC2_18052020_mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -33,20 +33,30 @@ "value": "LTQ Orbitrap Velos" } ] + }, + { + "location": "file://tmp/QC2_18052020.mzid", + "name": "QC2_18052020_mzId", + "fileFormat": { + "accession": "MS:1002073", + "name": "mzIdentML format" + } } ], "analysisSoftware": [ { "accession": "MS:1001058", "name": "quality estimation by manual validation", + "description": "The quality estimation was done manually.", "version": "0", "uri": "https://dx.doi.org/10.1021/pr201071t" }, { "accession": "MS:1000799", "name": "custom unreleased software tool", - "version": "0", + "description": "A software tool that has not yet been released. The value should describe the software. 
Please do not use this term for publicly available software - contact the PSI-MS working group in order to have another CV term added.", "value": "mzqc-pylib", + "version": "0", "uri": "https://hupo-psi.github.io/mzQC/unknown.html" } ] @@ -55,26 +65,47 @@ { "accession": "MS:4000060", "name": "number of MS2 spectra", - "value": 62299 + "description": "The number of MS2 events in the run.", + "value": 62299, + "unit": { + "accession": "UO:0000189", + "name": "count unit" + } }, { "accession": "MS:1003251", "name": "count of identified spectra", - "value": 24765 + "description": "The number of spectra that pass the threshold to be considered identified with sufficient confidence.", + "value": 24765, + "unit": { + "accession": "UO:0000189", + "name": "count unit" + } }, { "accession": "MS:1003250", "name": "count of identified peptidoforms", - "value": 22241 + "description": "The number of peptidoforms that pass the threshold to be considered identified with sufficient confidence.", + "value": 22241, + "unit": { + "accession": "UO:0000189", + "name": "count unit" + } }, { "accession": "MS:1002404", "name": "count of identified proteins", - "value": "5504" + "description": "The number of proteins that have been identified, which must match the number of groups that pass the threshold in the file.", + "value": 5504, + "unit": { + "accession": "UO:0000189", + "name": "count unit" + } }, { "accession": "MS:4000078", "name": "QC2 sample mass accuracies", + "description": "Observed mass accuracy for the peptides of a QC2 sample measurement. The table should contain the peptides as described in the QC2 sample metric term, missing are interpreted as not detected.", "value": { "MS:1003169": [ "YAEAVTR", @@ -107,6 +138,7 @@ { "accession": "MS:4000079", "name": "QC2 sample intensities", + "description": "Observed intensities for the peptides of a QC2 sample measurement within 5 ppm and +/- 240 s RT tolerance. 
Different metrics of observed intensities are possible, at least one must be present. The table should contain the peptides as defined in the parent QC2 sample metric term, missing are interpreted as not detected.", "value": { "MS:1003169": [ "YAEAVTR", @@ -142,8 +174,8 @@ "controlledVocabularies": [ { "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.79/psi-ms.obo", - "version": "4.1.79" + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.172/psi-ms.obo", + "version": "4.1.172" } ] }