From 05bfcc0908031569a06044a63079053677a05448 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Wed, 13 Mar 2024 10:10:44 +0000 Subject: [PATCH 01/46] Fix the way vite runs in dev mode so that browser tools work --- frontend/Dockerfile | 2 +- frontend/package.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/frontend/Dockerfile b/frontend/Dockerfile index 1e777afc..7cd84b37 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -27,4 +27,4 @@ COPY ./vite.config.js /code/longue_vue/vite.config.js COPY ./index.html /code/longue_vue/index.html COPY ./.env /code/longue_vue/.env -CMD ["npm", "exec", "vite", "--", "--host", "--port", "80", "--base", "/ui/"] +CMD ["npm", "run", "dev", "--", "--host", "--port", "80", "--base", "/ui/"] diff --git a/frontend/package.json b/frontend/package.json index 1e1f1172..7617a3cb 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -9,9 +9,9 @@ "npm": ">=8.11.0" }, "scripts": { - "dev": "vite --port 3000", + "dev": "vite --port 80 --mode development", "build": "vite build", - "preview": "vite preview --port 3000", + "preview": "vite preview --port 80", "test": "vitest run", "coverage": "vitest run --coverage", "test:e2e:ci": "start-server-and-test preview http://localhost:3000/ 'cypress run --e2e'", From 59c25027793658b82462c2ba1ea77df08dadcc4d Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Thu, 2 May 2024 13:39:17 +0000 Subject: [PATCH 02/46] Put in some syntactic sugar checksum types to enable differentiation between use of id_product in mlwh tables --- lang_qc/endpoints/pacbio_well.py | 9 +++++---- lang_qc/util/type_checksum.py | 16 ++++++++++++++++ tests/test_checksum_type.py | 2 +- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/lang_qc/endpoints/pacbio_well.py b/lang_qc/endpoints/pacbio_well.py index f9d49573..27534f6a 100644 --- a/lang_qc/endpoints/pacbio_well.py +++ b/lang_qc/endpoints/pacbio_well.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, 2023 Genome Research 
Ltd. +# Copyright (c) 2022, 2023, 2024 Genome Research Ltd. # # Authors: # Adam Blanchet @@ -40,13 +40,14 @@ from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellFull from lang_qc.models.qc_flow_status import QcFlowStatusEnum from lang_qc.models.qc_state import QcState, QcStateBasic +from lang_qc.models.pacbio.qc_data import QCPoolMetrics from lang_qc.util.auth import check_user from lang_qc.util.errors import ( InconsistentInputError, InvalidDictValueError, RunNotFoundError, ) -from lang_qc.util.type_checksum import ChecksumSHA256 +from lang_qc.util.type_checksum import ChecksumSHA256, PacBioWellSHA256, PacBioProductSHA256 """ A collection of API endpoints that are specific to the PacBio sequencing @@ -173,7 +174,7 @@ def get_wells_in_run( response_model=PacBioWellFull, ) def get_seq_metrics( - id_product: ChecksumSHA256, + id_product: PacBioWellSHA256, mlwhdb_session: Session = Depends(get_mlwh_db), qcdb_session: Session = Depends(get_qc_db), ) -> PacBioWellFull: @@ -210,7 +211,7 @@ def get_seq_metrics( status_code=status.HTTP_201_CREATED, ) def claim_qc( - id_product: ChecksumSHA256, + id_product: PacBioWellSHA256, user: User = Depends(check_user), qcdb_session: Session = Depends(get_qc_db), mlwhdb_session: Session = Depends(get_mlwh_db), diff --git a/lang_qc/util/type_checksum.py b/lang_qc/util/type_checksum.py index a704d3e6..15b8bdcd 100644 --- a/lang_qc/util/type_checksum.py +++ b/lang_qc/util/type_checksum.py @@ -40,3 +40,19 @@ def validate(cls, v, _): def __repr__(self): return f"ChecksumSHA256({super().__repr__()})" + + +class PacBioWellSHA256(ChecksumSHA256): + """ + A checksum generated from the coordinates of a single well on a plate in a PacBio run + """ + pass + + +class PacBioProductSHA256(ChecksumSHA256): + """ + A checksum generated from the combination of run, well, plate and any tags required for deplexing + See `npg_id_generation.pac_bio.PacBioEntity`. + Tags only contribute to the checksum when samples are multiplexed. 
+ """ + pass \ No newline at end of file diff --git a/tests/test_checksum_type.py b/tests/test_checksum_type.py index aba01f72..41474f75 100644 --- a/tests/test_checksum_type.py +++ b/tests/test_checksum_type.py @@ -7,7 +7,7 @@ class ChecksumSHA256User(BaseModel): - product_chcksm: ChecksumSHA256 + product_chcksm: ChecksumSHA256 | None = None def test_valid_checksum(): From 486faca2e5629578a77ba1741e64567e6145fb10 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Fri, 10 May 2024 15:44:11 +0000 Subject: [PATCH 03/46] Undo type fix, it seems to impact how the code works. Disturbing. --- tests/test_checksum_type.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_checksum_type.py b/tests/test_checksum_type.py index 41474f75..aba01f72 100644 --- a/tests/test_checksum_type.py +++ b/tests/test_checksum_type.py @@ -7,7 +7,7 @@ class ChecksumSHA256User(BaseModel): - product_chcksm: ChecksumSHA256 | None = None + product_chcksm: ChecksumSHA256 def test_valid_checksum(): From 2b9dc32659af72ac08afce95c36befa9ced07636 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Fri, 10 May 2024 15:47:04 +0000 Subject: [PATCH 04/46] Define a response model for pool metrics for a given well --- lang_qc/models/pacbio/qc_data.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/lang_qc/models/pacbio/qc_data.py b/lang_qc/models/pacbio/qc_data.py index 3fe13e5e..8f64b842 100644 --- a/lang_qc/models/pacbio/qc_data.py +++ b/lang_qc/models/pacbio/qc_data.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, 2023 Genome Research Ltd. +# Copyright (c) 2022, 2023, 2024 Genome Research Ltd. 
# # Authors: # Marina Gourtovaia @@ -23,6 +23,7 @@ from pydantic import BaseModel, ConfigDict, Field from lang_qc.db.mlwh_schema import PacBioRunWellMetrics +from lang_qc.util.type_checksum import PacBioProductSHA256 # Pydantic prohibits us from defining these as @classmethod or @staticmethod @@ -153,3 +154,31 @@ def from_orm(cls, obj: PacBioRunWellMetrics): qc_data[name]["value"] = getattr(obj, name, None) return cls.model_validate(qc_data) + + +class SampleDeplexingStats(BaseModel): + """ + A representation of metrics for one product, some direct from the DB and others inferred + + For a long time tag2_name was null and tag1_name was silently used at both ends of the sequence. + As a result tag2_name will be None for most data in or before 2024. + """ + + id_product: PacBioProductSHA256 + tag1_name: str | None + tag2_name: str | None + hifi_read_bases: int | None + hifi_num_reads: int | None + hifi_read_length_mean: float | None + hifi_bases_percent: float | None + percentage_total_reads: float | None + + +class QCPoolMetrics(BaseModel): + pool_coeff_of_variance: float | None = Field( + title="Coefficient of variance for reads in the pool", + description="Percentage of the standard deviation w.r.t. 
mean, reported when the pool is larger than one", + ) + products: list[SampleDeplexingStats] = Field( + title="List of products and their metrics" + ) From 3b3c32aecb6a6a8223b74b9b09da93e82868e659 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Tue, 14 May 2024 16:07:47 +0000 Subject: [PATCH 05/46] Allow WellWh helper to compute pool metrics --- lang_qc/db/helper/wells.py | 45 ++++++- tests/fixtures/sample_data.py | 205 +++++++++++++++++++++++++++++ tests/test_pac_bio_qc_data_well.py | 32 +++++ 3 files changed, 280 insertions(+), 2 deletions(-) create mode 100644 tests/fixtures/sample_data.py diff --git a/lang_qc/db/helper/wells.py b/lang_qc/db/helper/wells.py index 976dd632..48a3042a 100644 --- a/lang_qc/db/helper/wells.py +++ b/lang_qc/db/helper/wells.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, 2023 Genome Research Ltd. +# Copyright (c) 2022, 2023, 2024 Genome Research Ltd. # # Authors: # Marina Gourtovaia @@ -21,6 +21,7 @@ import logging from datetime import date, datetime, timedelta +from statistics import mean, stdev from typing import ClassVar, List from pydantic import BaseModel, ConfigDict, Field @@ -33,11 +34,13 @@ ) from lang_qc.db.mlwh_schema import PacBioRunWellMetrics from lang_qc.db.qc_schema import QcState, QcStateDict, QcType +from lang_qc.models.pacbio.qc_data import QCPoolMetrics, SampleDeplexingStats from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellSummary from lang_qc.models.pager import PagedResponse from lang_qc.models.qc_flow_status import QcFlowStatusEnum from lang_qc.models.qc_state import QcState as QcStateModel from lang_qc.util.errors import EmptyListOfRunNamesError, RunNotFoundError +from lang_qc.util.type_checksum import PacBioWellSHA256 """ This package is using an undocumented feature of Pydantic, type @@ -64,7 +67,7 @@ class WellWh(BaseModel): # The TestClient seems to be keeping these instances alive and changing them. 
def get_mlwh_well_by_product_id( - self, id_product: str + self, id_product: PacBioWellSHA256 ) -> PacBioRunWellMetrics | None: """ Returns a well row record from the well metrics table or @@ -77,6 +80,44 @@ def get_mlwh_well_by_product_id( ) ).scalar_one_or_none() + def get_metrics_by_well_product_id( + self, id_product: PacBioWellSHA256 + ) -> QCPoolMetrics | None: + well = self.get_mlwh_well_by_product_id(id_product) + if well: + product_metrics = well.pac_bio_product_metrics + if len(product_metrics) == 1: + return None + + cov: float | None + if any(p.hifi_num_reads is None for p in product_metrics): + cov = None + else: + hifi_reads = [prod.hifi_num_reads for prod in product_metrics] + cov = stdev(hifi_reads) / mean(hifi_reads) * 100 + + return QCPoolMetrics( + pool_coeff_of_variance=cov, + products=[ + SampleDeplexingStats( + id_product=prod.id_pac_bio_product, + tag1_name=prod.pac_bio_run.tag_identifier, + tag2_name=prod.pac_bio_run.tag2_identifier, + hifi_read_bases=prod.hifi_read_bases, + hifi_num_reads=prod.hifi_num_reads, + hifi_read_length_mean=prod.hifi_read_length_mean, + hifi_bases_percent=prod.hifi_bases_percent, + percentage_total_reads=( + prod.hifi_num_reads / well.hifi_num_reads * 100 + if well.hifi_num_reads + else None + ), + ) + for prod in product_metrics + ], + ) + return None + def recent_completed_wells(self) -> List[PacBioRunWellMetrics]: """ Get recent not QC-ed completed wells from the mlwh database. 
diff --git a/tests/fixtures/sample_data.py b/tests/fixtures/sample_data.py new file mode 100644 index 00000000..15ed2242 --- /dev/null +++ b/tests/fixtures/sample_data.py @@ -0,0 +1,205 @@ +from datetime import datetime + +import pytest +from npg_id_generation.pac_bio import PacBioEntity + +from lang_qc.db.mlwh_schema import ( + PacBioProductMetrics, + PacBioRun, + PacBioRunWellMetrics, + Sample, + Study, +) + + +@pytest.fixture(scope="function", params=["AAAAAAAA", None]) +def simplex_run(mlwhdb_test_session): + """ + A single sample, well, run mlwh fixture that provides both an explicit tag1 + for the sample, and an implicit default tag (when the PacBio instrument is + run with default barcodes) + """ + run_name = "RUN" + well_label = "A1" + plate_number = 1 + tag1 = mlwhdb_test_session.param + + common_run_attribs = { + "recorded_at": datetime.now(), + "last_updated": datetime.now(), + "pipeline_id_lims": "nobody cares", + "cost_code": "probably ToL", + "id_lims": 1, + "plate_uuid_lims": "uuid1", + "well_uuid_lims": "uuid1", + "pac_bio_library_tube_id_lims": "id", + "pac_bio_library_tube_uuid": "uuid", + "pac_bio_library_tube_name": "bob", + } + + well_metrics_a1 = PacBioRunWellMetrics( + pac_bio_run_name=run_name, + well_label=well_label, + plate_number=plate_number, + instrument_type="Revio", + id_pac_bio_product=PacBioEntity( + run_name=run_name, + well_label=well_label, + plate_number=plate_number, + ).hash_product_id(), + ) + + study = Study( + id_lims="id", + id_study_lims="1", + ) + + # This run-well-plate has one singly tagged sample + simplex_run = PacBioRun( + pac_bio_run_name=run_name, + well_label=well_label, + plate_number=plate_number, + id_pac_bio_run_lims=0, + sample=Sample( + id_lims="id", + id_sample_lims="1", + ), + study=study, + plate_barcode="ABCD", + pac_bio_product_metrics=[ + PacBioProductMetrics( + id_pac_bio_product=PacBioEntity( + run_name=run_name, + well_label=well_label, + plate_number=plate_number, + tags=tag1, + 
).hash_product_id(), + qc=1, + hifi_read_bases=900, + hifi_num_reads=10, + hifi_read_length_mean=90, + barcode_quality_score_mean=34, + hifi_read_quality_mean=35, + hifi_bases_percent=90.001, + pac_bio_run_well_metrics=well_metrics_a1, + ) + ], + **common_run_attribs + ) + mlwhdb_test_session.add(simplex_run) + mlwhdb_test_session.commit() + + +@pytest.fixture(scope="function") +def multiplexed_run(mlwhdb_test_session): + "runs for several (2) samples in one run-well-plate" + + run_name = "RUN" + well_label = "B1" + plate_number = 1 + tag1 = "AAAAAAA" + + common_run_attribs = { + "recorded_at": datetime.now(), + "last_updated": datetime.now(), + "pipeline_id_lims": "nobody cares", + "cost_code": "probably ToL", + "id_lims": 1, + "plate_uuid_lims": "uuid1", + "well_uuid_lims": "uuid1", + "pac_bio_library_tube_id_lims": "id", + "pac_bio_library_tube_uuid": "uuid", + "pac_bio_library_tube_name": "bob", + } + + study = Study( + id_lims="id", + id_study_lims="1", + ) + + tag1 = "TTTTTTTT" + tag1_2 = "GGGGGGGG" + well_metrics_b1 = PacBioRunWellMetrics( + pac_bio_run_name=run_name, + well_label=well_label, + plate_number=plate_number, + instrument_type="Revio", + id_pac_bio_product=PacBioEntity( + run_name=run_name, + well_label=well_label, + plate_number=plate_number, + ).hash_product_id(), + hifi_num_reads=30, + ) + + multiplex_run_1 = PacBioRun( + pac_bio_run_name=run_name, + well_label=well_label, + plate_number=plate_number, + id_pac_bio_run_lims=1, + sample=Sample( + id_lims="pooled_id_1", + id_sample_lims="2", + ), + study=study, + plate_barcode="ABCD", + pac_bio_product_metrics=[ + PacBioProductMetrics( + id_pac_bio_product=PacBioEntity( + run_name=run_name, + well_label=well_label, + plate_number=plate_number, + tags=tag1, + ).hash_product_id(), + qc=1, + hifi_read_bases=900, + hifi_num_reads=20, + hifi_read_length_mean=45, + barcode_quality_score_mean=34, + hifi_read_quality_mean=35, + hifi_bases_percent=90.001, + pac_bio_run_well_metrics=well_metrics_b1, + ), + 
], + **common_run_attribs + ) + + multiplex_run_2 = PacBioRun( + pac_bio_run_name=run_name, + well_label=well_label, + plate_number=plate_number, + id_pac_bio_run_lims=2, + sample=Sample( + id_lims="pooled_id_2", + id_sample_lims="3", + ), + study=study, + plate_barcode="ABCD", + pac_bio_product_metrics=[ + PacBioProductMetrics( + id_pac_bio_product=PacBioEntity( + run_name=run_name, + well_label=well_label, + plate_number=plate_number, + tags=tag1_2, + ).hash_product_id(), + qc=1, + hifi_read_bases=100, + hifi_num_reads=10, + hifi_read_length_mean=10, + barcode_quality_score_mean=34, + hifi_read_quality_mean=35, + hifi_bases_percent=100.00, + pac_bio_run_well_metrics=well_metrics_b1, + ) + ], + **common_run_attribs + ) + + mlwhdb_test_session.add_all([multiplex_run_1, multiplex_run_2]) + mlwhdb_test_session.commit() + + +# Some runs use "default barcodes" and the tag1 fields in pac_bio_run are empty. When this is true, we also lose the deplex stats +# Show user "default" in the interface? +# Not all runs get any hifi stats in pac_bio_product_metrics. 
Not all runs use the hifi reads setting diff --git a/tests/test_pac_bio_qc_data_well.py b/tests/test_pac_bio_qc_data_well.py index 701cce85..32a07df1 100644 --- a/tests/test_pac_bio_qc_data_well.py +++ b/tests/test_pac_bio_qc_data_well.py @@ -2,6 +2,7 @@ from lang_qc.db.helper.wells import WellWh from lang_qc.models.pacbio.qc_data import QCDataWell +from tests.fixtures.sample_data import multiplexed_run, simplex_run def test_creating_qc_data_well(mlwhdb_test_session, mlwhdb_load_runs): @@ -98,3 +99,34 @@ def test_creating_qc_data_well(mlwhdb_test_session, mlwhdb_load_runs): assert ( qc.percentage_deplexed_reads["value"] == None ), "Absent metrics mean this is set to none" + + +def test_pool_metrics_from_single_sample_well(mlwhdb_test_session, simplex_run): + helper = WellWh(session=mlwhdb_test_session) + id = PacBioEntity(run_name="RUN", well_label="A1", plate_number=1).hash_product_id() + + metrics = helper.get_metrics_by_well_product_id(id) + assert metrics is None, "Got no metrics for a one-sample well" + + +def test_pool_metrics_from_well(mlwhdb_test_session, multiplexed_run): + helper = WellWh(session=mlwhdb_test_session) + id = PacBioEntity(run_name="RUN", well_label="B1", plate_number=1).hash_product_id() + metrics = helper.get_metrics_by_well_product_id(id) + + assert metrics, "Two samples means we get a metrics response" + assert ( + int(metrics.pool_coeff_of_variance) == 47 + ), "Variance between 20 and 10 is ~47%" + + assert metrics.products[0].hifi_read_bases == 100 + assert ( + metrics.products[1].hifi_read_bases == 900 + ), "hifi read base counts are faithfully copied" + + assert ( + int(metrics.products[0].percentage_total_reads) == 33 + ), "10 of 30 reads is 33.3%" + assert ( + int(metrics.products[1].percentage_total_reads) == 66 + ), "20 of 30 reads is 66.6%" From b092e194cb56e45d0c02076e147ff46a5ae09ac0 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Tue, 14 May 2024 16:23:40 +0000 Subject: [PATCH 06/46] fixture parametrisation not quite right 
--- tests/fixtures/sample_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/fixtures/sample_data.py b/tests/fixtures/sample_data.py index 15ed2242..c359b26d 100644 --- a/tests/fixtures/sample_data.py +++ b/tests/fixtures/sample_data.py @@ -13,7 +13,7 @@ @pytest.fixture(scope="function", params=["AAAAAAAA", None]) -def simplex_run(mlwhdb_test_session): +def simplex_run(request, mlwhdb_test_session): """ A single sample, well, run mlwh fixture that provides both an explicit tag1 for the sample, and an implicit default tag (when the PacBio instrument is @@ -22,7 +22,7 @@ def simplex_run(mlwhdb_test_session): run_name = "RUN" well_label = "A1" plate_number = 1 - tag1 = mlwhdb_test_session.param + tag1 = request.param common_run_attribs = { "recorded_at": datetime.now(), From 4382aa72462736efd1dba00e97f0d9cfb2bec814 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Fri, 17 May 2024 13:36:50 +0000 Subject: [PATCH 07/46] An (untested) endpoint for fetching pool stats --- lang_qc/endpoints/pacbio_well.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/lang_qc/endpoints/pacbio_well.py b/lang_qc/endpoints/pacbio_well.py index 27534f6a..f2324883 100644 --- a/lang_qc/endpoints/pacbio_well.py +++ b/lang_qc/endpoints/pacbio_well.py @@ -47,7 +47,10 @@ InvalidDictValueError, RunNotFoundError, ) -from lang_qc.util.type_checksum import ChecksumSHA256, PacBioWellSHA256, PacBioProductSHA256 +from lang_qc.util.type_checksum import ( + ChecksumSHA256, + PacBioWellSHA256, +) """ A collection of API endpoints that are specific to the PacBio sequencing @@ -186,6 +189,28 @@ def get_seq_metrics( return PacBioWellFull(db_well=mlwh_well, qc_state=qc_state) +@router.get( + "/products/{id_product}/seq_level/pool", + summary="Get sample (deplexing) metrics for a multiplexed well product by the well ID", + responses={ + status.HTTP_404_NOT_FOUND: {"description": "Product not found"}, + 
status.HTTP_422_UNPROCESSABLE_ENTITY: {"description": "Invalid product ID"}, + }, + response_model=QCPoolMetrics, +) +def get_product_metrics( + id_product: PacBioWellSHA256, mlwhdb_session: Session = Depends(get_mlwh_db) +) -> QCPoolMetrics: + metrics = WellWh(mlwh_session=mlwhdb_session).get_metrics_by_well_product_id( + id_product + ) + if metrics is None: + raise HTTPException( + status_code=404, detail="Well does not have any pool metrics" + ) + return metrics + + @router.post( "/products/{id_product}/qc_claim", summary="Claim the well to start QC", From 7f329ffe60f2bc0b1ce07ee79b899fa4980715ae Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Mon, 20 May 2024 15:26:13 +0000 Subject: [PATCH 08/46] Make pool fixture self-cleaning --- tests/fixtures/sample_data.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/fixtures/sample_data.py b/tests/fixtures/sample_data.py index c359b26d..7891a0dd 100644 --- a/tests/fixtures/sample_data.py +++ b/tests/fixtures/sample_data.py @@ -12,7 +12,7 @@ ) -@pytest.fixture(scope="function", params=["AAAAAAAA", None]) +@pytest.fixture(scope="module", params=["AAAAAAAA", None]) def simplex_run(request, mlwhdb_test_session): """ A single sample, well, run mlwh fixture that provides both an explicit tag1 @@ -88,6 +88,10 @@ def simplex_run(request, mlwhdb_test_session): ) mlwhdb_test_session.add(simplex_run) mlwhdb_test_session.commit() + yield simplex_run + mlwhdb_test_session.delete(simplex_run) + mlwhdb_test_session.delete(study) + mlwhdb_test_session.commit() @pytest.fixture(scope="function") @@ -198,6 +202,11 @@ def multiplexed_run(mlwhdb_test_session): mlwhdb_test_session.add_all([multiplex_run_1, multiplex_run_2]) mlwhdb_test_session.commit() + yield (multiplex_run_1, multiplex_run_2) + mlwhdb_test_session.delete(multiplex_run_1) + mlwhdb_test_session.delete(multiplex_run_2) + mlwhdb_test_session.delete(study) + mlwhdb_test_session.commit() # Some runs use "default barcodes" and the tag1 
fields in pac_bio_run are empty. When this is true, we also lose the deplex stats From 3c9b9bb275a0f7785fe968e3bf7fea274675a12e Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Mon, 20 May 2024 17:00:45 +0000 Subject: [PATCH 09/46] Add metrics from mlwh to a multi-sample well, and test pool API endpoint Black rides again --- lang_qc/util/type_checksum.py | 4 +- .../mlwh_pb_runs/300-PacBioProductMetrics.yml | 40 +++++++++---------- .../endpoints/test_single_well_qc_details.py | 18 +++++++++ 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/lang_qc/util/type_checksum.py b/lang_qc/util/type_checksum.py index 15b8bdcd..4a72dce6 100644 --- a/lang_qc/util/type_checksum.py +++ b/lang_qc/util/type_checksum.py @@ -46,6 +46,7 @@ class PacBioWellSHA256(ChecksumSHA256): """ A checksum generated from the coordinates of a single well on a plate in a PacBio run """ + pass @@ -55,4 +56,5 @@ class PacBioProductSHA256(ChecksumSHA256): See `npg_id_generation.pac_bio.PacBioEntity`. Tags only contribute to the checksum when samples are multiplexed. 
""" - pass \ No newline at end of file + + pass diff --git a/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml b/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml index 0b6de2e2..6485990f 100644 --- a/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml +++ b/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml @@ -255,11 +255,11 @@ id_pac_bio_tmp: 120632 last_changed: 2024-02-28 11:10:15 qc: 1 -- barcode_quality_score_mean: ~ - hifi_bases_percent: ~ - hifi_num_reads: ~ - hifi_read_bases: ~ - hifi_read_length_mean: ~ +- barcode_quality_score_mean: 97 + hifi_bases_percent: 27.49 + hifi_num_reads: 1952224 + hifi_read_bases: 21504288522 + hifi_read_length_mean: 11015 hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30023 id_pac_bio_product: 74af5a311e15af654336aea65826a2c4974842d752e25875b0303ad5a3556167 @@ -267,11 +267,11 @@ id_pac_bio_tmp: 120633 last_changed: 2024-02-28 11:10:15 qc: 1 -- barcode_quality_score_mean: ~ - hifi_bases_percent: ~ - hifi_num_reads: ~ - hifi_read_bases: ~ - hifi_read_length_mean: ~ +- barcode_quality_score_mean: 96 + hifi_bases_percent: 19.62 + hifi_num_reads: 1139885 + hifi_read_bases: 15344650012 + hifi_read_length_mean: 13461 hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30024 id_pac_bio_product: 11022006a649937c570d100ccb382dddadf9a7174ee303903c8d2b7cd7efb328 @@ -279,11 +279,11 @@ id_pac_bio_tmp: 120634 last_changed: 2024-02-28 11:10:15 qc: 1 -- barcode_quality_score_mean: ~ - hifi_bases_percent: ~ - hifi_num_reads: ~ - hifi_read_bases: ~ - hifi_read_length_mean: ~ +- barcode_quality_score_mean: 96 + hifi_bases_percent: 23.7 + hifi_num_reads: 1751410 + hifi_read_bases: 18538781061 + hifi_read_length_mean: 10585 hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30025 id_pac_bio_product: e6a2157d0fda8faae1288025e99ce5f8133f1466b752a67809668e5b9b16d5b1 @@ -291,11 +291,11 @@ id_pac_bio_tmp: 120635 last_changed: 2024-02-28 11:10:15 qc: 1 -- barcode_quality_score_mean: ~ - hifi_bases_percent: ~ - hifi_num_reads: ~ - 
hifi_read_bases: ~ - hifi_read_length_mean: ~ +- barcode_quality_score_mean: 97 + hifi_bases_percent: 28.72 + hifi_num_reads: 1991282 + hifi_read_bases: 22462478066 + hifi_read_length_mean: 11280 hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30026 id_pac_bio_product: 9840280d97c98ff3ddda36ac95cf3b87f5810cc3be73a64c27d6ab92cfaab0ac diff --git a/tests/endpoints/test_single_well_qc_details.py b/tests/endpoints/test_single_well_qc_details.py index 7931e425..b9b8d62d 100644 --- a/tests/endpoints/test_single_well_qc_details.py +++ b/tests/endpoints/test_single_well_qc_details.py @@ -1,4 +1,5 @@ from fastapi.testclient import TestClient +from npg_id_generation.pac_bio import PacBioEntity from tests.fixtures.well_data import load_data4well_retrieval, load_dicts_and_users @@ -165,3 +166,20 @@ def test_get_well_info( assert result["plate_number"] == 2 assert result["id_product"] == id_product assert result["qc_state"] is None + + +def test_get_pool_info(test_client: TestClient, mlwhdb_load_runs): + id_product = PacBioEntity( + run_name="TRACTION-RUN-1140", well_label="D1", plate_number=1 + ).hash_product_id() + response = test_client.get(f"/pacbio/products/{id_product}/seq_level/pool") + assert response.status_code == 200 + + data = response.json() + assert int(data["pool_coeff_of_variance"]) == 23, "variance is calculated" + assert {prod["tag1_name"] for prod in data["products"]} == { + "bc2036", + "bc2040", + "bc2054", + "bc2063", + }, "Correct products present" From 7a55cc6e3426f48567a0d247de1afac4430ee9d3 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Thu, 23 May 2024 14:54:01 +0000 Subject: [PATCH 10/46] parameterised fixture triggers unique condition in DB, so make more dynamic shorten some lines for flake8 --- lang_qc/models/pacbio/qc_data.py | 2 +- lang_qc/util/type_checksum.py | 4 +- tests/fixtures/sample_data.py | 119 +++++++++++++++-------------- tests/test_pac_bio_qc_data_well.py | 6 +- 4 files changed, 70 insertions(+), 61 deletions(-) diff --git 
a/lang_qc/models/pacbio/qc_data.py b/lang_qc/models/pacbio/qc_data.py index 8f64b842..259a1785 100644 --- a/lang_qc/models/pacbio/qc_data.py +++ b/lang_qc/models/pacbio/qc_data.py @@ -177,7 +177,7 @@ class SampleDeplexingStats(BaseModel): class QCPoolMetrics(BaseModel): pool_coeff_of_variance: float | None = Field( title="Coefficient of variance for reads in the pool", - description="Percentage of the standard deviation w.r.t. mean, reported when the pool is larger than one", + description="Percentage of the standard deviation w.r.t. mean, when pool is more than one", ) products: list[SampleDeplexingStats] = Field( title="List of products and their metrics" diff --git a/lang_qc/util/type_checksum.py b/lang_qc/util/type_checksum.py index 4a72dce6..c78b0997 100644 --- a/lang_qc/util/type_checksum.py +++ b/lang_qc/util/type_checksum.py @@ -52,8 +52,8 @@ class PacBioWellSHA256(ChecksumSHA256): class PacBioProductSHA256(ChecksumSHA256): """ - A checksum generated from the combination of run, well, plate and any tags required for deplexing - See `npg_id_generation.pac_bio.PacBioEntity`. + A checksum generated from the combination of run, well, plate and any tags required for + deplexing, see `npg_id_generation.pac_bio.PacBioEntity`. Tags only contribute to the checksum when samples are multiplexed. 
""" diff --git a/tests/fixtures/sample_data.py b/tests/fixtures/sample_data.py index 7891a0dd..dd267805 100644 --- a/tests/fixtures/sample_data.py +++ b/tests/fixtures/sample_data.py @@ -19,7 +19,8 @@ def simplex_run(request, mlwhdb_test_session): for the sample, and an implicit default tag (when the PacBio instrument is run with default barcodes) """ - run_name = "RUN" + run_name = "RUN-9999" + run_name += request.param if request.param else "" well_label = "A1" plate_number = 1 tag1 = request.param @@ -49,6 +50,23 @@ def simplex_run(request, mlwhdb_test_session): ).hash_product_id(), ) + product = PacBioProductMetrics( + id_pac_bio_product=PacBioEntity( + run_name=run_name, + well_label=well_label, + plate_number=plate_number, + tags=tag1, + ).hash_product_id(), + qc=1, + hifi_read_bases=900, + hifi_num_reads=10, + hifi_read_length_mean=90, + barcode_quality_score_mean=34, + hifi_read_quality_mean=35, + hifi_bases_percent=90.001, + pac_bio_run_well_metrics=well_metrics_a1, + ) + study = Study( id_lims="id", id_study_lims="1", @@ -62,28 +80,11 @@ def simplex_run(request, mlwhdb_test_session): id_pac_bio_run_lims=0, sample=Sample( id_lims="id", - id_sample_lims="1", + id_sample_lims=request.param or "1", ), study=study, plate_barcode="ABCD", - pac_bio_product_metrics=[ - PacBioProductMetrics( - id_pac_bio_product=PacBioEntity( - run_name=run_name, - well_label=well_label, - plate_number=plate_number, - tags=tag1, - ).hash_product_id(), - qc=1, - hifi_read_bases=900, - hifi_num_reads=10, - hifi_read_length_mean=90, - barcode_quality_score_mean=34, - hifi_read_quality_mean=35, - hifi_bases_percent=90.001, - pac_bio_run_well_metrics=well_metrics_a1, - ) - ], + pac_bio_product_metrics=[product], **common_run_attribs ) mlwhdb_test_session.add(simplex_run) @@ -91,6 +92,8 @@ def simplex_run(request, mlwhdb_test_session): yield simplex_run mlwhdb_test_session.delete(simplex_run) mlwhdb_test_session.delete(study) + mlwhdb_test_session.delete(product) + 
mlwhdb_test_session.delete(well_metrics_a1) mlwhdb_test_session.commit() @@ -101,7 +104,6 @@ def multiplexed_run(mlwhdb_test_session): run_name = "RUN" well_label = "B1" plate_number = 1 - tag1 = "AAAAAAA" common_run_attribs = { "recorded_at": datetime.now(), @@ -136,6 +138,23 @@ def multiplexed_run(mlwhdb_test_session): hifi_num_reads=30, ) + product_1 = PacBioProductMetrics( + id_pac_bio_product=PacBioEntity( + run_name=run_name, + well_label=well_label, + plate_number=plate_number, + tags=tag1, + ).hash_product_id(), + qc=1, + hifi_read_bases=900, + hifi_num_reads=20, + hifi_read_length_mean=45, + barcode_quality_score_mean=34, + hifi_read_quality_mean=35, + hifi_bases_percent=90.001, + pac_bio_run_well_metrics=well_metrics_b1, + ) + multiplex_run_1 = PacBioRun( pac_bio_run_name=run_name, well_label=well_label, @@ -147,27 +166,27 @@ def multiplexed_run(mlwhdb_test_session): ), study=study, plate_barcode="ABCD", - pac_bio_product_metrics=[ - PacBioProductMetrics( - id_pac_bio_product=PacBioEntity( - run_name=run_name, - well_label=well_label, - plate_number=plate_number, - tags=tag1, - ).hash_product_id(), - qc=1, - hifi_read_bases=900, - hifi_num_reads=20, - hifi_read_length_mean=45, - barcode_quality_score_mean=34, - hifi_read_quality_mean=35, - hifi_bases_percent=90.001, - pac_bio_run_well_metrics=well_metrics_b1, - ), - ], + pac_bio_product_metrics=[product_1], **common_run_attribs ) + product_2 = PacBioProductMetrics( + id_pac_bio_product=PacBioEntity( + run_name=run_name, + well_label=well_label, + plate_number=plate_number, + tags=tag1_2, + ).hash_product_id(), + qc=1, + hifi_read_bases=100, + hifi_num_reads=10, + hifi_read_length_mean=10, + barcode_quality_score_mean=34, + hifi_read_quality_mean=35, + hifi_bases_percent=100.00, + pac_bio_run_well_metrics=well_metrics_b1, + ) + multiplex_run_2 = PacBioRun( pac_bio_run_name=run_name, well_label=well_label, @@ -179,24 +198,7 @@ def multiplexed_run(mlwhdb_test_session): ), study=study, plate_barcode="ABCD", - 
pac_bio_product_metrics=[ - PacBioProductMetrics( - id_pac_bio_product=PacBioEntity( - run_name=run_name, - well_label=well_label, - plate_number=plate_number, - tags=tag1_2, - ).hash_product_id(), - qc=1, - hifi_read_bases=100, - hifi_num_reads=10, - hifi_read_length_mean=10, - barcode_quality_score_mean=34, - hifi_read_quality_mean=35, - hifi_bases_percent=100.00, - pac_bio_run_well_metrics=well_metrics_b1, - ) - ], + pac_bio_product_metrics=[product_2], **common_run_attribs ) @@ -206,6 +208,9 @@ def multiplexed_run(mlwhdb_test_session): mlwhdb_test_session.delete(multiplex_run_1) mlwhdb_test_session.delete(multiplex_run_2) mlwhdb_test_session.delete(study) + mlwhdb_test_session.delete(well_metrics_b1) + mlwhdb_test_session.delete(product_1) + mlwhdb_test_session.delete(product_2) mlwhdb_test_session.commit() diff --git a/tests/test_pac_bio_qc_data_well.py b/tests/test_pac_bio_qc_data_well.py index 32a07df1..bd4318a0 100644 --- a/tests/test_pac_bio_qc_data_well.py +++ b/tests/test_pac_bio_qc_data_well.py @@ -103,7 +103,11 @@ def test_creating_qc_data_well(mlwhdb_test_session, mlwhdb_load_runs): def test_pool_metrics_from_single_sample_well(mlwhdb_test_session, simplex_run): helper = WellWh(session=mlwhdb_test_session) - id = PacBioEntity(run_name="RUN", well_label="A1", plate_number=1).hash_product_id() + id = PacBioEntity( + run_name=simplex_run.pac_bio_run_name, + well_label=simplex_run.well_label, + plate_number=simplex_run.plate_number, + ).hash_product_id() metrics = helper.get_metrics_by_well_product_id(id) assert metrics is None, "Got no metrics for a one-sample well" From 686481bc11762f889ae98a967bfde15adccf35f2 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Thu, 23 May 2024 15:00:13 +0000 Subject: [PATCH 11/46] Stop fixture polluting other tests in module import sort --- lang_qc/endpoints/pacbio_well.py | 7 ++----- tests/fixtures/sample_data.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/lang_qc/endpoints/pacbio_well.py 
b/lang_qc/endpoints/pacbio_well.py index f2324883..7e157246 100644 --- a/lang_qc/endpoints/pacbio_well.py +++ b/lang_qc/endpoints/pacbio_well.py @@ -37,20 +37,17 @@ from lang_qc.db.mlwh_connection import get_mlwh_db from lang_qc.db.qc_connection import get_qc_db from lang_qc.db.qc_schema import User +from lang_qc.models.pacbio.qc_data import QCPoolMetrics from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellFull from lang_qc.models.qc_flow_status import QcFlowStatusEnum from lang_qc.models.qc_state import QcState, QcStateBasic -from lang_qc.models.pacbio.qc_data import QCPoolMetrics from lang_qc.util.auth import check_user from lang_qc.util.errors import ( InconsistentInputError, InvalidDictValueError, RunNotFoundError, ) -from lang_qc.util.type_checksum import ( - ChecksumSHA256, - PacBioWellSHA256, -) +from lang_qc.util.type_checksum import ChecksumSHA256, PacBioWellSHA256 """ A collection of API endpoints that are specific to the PacBio sequencing diff --git a/tests/fixtures/sample_data.py b/tests/fixtures/sample_data.py index dd267805..8a9b64eb 100644 --- a/tests/fixtures/sample_data.py +++ b/tests/fixtures/sample_data.py @@ -12,7 +12,7 @@ ) -@pytest.fixture(scope="module", params=["AAAAAAAA", None]) +@pytest.fixture(scope="function", params=["AAAAAAAA", None]) def simplex_run(request, mlwhdb_test_session): """ A single sample, well, run mlwh fixture that provides both an explicit tag1 From 2b6ae77d84f9d8386006d418355c740e69181b7f Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Thu, 23 May 2024 16:18:07 +0000 Subject: [PATCH 12/46] Data not needed for defunct mlwh column --- .../mlwh_pb_runs/300-PacBioProductMetrics.yml | 24 ------------------- tests/fixtures/sample_data.py | 3 --- 2 files changed, 27 deletions(-) diff --git a/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml b/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml index 6485990f..f46e1d13 100644 --- a/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml +++ 
b/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml @@ -176,7 +176,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30016 id_pac_bio_product: 3b37d8c1a317f229a3aae182f160f8e4f4856607fb15f1ab0588dde66640afda id_pac_bio_rw_metrics_tmp: 6206 @@ -188,7 +187,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30017 id_pac_bio_product: 2b9048414306eb7683056bd91f6ec81f0b2dbf69484b3dd2dbe39932b52bedbb id_pac_bio_rw_metrics_tmp: 6206 @@ -200,7 +198,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30018 id_pac_bio_product: f50319c97e28f2e0a67ebbc736080c4e98f23cdf6e5b7cec964349ffb13ae797 id_pac_bio_rw_metrics_tmp: 6207 @@ -212,7 +209,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30019 id_pac_bio_product: 080733cab28898fcd69d1a418c7675cba38a548c9c20ac2da48a84c5658ee6b2 id_pac_bio_rw_metrics_tmp: 6207 @@ -224,7 +220,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30020 id_pac_bio_product: 14be4b6a6bb857c0967d56c90d2b57edc1401cdb5f95379312fb8e5ca71e09fa id_pac_bio_rw_metrics_tmp: 6207 @@ -236,7 +231,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30021 id_pac_bio_product: 4153f3a64e39588bf626c4dda42e5ee74b424bba67d69bb74bb029adda2e642c id_pac_bio_rw_metrics_tmp: 6208 @@ -248,7 +242,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30022 id_pac_bio_product: fbbcd5cac5d086ce64b3a37646e261b4c784fce6755fd65d6d41f048d2267c61 id_pac_bio_rw_metrics_tmp: 6208 @@ -260,7 +253,6 @@ hifi_num_reads: 1952224 hifi_read_bases: 21504288522 hifi_read_length_mean: 11015 - 
hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30023 id_pac_bio_product: 74af5a311e15af654336aea65826a2c4974842d752e25875b0303ad5a3556167 id_pac_bio_rw_metrics_tmp: 6209 @@ -272,7 +264,6 @@ hifi_num_reads: 1139885 hifi_read_bases: 15344650012 hifi_read_length_mean: 13461 - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30024 id_pac_bio_product: 11022006a649937c570d100ccb382dddadf9a7174ee303903c8d2b7cd7efb328 id_pac_bio_rw_metrics_tmp: 6209 @@ -284,7 +275,6 @@ hifi_num_reads: 1751410 hifi_read_bases: 18538781061 hifi_read_length_mean: 10585 - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30025 id_pac_bio_product: e6a2157d0fda8faae1288025e99ce5f8133f1466b752a67809668e5b9b16d5b1 id_pac_bio_rw_metrics_tmp: 6209 @@ -296,7 +286,6 @@ hifi_num_reads: 1991282 hifi_read_bases: 22462478066 hifi_read_length_mean: 11280 - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30026 id_pac_bio_product: 9840280d97c98ff3ddda36ac95cf3b87f5810cc3be73a64c27d6ab92cfaab0ac id_pac_bio_rw_metrics_tmp: 6209 @@ -308,7 +297,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30027 id_pac_bio_product: 81141cdff1f57c0fc0fc5f88856fa7c6d2945acc5fa6e53e7d1214d17a00c410 id_pac_bio_rw_metrics_tmp: 6210 @@ -320,7 +308,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30028 id_pac_bio_product: 4145bf889c130ecaadcd4d757d0a3ca98d68629556427a27ebc08840ffdd0e0f id_pac_bio_rw_metrics_tmp: 6210 @@ -332,7 +319,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30029 id_pac_bio_product: 5b99ad09c31afd4917da39d44fc6cc40e1915572e80c20acbfda6d6c031e74c5 id_pac_bio_rw_metrics_tmp: 6211 @@ -344,7 +330,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30030 id_pac_bio_product: 
0152d7945c4f74fac3ff828012ad2c01a95574df213d7664e7989e1039727cb5 id_pac_bio_rw_metrics_tmp: 6211 @@ -356,7 +341,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30031 id_pac_bio_product: 110e4562a6d28dd96973a98fcc1464d6c82dc413296b95d0c71727d21fa2a193 id_pac_bio_rw_metrics_tmp: 6212 @@ -368,7 +352,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30032 id_pac_bio_product: af65875cfecca04ee585c67525661f57a07d7f1427aa15ca39e158c791d63aa5 id_pac_bio_rw_metrics_tmp: 6212 @@ -380,7 +363,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30033 id_pac_bio_product: c24d50afb4c048f38dca230a03fb4880912713adf7db7a3ec4d5f57ee3c4cdec id_pac_bio_rw_metrics_tmp: 6212 @@ -392,7 +374,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30034 id_pac_bio_product: baa1e87601ca9c16d95b7fda9d9346557de4aaf4adb5c15383d0f8d9366692bf id_pac_bio_rw_metrics_tmp: 6213 @@ -404,7 +385,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30035 id_pac_bio_product: f88bcfb888f075442a005368c070ba83d895b07c013c68e1cb292fce4aaa40f2 id_pac_bio_rw_metrics_tmp: 6213 @@ -416,7 +396,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30036 id_pac_bio_product: 61d2c6fc72d593949cf7b60812a0076c9af57b0fa71b394f0669e410e040458e id_pac_bio_rw_metrics_tmp: 6213 @@ -428,7 +407,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30037 id_pac_bio_product: 252c8d3dc0b4c81e6d7359b0808ba962013e7b320eb9b979da526cecf5fdd019 id_pac_bio_rw_metrics_tmp: 6213 @@ -440,7 +418,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ 
- hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30153 id_pac_bio_product: 2135bf0b32c6b987042e67e062647aa21ac956c1d3385627b7a1d4cd670c355f id_pac_bio_rw_metrics_tmp: 6306 @@ -452,7 +429,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30154 id_pac_bio_product: 790e8882c97615d79ebe27b782eefa87eede2cecda8ebd960cdd88300059f196 id_pac_bio_rw_metrics_tmp: 6307 diff --git a/tests/fixtures/sample_data.py b/tests/fixtures/sample_data.py index 8a9b64eb..818142fc 100644 --- a/tests/fixtures/sample_data.py +++ b/tests/fixtures/sample_data.py @@ -62,7 +62,6 @@ def simplex_run(request, mlwhdb_test_session): hifi_num_reads=10, hifi_read_length_mean=90, barcode_quality_score_mean=34, - hifi_read_quality_mean=35, hifi_bases_percent=90.001, pac_bio_run_well_metrics=well_metrics_a1, ) @@ -150,7 +149,6 @@ def multiplexed_run(mlwhdb_test_session): hifi_num_reads=20, hifi_read_length_mean=45, barcode_quality_score_mean=34, - hifi_read_quality_mean=35, hifi_bases_percent=90.001, pac_bio_run_well_metrics=well_metrics_b1, ) @@ -182,7 +180,6 @@ def multiplexed_run(mlwhdb_test_session): hifi_num_reads=10, hifi_read_length_mean=10, barcode_quality_score_mean=34, - hifi_read_quality_mean=35, hifi_bases_percent=100.00, pac_bio_run_well_metrics=well_metrics_b1, ) From 5c77509fa5be9f2c542a2c561d5eb420b3ccbb4b Mon Sep 17 00:00:00 2001 From: mgcam Date: Sat, 8 Jun 2024 23:53:34 +0100 Subject: [PATCH 13/46] Added a model for PB library LIMS data. Reimplemented the PacBioExperiment class, removed from_orm method, replaced it by a pre-init hook. 
--- lang_qc/models/pacbio/experiment.py | 160 +++++++++++++++++++++------- lang_qc/models/pacbio/well.py | 4 +- tests/test_pac_bio_experiment.py | 47 +++++--- 3 files changed, 155 insertions(+), 56 deletions(-) diff --git a/lang_qc/models/pacbio/experiment.py b/lang_qc/models/pacbio/experiment.py index 14eeb7cd..76bf598a 100644 --- a/lang_qc/models/pacbio/experiment.py +++ b/lang_qc/models/pacbio/experiment.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 Genome Research Ltd. +# Copyright (c) 2023, 2024 Genome Research Ltd. # # Authors: # Marina Gourtovaia @@ -19,14 +19,96 @@ # You should have received a copy of the GNU General Public License along with # this program. If not, see . -from typing import List +from typing import Any -from pydantic import BaseModel, ConfigDict, Field +from pydantic import Field, model_validator +from pydantic.dataclasses import dataclass from lang_qc.db.mlwh_schema import PacBioRun -class PacBioExperiment(BaseModel): +@dataclass(kw_only=True, frozen=True) +class PacBioLibrary: + """ + This model represents LIMS data associated with a PacBio library. + + The fields of the model can be assigned directly via the constructor. + However, if the `db_library` field, a single row of the PacBioRun table + class, is set via the constructor, the rest of the fields are populated + using this database row object, while any other information passed to the + constructor is disregarded. + + The `db_library` field is not present in the model instance that is + returned by the constructor. + """ + + db_library: PacBioRun = Field(init_var=True) + + study_id: str = Field( + title="LIMS-specific study identifier", + ) + study_name: str = Field( + title="Study name", + ) + sample_id: str = Field( + title="LIMS-specific Sample identifier", + ) + sample_name: str = Field( + title="Sample name", + ) + tag_sequence: list = Field( + title="Tag sequence", + description=""" + Tag sequences as a list. An empty list for a non-indexed library. 
+ """, + ) + library_type: str | None = Field( + default=None, + title="Library type", + ) + pool_name: str | None = Field( + default=None, + title="Pool name", + description=""" + The pac_bio_library_tube_barcode from TRACTION, AKA pool name + """, + ) + + @model_validator(mode="before") + def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: + """ + Populates the fields of this object with information available + in the LIMS system. Errors if the `db_library` attribute is not + set via the constructor. + """ + + # https://github.com/pydantic/pydantic-core/blob/main/python/pydantic_core/_pydantic_core.pyi + if "db_library" not in values.kwargs: + return values.kwargs + db_row: PacBioRun = values.kwargs["db_library"] + if db_row is None: + raise ValueError("None db_library value is not allowed.") + + assigned = dict() + study = db_row.study + assigned["study_name"] = study.name + assigned["study_id"] = study.id_study_lims + sample = db_row.sample + assigned["sample_name"] = sample.name + assigned["sample_id"] = sample.id_sample_lims + assigned["library_type"] = db_row.pipeline_id_lims + assigned["pool_name"] = db_row.pac_bio_library_tube_barcode + assigned["tag_sequence"] = [] + if tag := db_row.tag_sequence: + assigned["tag_sequence"].append(tag) + if tag := db_row.tag2_sequence: + assigned["tag_sequence"].append(tag) + + return assigned + + +@dataclass(kw_only=True, frozen=True) +class PacBioExperiment: """ A response model that contains laboratory tracking information about the PacBio wells and samples prior to the start of the @@ -43,6 +125,8 @@ class PacBioExperiment(BaseModel): (library). """ + db_libraries: list[PacBioRun] = Field(init_var=True) + study_id: list = Field( title="Study identifier", description=""" @@ -50,21 +134,21 @@ class PacBioExperiment(BaseModel): an unlikely case of multiple studies). 
""", ) - study_name: str = Field( + study_name: str | None = Field( default=None, title="Study name", description=""" Study name, is not set in case of multiple studies. """, ) - sample_id: str = Field( + sample_id: str | None = Field( default=None, title="Sample identifier", description=""" Sample identifier, is not set in case of multiple samples. """, ) - sample_name: str = Field( + sample_name: str | None = Field( default=None, title="Sample name", description=""" @@ -94,59 +178,57 @@ class PacBioExperiment(BaseModel): unlikely case of multiple library types. """, ) - pool_name: str = Field( + pool_name: str | None = Field( default=None, title="Pool name", description=""" The pac_bio_library_tube_barcode from TRACTION, AKA pool name """, ) - model_config = ConfigDict(from_attributes=True, extra="forbid") - @classmethod - def from_orm(cls, lims_db_rows: List[PacBioRun]): + @model_validator(mode="before") + def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: """ - A factory method, creates an instance of the PacBioLimsData class. - Should be given a non-empty list of PacBioRun table row objects as - an argument. + Populates the fields of this object with information available + in the LIMS system. + Errors if the `db_libraries` attribute is not set via the constructor. """ + lims_db_rows: list[PacBioRun] = values.kwargs["db_libraries"] num_samples = len(lims_db_rows) if num_samples == 0: - raise Exception("Cannot create PacBioLimsData object, no data.") - if any(row is None for row in lims_db_rows): - raise Exception("Cannot create PacBioLimsData object, None row.") + raise ValueError("Empty db_libraries list is not allowed.") + + lib_objects = [PacBioLibrary(db_library=row) for row in lims_db_rows] - # Using sets for some data instead of lists because we do not - # want repetitions. 
lims_data = { "num_samples": num_samples, - "study_id": set(), - "library_type": set(), "tag_sequence": [], } - study_name = None - for row in lims_db_rows: - lims_data["study_id"].add(row.study.id_study_lims) - lims_data["library_type"].add(row.pipeline_id_lims) - study_name = row.study.name - if pool_name := row.pac_bio_library_tube_barcode: - lims_data["pool_name"] = pool_name - if num_samples == 1: - if tag := row.tag_sequence: - lims_data["tag_sequence"].append(tag) - if tag := row.tag2_sequence: - lims_data["tag_sequence"].append(tag) - lims_data["sample_id"] = row.sample.id_sample_lims - lims_data["sample_name"] = row.sample.name - lims_data["study_name"] = row.study.name + lims_data["study_id"] = {o.study_id for o in lib_objects} # returns a set + lims_data["library_type"] = { + o.library_type if o.library_type is not None else "UNKNOWN" + for o in lib_objects + } + + pool_names = {o.pool_name for o in lib_objects} + if len(pool_names) > 1: + raise ValueError("Multiple pool names.") + lims_data["pool_name"] = pool_names.pop() + + o = lib_objects[0] + if num_samples == 1: + lims_data["tag_sequence"] = o.tag_sequence + lims_data["sample_id"] = o.sample_id + lims_data["sample_name"] = o.sample_name + lims_data["study_name"] = o.study_name if len(lims_data["study_id"]) == 1: - lims_data["study_name"] = study_name + lims_data["study_name"] = o.study_name - # Convert sets back to lists and sort so that the list items are + # Convert sets back to lists and sort so that the items are # in a predictable order. 
for key in ("library_type", "study_id"): lims_data[key] = sorted(lims_data[key]) - return cls.model_validate(lims_data) + return lims_data diff --git a/lang_qc/models/pacbio/well.py b/lang_qc/models/pacbio/well.py index d2047a59..d0d34cbc 100644 --- a/lang_qc/models/pacbio/well.py +++ b/lang_qc/models/pacbio/well.py @@ -223,6 +223,8 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: assigned["metrics"] = QCDataWell.from_orm(mlwh_db_row) experiment_info = mlwh_db_row.get_experiment_info() if len(experiment_info): - assigned["experiment_tracking"] = PacBioExperiment.from_orm(experiment_info) + assigned["experiment_tracking"] = PacBioExperiment( + db_libraries=experiment_info + ) return assigned diff --git a/tests/test_pac_bio_experiment.py b/tests/test_pac_bio_experiment.py index ff979541..775693c5 100644 --- a/tests/test_pac_bio_experiment.py +++ b/tests/test_pac_bio_experiment.py @@ -2,7 +2,19 @@ from sqlalchemy import select from lang_qc.db.mlwh_schema import PacBioRun -from lang_qc.models.pacbio.experiment import PacBioExperiment +from lang_qc.models.pacbio.experiment import PacBioExperiment, PacBioLibrary + + +def test_creating_library_object(mlwhdb_test_session, mlwhdb_load_runs): + + l = PacBioLibrary( + study_id="1", + sample_id="1", + study_name="st_name", + sample_name="sa_name", + tag_sequence=[], + ) + assert l.study_id == "1" def test_creating_experiment_object(mlwhdb_test_session, mlwhdb_load_runs): @@ -17,7 +29,13 @@ def test_creating_experiment_object(mlwhdb_test_session, mlwhdb_load_runs): ) well_row = mlwhdb_test_session.execute(query).scalars().one() - lims = PacBioExperiment.from_orm([well_row]) + with pytest.raises(Exception, match=r"Empty db_libraries list is not allowed."): + PacBioExperiment(db_libraries=[]) + + with pytest.raises(ValueError, match=r"None db_library value is not allowed."): + PacBioExperiment(db_libraries=[well_row, None]) + + lims = PacBioExperiment(db_libraries=[well_row]) assert lims.num_samples == 1 assert 
lims.study_id == ["6457"] assert lims.study_name == "Tree of Life - ASG" @@ -34,7 +52,7 @@ def test_creating_experiment_object(mlwhdb_test_session, mlwhdb_load_runs): ) well_row = mlwhdb_test_session.execute(query).scalars().one() - lims = PacBioExperiment.from_orm([well_row]) + lims = PacBioExperiment(db_libraries=[well_row]) assert lims.num_samples == 1 assert lims.study_id == ["5901"] assert lims.study_name == "DTOL_Darwin Tree of Life" @@ -51,7 +69,7 @@ def test_creating_experiment_object(mlwhdb_test_session, mlwhdb_load_runs): ) well_rows = mlwhdb_test_session.execute(query).scalars().all() - lims = PacBioExperiment.from_orm(well_rows) + lims = PacBioExperiment(db_libraries=well_rows) assert lims.num_samples == 40 assert lims.study_id == ["7069"] assert lims.study_name == "Alternative Enzymes 2022 microbial genomes" @@ -68,7 +86,7 @@ def test_creating_experiment_object(mlwhdb_test_session, mlwhdb_load_runs): ) well_rows = mlwhdb_test_session.execute(query).scalars().all() - lims = PacBioExperiment.from_orm(well_rows) + lims = PacBioExperiment(db_libraries=well_rows) assert lims.num_samples == 3 assert lims.study_id == ["5901", "6457"] assert lims.study_name is None @@ -85,7 +103,14 @@ def test_creating_experiment_object(mlwhdb_test_session, mlwhdb_load_runs): ) well_rows = mlwhdb_test_session.execute(query).scalars().all() - lims = PacBioExperiment.from_orm(well_rows) + with pytest.raises(ValueError, match=r"Multiple pool names."): + PacBioExperiment(db_libraries=well_rows) + + for row in well_rows: + row.pac_bio_library_tube_barcode = "AXCTYW" + mlwhdb_test_session.commit() + + lims = PacBioExperiment(db_libraries=well_rows) assert lims.num_samples == 42 assert lims.study_id == ["6457", "7069"] assert lims.study_name is None @@ -93,13 +118,3 @@ def test_creating_experiment_object(mlwhdb_test_session, mlwhdb_load_runs): assert lims.sample_name is None assert lims.library_type == ["PacBio_Ultra_Low_Input", "Pacbio_HiFi_mplx"] assert lims.tag_sequence == [] - - 
with pytest.raises( - Exception, match=r"Cannot create PacBioLimsData object, no data" - ): - PacBioExperiment.from_orm([]) - - with pytest.raises( - Exception, match=r"Cannot create PacBioLimsData object, None row" - ): - PacBioExperiment.from_orm([well_row, None]) From fbff9a74dac11d80972c88ef4b991b60b3d824a3 Mon Sep 17 00:00:00 2001 From: mgcam Date: Tue, 11 Jun 2024 12:08:13 +0100 Subject: [PATCH 14/46] Created an extendable declarative base class ... for mlwh ORM classes so that common methods can be implemented. Customised __repr__ method for one of db classes. --- lang_qc/db/mlwh_schema.py | 36 ++++++++++++++++++++++++++++++++--- tests/test_mlwh_db_classes.py | 24 +++++++++++++++++++++++ 2 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 tests/test_mlwh_db_classes.py diff --git a/lang_qc/db/mlwh_schema.py b/lang_qc/db/mlwh_schema.py index 395916f4..c0796b17 100644 --- a/lang_qc/db/mlwh_schema.py +++ b/lang_qc/db/mlwh_schema.py @@ -25,9 +25,30 @@ from sqlalchemy.dialects.mysql import SMALLINT as mysqlSMALLINT from sqlalchemy.dialects.mysql import TINYINT as mysqlTINYINT from sqlalchemy.dialects.mysql import VARCHAR as mysqlVARCHAR -from sqlalchemy.orm import declarative_base, relationship +from sqlalchemy.orm import DeclarativeBase, relationship -Base = declarative_base() + +class Base(DeclarativeBase): + """ + A base class for declarative class definitions for the ml warehouse database. + """ + + def get_row_description(self, fields: list[str]) -> str: + """ + Returns a printable representation of the database table row. Interprets + a list of strings given as the `fields` argument as a list of column + names. Combines the name of the class, names of the given columns + and respective values into a row description. The columns for which + the row has a NULL value are omitted from the description. 
+ """ + + pairs = [] + for name in fields: + value = self.__getattribute__(name) + if value is not None: + pairs.append(f"{name}={value}") + description = ", ".join(pairs) + return f"{self.__module__}.{self.__class__.__name__}: {description}" class Sample(Base): @@ -538,7 +559,16 @@ class PacBioRunWellMetrics(Base): "PacBioProductMetrics", back_populates="pac_bio_run_well_metrics" ) - def get_experiment_info(self): + """Custom or customised methods are added below""" + + def __repr__(self): + """Returns a printable representation of the database row""" + + return self.get_row_description( + ["pac_bio_run_name", "well_label", "plate_number", "id_pac_bio_product"] + ) + + def get_experiment_info(self) -> list[PacBioRun]: """Returns a list of PacBioRun mlwh database rows. Returns LIMS information about the PacBio experiment diff --git a/tests/test_mlwh_db_classes.py b/tests/test_mlwh_db_classes.py new file mode 100644 index 00000000..be0b89aa --- /dev/null +++ b/tests/test_mlwh_db_classes.py @@ -0,0 +1,24 @@ +from sqlalchemy import select + +from lang_qc.db.mlwh_schema import PacBioRunWellMetrics + +"""Tests for custom and customised ORM methods""" + + +def test_pac_bio_well_metrics_repr(mlwhdb_test_session, mlwhdb_load_runs): + id1 = "cf18bd66e0f0895ea728c1d08103c62d3de8a57a5f879cee45f7b0acc028aa61" + id2 = "513c674f489b106c6af716dd0d210826ff03b7648d50888839c3722ca1b10dbf" + data = { + id1: f"pac_bio_run_name=TRACTION-RUN-92, well_label=A1, id_pac_bio_product={id1}", + id2: f"pac_bio_run_name=TRACTION-RUN-1140, well_label=A1, plate_number=2, id_pac_bio_product={id2}", + } + + for id in data.keys(): + query = select(PacBioRunWellMetrics).where( + PacBioRunWellMetrics.id_pac_bio_product == id + ) + db_row = mlwhdb_test_session.execute(query).scalar_one() + assert ( + db_row.__repr__() + == "lang_qc.db.mlwh_schema.PacBioRunWellMetrics: " + data[id] + ) From ed54cabcfbe7c70b4facaf1550397e9d73acf920 Mon Sep 17 00:00:00 2001 From: mgcam Date: Tue, 11 Jun 2024 13:17:13 
+0100 Subject: [PATCH 15/46] Added a model representing libraries in a well. --- lang_qc/models/pacbio/well.py | 40 ++++++++++++++++++++-- tests/test_pb_well_models.py | 62 +++++++++++++++++++++++++++++++++-- 2 files changed, 96 insertions(+), 6 deletions(-) diff --git a/lang_qc/models/pacbio/well.py b/lang_qc/models/pacbio/well.py index d0d34cbc..e5bf0d82 100644 --- a/lang_qc/models/pacbio/well.py +++ b/lang_qc/models/pacbio/well.py @@ -27,7 +27,7 @@ from pydantic.dataclasses import dataclass from lang_qc.db.mlwh_schema import PacBioRunWellMetrics -from lang_qc.models.pacbio.experiment import PacBioExperiment +from lang_qc.models.pacbio.experiment import PacBioExperiment, PacBioLibrary from lang_qc.models.pacbio.qc_data import QCDataWell from lang_qc.models.pager import PagedResponse from lang_qc.models.qc_state import QcState @@ -132,9 +132,10 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: """ # https://github.com/pydantic/pydantic-core/blob/main/python/pydantic_core/_pydantic_core.pyi - mlwh_db_row: PacBioRunWellMetrics = values.kwargs["db_well"] - assert mlwh_db_row + if "db_well" not in values.kwargs: + raise ValueError("None db_well value is not allowed.") + mlwh_db_row: PacBioRunWellMetrics = values.kwargs["db_well"] column_names = [column.key for column in PacBioRunWellMetrics.__table__.columns] assigned = dict() @@ -175,6 +176,39 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: return assigned +@dataclass(kw_only=True, frozen=True) +class PacBioWellLibraries(PacBioWell): + """A response model binding together basic PacBio well and LIMS data for + the libraries, which were sequenced in this well. + """ + + libraries: list[PacBioLibrary] = Field( + title="A list of `PacBioLibrary` objects", + description=""" + A list of `PacBioLibrary` objects. Each member of the list represents + a library, which was sequenced in this well. 
If the object is created + by supplying the `db_well` attribute via the constructor, the list + is never empty. The list is not sorted. + """, + ) + + @model_validator(mode="before") + def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: + + assigned = super().pre_root(values) + mlwh_db_row: PacBioRunWellMetrics = values.kwargs["db_well"] + lims_data = mlwh_db_row.get_experiment_info() + if len(lims_data) == 0: + raise ValueError( + f"No LIMS data retrieved for {mlwh_db_row.__repr__()} " + "on account of partially linked or unlinked product data." + ) + + assigned["libraries"] = [PacBioLibrary(db_library=row) for row in lims_data] + + return assigned + + class PacBioPagedWells(PagedResponse, extra="forbid"): """A response model for paged data about PacBio wells.""" diff --git a/tests/test_pb_well_models.py b/tests/test_pb_well_models.py index 12d64a44..ed80b755 100644 --- a/tests/test_pb_well_models.py +++ b/tests/test_pb_well_models.py @@ -1,10 +1,16 @@ +import pytest from npg_id_generation.pac_bio import PacBioEntity from sqlalchemy.orm import Session from lang_qc.db.helper.qc import get_qc_states_by_id_product_list from lang_qc.db.helper.wells import WellWh from lang_qc.db.mlwh_schema import PacBioRunWellMetrics -from lang_qc.models.pacbio.well import PacBioWellFull, PacBioWellSummary +from lang_qc.models.pacbio.experiment import PacBioLibrary +from lang_qc.models.pacbio.well import ( + PacBioWellFull, + PacBioWellLibraries, + PacBioWellSummary, +) from tests.conftest import compare_dates from tests.fixtures.well_data import load_data4well_retrieval, load_dicts_and_users @@ -116,9 +122,13 @@ def test_create_full_model( assert pb_well.experiment_tracking is None -def test_create_summary_model( +def test_create_summary_and_library_models( mlwhdb_test_session, qcdb_test_session, load_data4well_retrieval, mlwhdb_load_runs ): + + with pytest.raises(ValueError, match=r"None db_well value is not allowed."): + PacBioWellSummary(plate_number=3) + (well_row, 
qc_state) = _prepare_data( mlwhdb_test_session, qcdb_test_session, "TRACTION-RUN-92", "A1" ) @@ -126,6 +136,9 @@ def test_create_summary_model( _examine_well_model_a1(pb_well, well_row.id_pac_bio_product) assert pb_well.study_names == ["Tree of Life - ASG"] + pb_well = PacBioWellLibraries(db_well=well_row) + _examine_well_model_a1(pb_well, well_row.id_pac_bio_product) + (well_row, qc_state) = _prepare_data( mlwhdb_test_session, qcdb_test_session, "TRACTION_RUN_1", "B1" ) @@ -140,7 +153,7 @@ def test_create_summary_model( _examine_well_model_c1(pb_well, well_row.id_pac_bio_product) -def test_create_summary_model_study_info( +def test_create_summary_and_library_models_lims_info( mlwhdb_test_session, qcdb_test_session, load_data4well_retrieval, mlwhdb_load_runs ): # Well with two samples, none is linked to LIMS @@ -150,6 +163,9 @@ def test_create_summary_model_study_info( pb_well = PacBioWellSummary(db_well=well_row) assert pb_well.study_names == [] + with pytest.raises(ValueError, match=r"No LIMS data retrieved"): + PacBioWellLibraries(db_well=well_row) + # Fully linked wells with one sample (well_row, qc_state) = _prepare_data( mlwhdb_test_session, qcdb_test_session, "TRACTION-RUN-1162", "C1" @@ -163,6 +179,19 @@ def test_create_summary_model_study_info( pb_well = PacBioWellSummary(db_well=well_row) assert pb_well.study_names == ["DTOL_Darwin Tree of Life"] + pb_well = PacBioWellLibraries(db_well=well_row) + assert len(pb_well.libraries) == 1 + expected_lib = PacBioLibrary( + study_id="5901", + study_name="DTOL_Darwin Tree of Life", + sample_id="9463663", + sample_name="DTOL14290946", + tag_sequence=["CTCAGCATACGAGTAT"], + library_type="Pacbio_HiFi", + pool_name="TRAC-2-7128", + ) + assert pb_well.libraries[0] == expected_lib + # A fully linked well with multiple samples, all belonging to the same study (well_row, qc_state) = _prepare_data( mlwhdb_test_session, qcdb_test_session, "TRACTION-RUN-1140", "B1", 1 @@ -180,6 +209,30 @@ def 
test_create_summary_model_study_info( "ToL_Blaxter_ Reference Genomes_ DNA", ] + pb_well = PacBioWellLibraries(db_well=well_row) + assert len(pb_well.libraries) == 4 + libs = {lib.sample_id: lib for lib in pb_well.libraries} + expected_lib = PacBioLibrary( + study_id="6771", + study_name="ToL_Blaxter_ Reference Genomes_ DNA", + sample_id="8657549", + sample_name="6771STDY13618009", + tag_sequence=["CTGCGATCACGAGTAT"], + library_type="Pacbio_HiFi", + pool_name="TRAC-2-7676", + ) + assert libs["8657549"] == expected_lib + expected_lib = PacBioLibrary( + study_id="5901", + study_name="DTOL_Darwin Tree of Life", + sample_id="9463590", + sample_name="DTOL14291044", + tag_sequence=["TCTGCATCATGAGTAT"], + library_type="Pacbio_HiFi", + pool_name="TRAC-2-7676", + ) + assert libs["9463590"] == expected_lib + # A partially linked well with three samples, which belong to two studies. # The LIMS link for one of the samples is deleted so that two other samples # belong to the same study. @@ -188,3 +241,6 @@ def test_create_summary_model_study_info( ) pb_well = PacBioWellSummary(db_well=well_row) assert pb_well.study_names == [] + + with pytest.raises(ValueError, match=r"No LIMS data retrieved"): + PacBioWellLibraries(db_well=well_row) From 3df42faf70a9a549a8a426d90a83b5640145c7aa Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Tue, 11 Jun 2024 13:52:02 +0000 Subject: [PATCH 16/46] Update mlwh model to include new barcode4deplexing column --- lang_qc/db/mlwh_schema.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lang_qc/db/mlwh_schema.py b/lang_qc/db/mlwh_schema.py index 395916f4..d445120a 100644 --- a/lang_qc/db/mlwh_schema.py +++ b/lang_qc/db/mlwh_schema.py @@ -609,6 +609,11 @@ class PacBioProductMetrics(Base): hifi_read_length_mean = Column( mysqlINTEGER(unsigned=True), nullable=True, comment="The mean HiFi read length" ) + barcode4deplexing = Column( + mysqlVARCHAR(62), + nullable=True, + comment="The barcode recorded in producing deplexed metrics for this 
product", + ) barcode_quality_score_mean = Column( mysqlSMALLINT(unsigned=True), nullable=True, From 11be2e055ef3894a9dcfa978deb5b50e09d5a51a Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Tue, 11 Jun 2024 14:04:13 +0000 Subject: [PATCH 17/46] Supplement fixture with barcode IDs --- .../mlwh_pb_runs/300-PacBioProductMetrics.yml | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml b/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml index f46e1d13..c02bb364 100644 --- a/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml +++ b/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml @@ -172,6 +172,7 @@ id_pac_bio_rw_metrics_tmp: 1735 id_pac_bio_tmp: 99008 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2020--bc2020 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -183,6 +184,7 @@ last_changed: 2024-02-28 14:10:14 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2011--bc2011 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -194,6 +196,7 @@ last_changed: 2024-02-28 14:10:14 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc1011_BAK8A_OA--bc1011_BAK8A_OA hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -205,6 +208,7 @@ last_changed: 2024-03-05 15:10:36 qc: 0 - barcode_quality_score_mean: ~ + barcode4deplexing: bc1022_BAK8B_OA--bc1022_BAK8B_OA hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -216,6 +220,7 @@ last_changed: 2024-03-05 15:10:36 qc: 0 - barcode_quality_score_mean: ~ + barcode4deplexing: bc1001_BAK8A_OA--bc1001_BAK8A_OA hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -227,6 +232,7 @@ last_changed: 2024-03-05 15:10:36 qc: 0 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2035--bc2035 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -238,6 +244,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2052--bc2052 
hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -249,6 +256,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: 97 + barcode4deplexing: bc2036--bc2036 hifi_bases_percent: 27.49 hifi_num_reads: 1952224 hifi_read_bases: 21504288522 @@ -260,6 +268,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: 96 + barcode4deplexing: bc2040--bc2040 hifi_bases_percent: 19.62 hifi_num_reads: 1139885 hifi_read_bases: 15344650012 @@ -271,6 +280,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: 96 + barcode4deplexing: bc2054--bc2054 hifi_bases_percent: 23.7 hifi_num_reads: 1751410 hifi_read_bases: 18538781061 @@ -282,6 +292,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: 97 + barcode4deplexing: bc2063--bc2063 hifi_bases_percent: 28.72 hifi_num_reads: 1991282 hifi_read_bases: 22462478066 @@ -293,6 +304,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2016--bc2016 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -304,6 +316,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2096--bc2096 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -315,6 +328,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2056--bc2056 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -326,6 +340,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2072--bc2072 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -337,6 +352,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2021--bc2021 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -348,6 +364,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2011--bc2011 hifi_bases_percent: ~ hifi_num_reads: ~ 
hifi_read_bases: ~ @@ -359,6 +376,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2015--bc2015 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -370,6 +388,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2083--bc2083 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -381,6 +400,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2084--bc2084 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -392,6 +412,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2085--bc2085 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -403,6 +424,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2094--bc2094 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -414,6 +436,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2070--bc2070 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -425,6 +448,7 @@ last_changed: 2024-03-08 12:10:14 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2055--bc2055 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ From 00df2edb15ea38c985b6c759677cf700374ef702 Mon Sep 17 00:00:00 2001 From: mgcam Date: Tue, 11 Jun 2024 18:59:28 +0100 Subject: [PATCH 18/46] Added an end point for well library data. 
--- lang_qc/endpoints/pacbio_well.py | 33 +++++++++++- lang_qc/models/pacbio/well.py | 3 +- lang_qc/util/errors.py | 7 +++ tests/endpoints/test_well_libraries.py | 70 ++++++++++++++++++++++++++ tests/test_pb_well_models.py | 5 +- 5 files changed, 114 insertions(+), 4 deletions(-) create mode 100644 tests/endpoints/test_well_libraries.py diff --git a/lang_qc/endpoints/pacbio_well.py b/lang_qc/endpoints/pacbio_well.py index f9d49573..d2a5a61a 100644 --- a/lang_qc/endpoints/pacbio_well.py +++ b/lang_qc/endpoints/pacbio_well.py @@ -37,13 +37,18 @@ from lang_qc.db.mlwh_connection import get_mlwh_db from lang_qc.db.qc_connection import get_qc_db from lang_qc.db.qc_schema import User -from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellFull +from lang_qc.models.pacbio.well import ( + PacBioPagedWells, + PacBioWellFull, + PacBioWellLibraries, +) from lang_qc.models.qc_flow_status import QcFlowStatusEnum from lang_qc.models.qc_state import QcState, QcStateBasic from lang_qc.util.auth import check_user from lang_qc.util.errors import ( InconsistentInputError, InvalidDictValueError, + MissingLimsDataError, RunNotFoundError, ) from lang_qc.util.type_checksum import ChecksumSHA256 @@ -163,6 +168,32 @@ def get_wells_in_run( return response +@router.get( + "/wells/{id_product}/libraries", + summary="Get well summary and LIMS data for all libraries", + responses={ + status.HTTP_404_NOT_FOUND: {"description": "Well product does not exist"}, + status.HTTP_422_UNPROCESSABLE_ENTITY: {"description": "Invalid product ID"}, + status.HTTP_409_CONFLICT: {"description": "Missing or incomplete LIMS data"}, + }, + response_model=PacBioWellLibraries, +) +def get_well_lims_info( + id_product: ChecksumSHA256, + mlwhdb_session: Session = Depends(get_mlwh_db), +) -> PacBioWellLibraries: + + db_well = _find_well_product_or_error(id_product, mlwhdb_session) + well_libraries: PacBioWellLibraries + try: + well_libraries = PacBioWellLibraries(db_well=db_well) + except 
MissingLimsDataError as err: + # 409 - Request conflicts with the current state of the server. + raise HTTPException(409, detail=str(err)) + + return well_libraries + + @router.get( "/products/{id_product}/seq_level", summary="Get full sequencing QC metrics and state for a product", diff --git a/lang_qc/models/pacbio/well.py b/lang_qc/models/pacbio/well.py index e5bf0d82..e8098050 100644 --- a/lang_qc/models/pacbio/well.py +++ b/lang_qc/models/pacbio/well.py @@ -31,6 +31,7 @@ from lang_qc.models.pacbio.qc_data import QCDataWell from lang_qc.models.pager import PagedResponse from lang_qc.models.qc_state import QcState +from lang_qc.util.errors import MissingLimsDataError def get_field_names(cls): @@ -199,7 +200,7 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: mlwh_db_row: PacBioRunWellMetrics = values.kwargs["db_well"] lims_data = mlwh_db_row.get_experiment_info() if len(lims_data) == 0: - raise ValueError( + raise MissingLimsDataError( f"No LIMS data retrieved for {mlwh_db_row.__repr__()} " "on account of partially linked or unlinked product data." ) diff --git a/lang_qc/util/errors.py b/lang_qc/util/errors.py index 21dab573..aeac0ef9 100644 --- a/lang_qc/util/errors.py +++ b/lang_qc/util/errors.py @@ -25,3 +25,10 @@ class EmptyListOfRunNamesError(Exception): class RunNotFoundError(Exception): """Exception to be used when no well metrics data for a run is found.""" + + +class MissingLimsDataError(Exception): + """ + Exception to be used when product LIMS data is not available + or partially missing. 
+ """ diff --git a/tests/endpoints/test_well_libraries.py b/tests/endpoints/test_well_libraries.py new file mode 100644 index 00000000..d35e4767 --- /dev/null +++ b/tests/endpoints/test_well_libraries.py @@ -0,0 +1,70 @@ +from fastapi.testclient import TestClient +from sqlalchemy import select + +# from lang_qc.db.mlwh_schema import PacBioRunWellMetrics + + +def test_well_libraries(test_client: TestClient, mlwhdb_load_runs): + """Test retrieval of LIMS library data for a well.""" + + response = test_client.get(f"/pacbio/wells/malformed/libraries") + assert response.status_code == 422 + + id_product = "aaa8bd66e0f0895ea728c1d08103c62d3de8a57a5f879cee45f7b0acc028aa61" + response = test_client.get(f"/pacbio/wells/{id_product}/libraries") + assert response.status_code == 404 + + # Partially linked well + id_product = "26928ba6ec2a00c04dd6c7c68008ec9436e3979a384b9f708dc371c99f272e17" + response = test_client.get(f"/pacbio/wells/{id_product}/libraries") + assert response.status_code == 409 + assert response.json()["detail"] == "".join( + [ + "No LIMS data retrieved for lang_qc.db.mlwh_schema.PacBioRunWellMetrics:", + " pac_bio_run_name=TRACTION-RUN-1140, well_label=C1, plate_number=2,", + " id_pac_bio_product=26928ba6ec2a00c04dd6c7c68008ec9436e3979a384b9f708dc371c99f272e17", + " on account of partially linked or unlinked product data.", + ] + ) + + # Fully linked well + id_product = "513c674f489b106c6af716dd0d210826ff03b7648d50888839c3722ca1b10dbf" + response = test_client.get(f"/pacbio/wells/{id_product}/libraries") + assert response.status_code == 200 + expected_response = { + "id_product": "513c674f489b106c6af716dd0d210826ff03b7648d50888839c3722ca1b10dbf", + "label": "A1", + "plate_number": 2, + "run_name": "TRACTION-RUN-1140", + "run_start_time": "2024-02-23T10:28:12", + "run_complete_time": "2024-02-25T20:53:05", + "well_start_time": "2024-02-24T14:25:12", + "well_complete_time": "2024-02-26T00:27:52", + "run_status": "Complete", + "well_status": "Complete", + 
"instrument_name": "84093", + "instrument_type": "Revio", + "qc_state": None, + "libraries": [ + { + "study_id": "5901", + "study_name": "DTOL_Darwin Tree of Life", + "sample_id": "9478726", + "sample_name": "DTOL14523243", + "tag_sequence": ["ATCTGCACGTGAGTAT"], + "library_type": "Pacbio_HiFi", + "pool_name": "TRAC-2-7677", + }, + { + "study_id": "5901", + "study_name": "DTOL_Darwin Tree of Life", + "sample_id": "9518398", + "sample_name": "DTOL14180244", + "tag_sequence": ["ATGTACTAGTGAGTAT"], + "library_type": "Pacbio_HiFi", + "pool_name": "TRAC-2-7677", + }, + ], + } + + assert response.json() == expected_response diff --git a/tests/test_pb_well_models.py b/tests/test_pb_well_models.py index ed80b755..ce5560d3 100644 --- a/tests/test_pb_well_models.py +++ b/tests/test_pb_well_models.py @@ -11,6 +11,7 @@ PacBioWellLibraries, PacBioWellSummary, ) +from lang_qc.util.errors import MissingLimsDataError from tests.conftest import compare_dates from tests.fixtures.well_data import load_data4well_retrieval, load_dicts_and_users @@ -163,7 +164,7 @@ def test_create_summary_and_library_models_lims_info( pb_well = PacBioWellSummary(db_well=well_row) assert pb_well.study_names == [] - with pytest.raises(ValueError, match=r"No LIMS data retrieved"): + with pytest.raises(MissingLimsDataError, match=r"No LIMS data retrieved"): PacBioWellLibraries(db_well=well_row) # Fully linked wells with one sample @@ -242,5 +243,5 @@ def test_create_summary_and_library_models_lims_info( pb_well = PacBioWellSummary(db_well=well_row) assert pb_well.study_names == [] - with pytest.raises(ValueError, match=r"No LIMS data retrieved"): + with pytest.raises(MissingLimsDataError, match=r"No LIMS data retrieved"): PacBioWellLibraries(db_well=well_row) From 6e5472a0d5bcf7b8521600b6b99e7b2f04aaa737 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Wed, 12 Jun 2024 12:51:48 +0000 Subject: [PATCH 19/46] Add deplexing barcodes and modes to test data. 
Check deplexing mode to determine whether to run stats or not. --- lang_qc/db/helper/wells.py | 5 ++--- lang_qc/models/pacbio/qc_data.py | 1 + tests/fixtures/sample_data.py | 5 +++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/lang_qc/db/helper/wells.py b/lang_qc/db/helper/wells.py index 48a3042a..69fac7e8 100644 --- a/lang_qc/db/helper/wells.py +++ b/lang_qc/db/helper/wells.py @@ -84,10 +84,8 @@ def get_metrics_by_well_product_id( self, id_product: PacBioWellSHA256 ) -> QCPoolMetrics | None: well = self.get_mlwh_well_by_product_id(id_product) - if well: + if well and well.demultiplex_mode and "Instrument" in well.demultiplex_mode: product_metrics = well.pac_bio_product_metrics - if len(product_metrics) == 1: - return None cov: float | None if any(p.hifi_num_reads is None for p in product_metrics): @@ -103,6 +101,7 @@ def get_metrics_by_well_product_id( id_product=prod.id_pac_bio_product, tag1_name=prod.pac_bio_run.tag_identifier, tag2_name=prod.pac_bio_run.tag2_identifier, + deplexing_barcode=prod.barcode4deplexing, hifi_read_bases=prod.hifi_read_bases, hifi_num_reads=prod.hifi_num_reads, hifi_read_length_mean=prod.hifi_read_length_mean, diff --git a/lang_qc/models/pacbio/qc_data.py b/lang_qc/models/pacbio/qc_data.py index 259a1785..fb9a8747 100644 --- a/lang_qc/models/pacbio/qc_data.py +++ b/lang_qc/models/pacbio/qc_data.py @@ -167,6 +167,7 @@ class SampleDeplexingStats(BaseModel): id_product: PacBioProductSHA256 tag1_name: str | None tag2_name: str | None + deplexing_barcode: str | None hifi_read_bases: int | None hifi_num_reads: int | None hifi_read_length_mean: float | None diff --git a/tests/fixtures/sample_data.py b/tests/fixtures/sample_data.py index 818142fc..e86fbb5f 100644 --- a/tests/fixtures/sample_data.py +++ b/tests/fixtures/sample_data.py @@ -48,6 +48,7 @@ def simplex_run(request, mlwhdb_test_session): well_label=well_label, plate_number=plate_number, ).hash_product_id(), + demultiplex_mode=None, ) product = PacBioProductMetrics( 
@@ -64,6 +65,7 @@ def simplex_run(request, mlwhdb_test_session): barcode_quality_score_mean=34, hifi_bases_percent=90.001, pac_bio_run_well_metrics=well_metrics_a1, + barcode4deplexing=None, ) study = Study( @@ -135,6 +137,7 @@ def multiplexed_run(mlwhdb_test_session): plate_number=plate_number, ).hash_product_id(), hifi_num_reads=30, + demultiplex_mode="OnInstrument", ) product_1 = PacBioProductMetrics( @@ -151,6 +154,7 @@ def multiplexed_run(mlwhdb_test_session): barcode_quality_score_mean=34, hifi_bases_percent=90.001, pac_bio_run_well_metrics=well_metrics_b1, + barcode4deplexing="bc10--bc10", ) multiplex_run_1 = PacBioRun( @@ -182,6 +186,7 @@ def multiplexed_run(mlwhdb_test_session): barcode_quality_score_mean=34, hifi_bases_percent=100.00, pac_bio_run_well_metrics=well_metrics_b1, + barcode4deplexing="bc11--bc11", ) multiplex_run_2 = PacBioRun( From fd3e9ef00b758a50ac6a7ab39463fa88f752de2b Mon Sep 17 00:00:00 2001 From: mgcam Date: Wed, 12 Jun 2024 15:01:54 +0100 Subject: [PATCH 20/46] Dropped direct calls to __repr__() Also made the helper function of the parent class 'private'. --- lang_qc/db/mlwh_schema.py | 4 ++-- lang_qc/models/pacbio/well.py | 2 +- tests/test_mlwh_db_classes.py | 7 +++---- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/lang_qc/db/mlwh_schema.py b/lang_qc/db/mlwh_schema.py index c0796b17..c1cbff09 100644 --- a/lang_qc/db/mlwh_schema.py +++ b/lang_qc/db/mlwh_schema.py @@ -33,7 +33,7 @@ class Base(DeclarativeBase): A base class for declarative class definitions for the ml warehouse database. """ - def get_row_description(self, fields: list[str]) -> str: + def _get_row_description(self, fields: list[str]) -> str: """ Returns a printable representation of the database table row. 
Interprets a list of strings given as the `fields` argument as a list of column @@ -564,7 +564,7 @@ class PacBioRunWellMetrics(Base): def __repr__(self): """Returns a printable representation of the database row""" - return self.get_row_description( + return self._get_row_description( ["pac_bio_run_name", "well_label", "plate_number", "id_pac_bio_product"] ) diff --git a/lang_qc/models/pacbio/well.py b/lang_qc/models/pacbio/well.py index e8098050..00926da3 100644 --- a/lang_qc/models/pacbio/well.py +++ b/lang_qc/models/pacbio/well.py @@ -201,7 +201,7 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: lims_data = mlwh_db_row.get_experiment_info() if len(lims_data) == 0: raise MissingLimsDataError( - f"No LIMS data retrieved for {mlwh_db_row.__repr__()} " + f"No LIMS data retrieved for {str(mlwh_db_row)} " "on account of partially linked or unlinked product data." ) diff --git a/tests/test_mlwh_db_classes.py b/tests/test_mlwh_db_classes.py index be0b89aa..5b832fe8 100644 --- a/tests/test_mlwh_db_classes.py +++ b/tests/test_mlwh_db_classes.py @@ -18,7 +18,6 @@ def test_pac_bio_well_metrics_repr(mlwhdb_test_session, mlwhdb_load_runs): PacBioRunWellMetrics.id_pac_bio_product == id ) db_row = mlwhdb_test_session.execute(query).scalar_one() - assert ( - db_row.__repr__() - == "lang_qc.db.mlwh_schema.PacBioRunWellMetrics: " + data[id] - ) + expected_string = "lang_qc.db.mlwh_schema.PacBioRunWellMetrics: " + data[id] + assert db_row.__repr__() == expected_string + assert str(db_row) == expected_string From 2f9be8aec83686f16c8bbfa7ebd1c2f4d29a08cd Mon Sep 17 00:00:00 2001 From: mgcam Date: Wed, 12 Jun 2024 16:59:42 +0100 Subject: [PATCH 21/46] Added a check for unlinked data. 
--- lang_qc/db/helper/wells.py | 27 ++++++++++++++++++--------- tests/test_pac_bio_qc_data_well.py | 13 +++++++++++++ 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/lang_qc/db/helper/wells.py b/lang_qc/db/helper/wells.py index 69fac7e8..57d7b01c 100644 --- a/lang_qc/db/helper/wells.py +++ b/lang_qc/db/helper/wells.py @@ -85,7 +85,15 @@ def get_metrics_by_well_product_id( ) -> QCPoolMetrics | None: well = self.get_mlwh_well_by_product_id(id_product) if well and well.demultiplex_mode and "Instrument" in well.demultiplex_mode: + product_metrics = well.pac_bio_product_metrics + lib_lims_data = [ + row + for row in map(lambda product: product.pac_bio_run, product_metrics) + if row is not None + ] + if len(lib_lims_data) != len(product_metrics): + raise Exception("Partially linked LIMS data or no linked LIMS data") cov: float | None if any(p.hifi_num_reads is None for p in product_metrics): @@ -94,13 +102,13 @@ def get_metrics_by_well_product_id( hifi_reads = [prod.hifi_num_reads for prod in product_metrics] cov = stdev(hifi_reads) / mean(hifi_reads) * 100 - return QCPoolMetrics( - pool_coeff_of_variance=cov, - products=[ + sample_stats = [] + for (i, prod) in enumerate(product_metrics): + sample_stats.append( SampleDeplexingStats( id_product=prod.id_pac_bio_product, - tag1_name=prod.pac_bio_run.tag_identifier, - tag2_name=prod.pac_bio_run.tag2_identifier, + tag1_name=lib_lims_data[i].tag_identifier, + tag2_name=lib_lims_data[i].tag2_identifier, deplexing_barcode=prod.barcode4deplexing, hifi_read_bases=prod.hifi_read_bases, hifi_num_reads=prod.hifi_num_reads, @@ -108,13 +116,14 @@ def get_metrics_by_well_product_id( hifi_bases_percent=prod.hifi_bases_percent, percentage_total_reads=( prod.hifi_num_reads / well.hifi_num_reads * 100 - if well.hifi_num_reads + if (well.hifi_num_reads and prod.hifi_num_reads) else None ), ) - for prod in product_metrics - ], - ) + ) + + return QCPoolMetrics(pool_coeff_of_variance=cov, products=sample_stats) + return None def 
recent_completed_wells(self) -> List[PacBioRunWellMetrics]: diff --git a/tests/test_pac_bio_qc_data_well.py b/tests/test_pac_bio_qc_data_well.py index bd4318a0..3be9de90 100644 --- a/tests/test_pac_bio_qc_data_well.py +++ b/tests/test_pac_bio_qc_data_well.py @@ -1,3 +1,4 @@ +import pytest from npg_id_generation.pac_bio import PacBioEntity from lang_qc.db.helper.wells import WellWh @@ -134,3 +135,15 @@ def test_pool_metrics_from_well(mlwhdb_test_session, multiplexed_run): assert ( int(metrics.products[1].percentage_total_reads) == 66 ), "20 of 30 reads is 66.6%" + + +def test_pool_metrics_from_well(mlwhdb_test_session): + + id = PacBioEntity( + run_name="TRACTION-RUN-1140", well_label="C1", plate_number=2 + ).hash_product_id() + helper = WellWh(session=mlwhdb_test_session) + with pytest.raises( + Exception, match=r"Partially linked LIMS data or no linked LIMS data" + ): + helper.get_metrics_by_well_product_id(id) From 372fc567c64a4f70806079291b8e48d0390afa60 Mon Sep 17 00:00:00 2001 From: mgcam Date: Thu, 13 Jun 2024 12:09:32 +0100 Subject: [PATCH 22/46] Simplified getting linked lims data. --- lang_qc/db/helper/wells.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lang_qc/db/helper/wells.py b/lang_qc/db/helper/wells.py index 57d7b01c..4c8f0894 100644 --- a/lang_qc/db/helper/wells.py +++ b/lang_qc/db/helper/wells.py @@ -88,9 +88,9 @@ def get_metrics_by_well_product_id( product_metrics = well.pac_bio_product_metrics lib_lims_data = [ - row - for row in map(lambda product: product.pac_bio_run, product_metrics) - if row is not None + product.pac_bio_run + for product in product_metrics + if product.pac_bio_run is not None ] if len(lib_lims_data) != len(product_metrics): raise Exception("Partially linked LIMS data or no linked LIMS data") From a467f64ca94487028298cdb3b454b649367141c3 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Fri, 14 Jun 2024 14:19:35 +0000 Subject: [PATCH 23/46] Remove the now unused test data factory fixture. 
--- tests/fixtures/inbox_data.py | 101 ----------------------------------- 1 file changed, 101 deletions(-) diff --git a/tests/fixtures/inbox_data.py b/tests/fixtures/inbox_data.py index 64379b18..680d3af9 100644 --- a/tests/fixtures/inbox_data.py +++ b/tests/fixtures/inbox_data.py @@ -97,104 +97,3 @@ def inbox_data(mlwhdb_test_session): mlwhdb_test_session.commit() yield True - - -@pytest.fixture() -def test_data_factory(mlwhdb_test_session, qcdb_test_session): - def setup_data(desired_wells): - # Setup dicts and "filler" data - - library_qc_type = QcType( - qc_type="library", description="Sample/library evaluation" - ) - seq_qc_type = QcType( - qc_type="sequencing", description="Sequencing process evaluation" - ) - - run_name_attr = SubProductAttr( - attr_name="run_name", description="PacBio run name." - ) - well_label_attr = SubProductAttr( - attr_name="well_label", description="PacBio well label" - ) - seq_platform = SeqPlatform(name="PacBio", description="Pacific Biosciences.") - user = User(username="zx80@example.com") - other_user = User(username="cd32@example.com") - states = ["Passed", "Failed", "Claimed", "On hold", "Aborted"] - state_dicts = {} - - for state in states: - outcome = None - if state == "Passed": - outcome = True - elif state == "Failed": - outcome = False - state_dicts[state] = QcStateDict(state=state, outcome=outcome) - - qcdb_test_session.add_all(state_dicts.values()) - qcdb_test_session.add_all( - [ - library_qc_type, - seq_qc_type, - run_name_attr, - well_label_attr, - seq_platform, - user, - other_user, - ] - ) - - # Start adding the PacBioRunWellMetrics and QcState rows. 
- for run_name, wells in desired_wells.items(): - for well_label, state in wells.items(): - - pbe = PacBioEntity(run_name=run_name, well_label=well_label) - id = pbe.hash_product_id() - - run_metrics = PacBioRunWellMetrics( - pac_bio_run_name=run_name, - well_label=well_label, - id_pac_bio_product=id, - instrument_type="PacBio", - polymerase_num_reads=1337, - ccs_execution_mode="None", - well_status="Complete", - run_start=datetime.now() - timedelta(days=3), - run_complete=datetime.now() - timedelta(days=1), - well_start=datetime.now() - timedelta(days=2), - well_complete=datetime.now() - timedelta(days=1), - ) - mlwhdb_test_session.add(run_metrics) - - if state is not None: - - qc_state = QcState( - created_by="me", - is_preliminary=state in ["On hold", "Claimed"], - qc_state_dict=state_dicts[state], - qc_type=seq_qc_type, - seq_product=SeqProduct( - id_product=id, - seq_platform=seq_platform, - sub_products=[ - SubProduct( - sub_product_attr=run_name_attr, - sub_product_attr_=well_label_attr, - value_attr_one=run_name, - value_attr_two=well_label, - properties_digest=id, - ), - ], - ), - user=user, - ) - qcdb_test_session.add(qc_state) - # Feed back to fixture use - desired_wells[run_name][well_label] = qc_state - - qcdb_test_session.commit() - mlwhdb_test_session.commit() - - return desired_wells - - yield setup_data From 221fa08a7939fecb04f732534c063ac33d645a1c Mon Sep 17 00:00:00 2001 From: mgcam Date: Fri, 14 Jun 2024 15:43:00 +0100 Subject: [PATCH 24/46] Moved QCPoolMetrics model generation to the model itself. Return 409 status in case of incomplete or missing LIMS data. Do not error if the metrics cannot be computed since the client does not necessarily know whether it is appropriate to request these metrics.
--- lang_qc/db/helper/wells.py | 48 -------------------- lang_qc/endpoints/pacbio_well.py | 15 ++++--- lang_qc/models/pacbio/qc_data.py | 70 +++++++++++++++++++++++++++++- tests/test_pac_bio_qc_data_well.py | 60 +++++++++++++++---------- 4 files changed, 113 insertions(+), 80 deletions(-) diff --git a/lang_qc/db/helper/wells.py b/lang_qc/db/helper/wells.py index 4c8f0894..1a9e67bf 100644 --- a/lang_qc/db/helper/wells.py +++ b/lang_qc/db/helper/wells.py @@ -21,7 +21,6 @@ import logging from datetime import date, datetime, timedelta -from statistics import mean, stdev from typing import ClassVar, List from pydantic import BaseModel, ConfigDict, Field @@ -34,7 +33,6 @@ ) from lang_qc.db.mlwh_schema import PacBioRunWellMetrics from lang_qc.db.qc_schema import QcState, QcStateDict, QcType -from lang_qc.models.pacbio.qc_data import QCPoolMetrics, SampleDeplexingStats from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellSummary from lang_qc.models.pager import PagedResponse from lang_qc.models.qc_flow_status import QcFlowStatusEnum @@ -80,52 +78,6 @@ def get_mlwh_well_by_product_id( ) ).scalar_one_or_none() - def get_metrics_by_well_product_id( - self, id_product: PacBioWellSHA256 - ) -> QCPoolMetrics | None: - well = self.get_mlwh_well_by_product_id(id_product) - if well and well.demultiplex_mode and "Instrument" in well.demultiplex_mode: - - product_metrics = well.pac_bio_product_metrics - lib_lims_data = [ - product.pac_bio_run - for product in product_metrics - if product.pac_bio_run is not None - ] - if len(lib_lims_data) != len(product_metrics): - raise Exception("Partially linked LIMS data or no linked LIMS data") - - cov: float | None - if any(p.hifi_num_reads is None for p in product_metrics): - cov = None - else: - hifi_reads = [prod.hifi_num_reads for prod in product_metrics] - cov = stdev(hifi_reads) / mean(hifi_reads) * 100 - - sample_stats = [] - for (i, prod) in enumerate(product_metrics): - sample_stats.append( - SampleDeplexingStats( - 
id_product=prod.id_pac_bio_product, - tag1_name=lib_lims_data[i].tag_identifier, - tag2_name=lib_lims_data[i].tag2_identifier, - deplexing_barcode=prod.barcode4deplexing, - hifi_read_bases=prod.hifi_read_bases, - hifi_num_reads=prod.hifi_num_reads, - hifi_read_length_mean=prod.hifi_read_length_mean, - hifi_bases_percent=prod.hifi_bases_percent, - percentage_total_reads=( - prod.hifi_num_reads / well.hifi_num_reads * 100 - if (well.hifi_num_reads and prod.hifi_num_reads) - else None - ), - ) - ) - - return QCPoolMetrics(pool_coeff_of_variance=cov, products=sample_stats) - - return None - def recent_completed_wells(self) -> List[PacBioRunWellMetrics]: """ Get recent not QC-ed completed wells from the mlwh database. diff --git a/lang_qc/endpoints/pacbio_well.py b/lang_qc/endpoints/pacbio_well.py index 2c4671c1..eea981a1 100644 --- a/lang_qc/endpoints/pacbio_well.py +++ b/lang_qc/endpoints/pacbio_well.py @@ -222,6 +222,7 @@ def get_seq_metrics( summary="Get sample (deplexing) metrics for a multiplexed well product by the well ID", responses={ status.HTTP_404_NOT_FOUND: {"description": "Product not found"}, + status.HTTP_409_CONFLICT: {"description": "Missing or incomplete LIMS data"}, status.HTTP_422_UNPROCESSABLE_ENTITY: {"description": "Invalid product ID"}, }, response_model=QCPoolMetrics, @@ -229,13 +230,13 @@ def get_seq_metrics( def get_product_metrics( id_product: PacBioWellSHA256, mlwhdb_session: Session = Depends(get_mlwh_db) ) -> QCPoolMetrics: - metrics = WellWh(mlwh_session=mlwhdb_session).get_metrics_by_well_product_id( - id_product - ) - if metrics is None: - raise HTTPException( - status_code=404, detail="Well does not have any pool metrics" - ) + + mlwh_well = _find_well_product_or_error(id_product, mlwhdb_session) + try: + metrics = QCPoolMetrics(db_well=mlwh_well) + except MissingLimsDataError as err: + raise HTTPException(409, detail=str(err)) + return metrics diff --git a/lang_qc/models/pacbio/qc_data.py b/lang_qc/models/pacbio/qc_data.py index 
fb9a8747..6aa6a0a4 100644 --- a/lang_qc/models/pacbio/qc_data.py +++ b/lang_qc/models/pacbio/qc_data.py @@ -20,9 +20,14 @@ # You should have received a copy of the GNU General Public License along with # this program. If not, see -from pydantic import BaseModel, ConfigDict, Field +from statistics import mean, stdev +from typing import Any + +from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic.dataclasses import dataclass from lang_qc.db.mlwh_schema import PacBioRunWellMetrics +from lang_qc.util.errors import MissingLimsDataError from lang_qc.util.type_checksum import PacBioProductSHA256 @@ -175,7 +180,10 @@ class SampleDeplexingStats(BaseModel): percentage_total_reads: float | None -class QCPoolMetrics(BaseModel): +@dataclass(kw_only=True, frozen=True) +class QCPoolMetrics: + + db_well: PacBioRunWellMetrics = Field(init_var=True) pool_coeff_of_variance: float | None = Field( title="Coefficient of variance for reads in the pool", description="Percentage of the standard deviation w.r.t. mean, when pool is more than one", @@ -183,3 +191,61 @@ class QCPoolMetrics(BaseModel): products: list[SampleDeplexingStats] = Field( title="List of products and their metrics" ) + + @model_validator(mode="before") + def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: + """ + Populates this object with the run and well tracking information + from a database row that is passed as an argument. 
+ """ + + db_well_key_name = "db_well" + # https://github.com/pydantic/pydantic-core/blob/main/python/pydantic_core/_pydantic_core.pyi + if db_well_key_name not in values.kwargs: + return values.kwargs + + well: PacBioRunWellMetrics = values.kwargs[db_well_key_name] + if well is None: + raise ValueError(f"None {db_well_key_name} value is not allowed.") + + cov: float = None + sample_stats = [] + + if well.demultiplex_mode and "Instrument" in well.demultiplex_mode: + product_metrics = well.pac_bio_product_metrics + lib_lims_data = [ + product.pac_bio_run + for product in product_metrics + if product.pac_bio_run is not None + ] + if len(lib_lims_data) != len(product_metrics): + raise MissingLimsDataError( + "Partially linked LIMS data or no linked LIMS data" + ) + + if any(p.hifi_num_reads is None for p in product_metrics): + cov = None + else: + hifi_reads = [prod.hifi_num_reads for prod in product_metrics] + cov = stdev(hifi_reads) / mean(hifi_reads) * 100 + + for (i, prod) in enumerate(product_metrics): + sample_stats.append( + SampleDeplexingStats( + id_product=prod.id_pac_bio_product, + tag1_name=lib_lims_data[i].tag_identifier, + tag2_name=lib_lims_data[i].tag2_identifier, + deplexing_barcode=prod.barcode4deplexing, + hifi_read_bases=prod.hifi_read_bases, + hifi_num_reads=prod.hifi_num_reads, + hifi_read_length_mean=prod.hifi_read_length_mean, + hifi_bases_percent=prod.hifi_bases_percent, + percentage_total_reads=( + prod.hifi_num_reads / well.hifi_num_reads * 100 + if (well.hifi_num_reads and prod.hifi_num_reads) + else None + ), + ) + ) + + return {"pool_coeff_of_variance": cov, "products": sample_stats} diff --git a/tests/test_pac_bio_qc_data_well.py b/tests/test_pac_bio_qc_data_well.py index 3be9de90..772a9cb6 100644 --- a/tests/test_pac_bio_qc_data_well.py +++ b/tests/test_pac_bio_qc_data_well.py @@ -2,7 +2,8 @@ from npg_id_generation.pac_bio import PacBioEntity from lang_qc.db.helper.wells import WellWh -from lang_qc.models.pacbio.qc_data import 
QCDataWell +from lang_qc.models.pacbio.qc_data import QCDataWell, QCPoolMetrics +from lang_qc.util.errors import MissingLimsDataError from tests.fixtures.sample_data import multiplexed_run, simplex_run @@ -103,47 +104,60 @@ def test_creating_qc_data_well(mlwhdb_test_session, mlwhdb_load_runs): def test_pool_metrics_from_single_sample_well(mlwhdb_test_session, simplex_run): - helper = WellWh(session=mlwhdb_test_session) + id = PacBioEntity( run_name=simplex_run.pac_bio_run_name, well_label=simplex_run.well_label, plate_number=simplex_run.plate_number, ).hash_product_id() + helper = WellWh(session=mlwhdb_test_session) + row = helper.get_mlwh_well_by_product_id(id) - metrics = helper.get_metrics_by_well_product_id(id) - assert metrics is None, "Got no metrics for a one-sample well" + metric = QCPoolMetrics(db_well=row) + assert metric.pool_coeff_of_variance is None + assert metric.products == [] def test_pool_metrics_from_well(mlwhdb_test_session, multiplexed_run): - helper = WellWh(session=mlwhdb_test_session) + id = PacBioEntity(run_name="RUN", well_label="B1", plate_number=1).hash_product_id() - metrics = helper.get_metrics_by_well_product_id(id) + helper = WellWh(session=mlwhdb_test_session) + row = helper.get_mlwh_well_by_product_id(id) + metrics_via_db = QCPoolMetrics(db_well=row) + metrics_direct = QCPoolMetrics( + pool_coeff_of_variance=metrics_via_db.pool_coeff_of_variance, + products=metrics_via_db.products, + ) - assert metrics, "Two samples means we get a metrics response" - assert ( - int(metrics.pool_coeff_of_variance) == 47 - ), "Variance between 20 and 10 is ~47%" + for metrics in [metrics_via_db, metrics_direct]: + assert ( + int(metrics.pool_coeff_of_variance) == 47 + ), "Variance between 20 and 10 is ~47%" - assert metrics.products[0].hifi_read_bases == 100 - assert ( - metrics.products[1].hifi_read_bases == 900 - ), "hifi read base counts are faithfully copied" + assert metrics.products[0].hifi_read_bases == 100 + assert ( + 
metrics.products[1].hifi_read_bases == 900 + ), "hifi read base counts are faithfully copied" + + assert ( + int(metrics.products[0].percentage_total_reads) == 33 + ), "10 of 30 reads is 33.3%" + assert ( + int(metrics.products[1].percentage_total_reads) == 66 + ), "20 of 30 reads is 66.6%" - assert ( - int(metrics.products[0].percentage_total_reads) == 33 - ), "10 of 30 reads is 33.3%" - assert ( - int(metrics.products[1].percentage_total_reads) == 66 - ), "20 of 30 reads is 66.6%" +def test_errors_instantiating_pool_metrics(mlwhdb_test_session): -def test_pool_metrics_from_well(mlwhdb_test_session): + with pytest.raises(ValueError, match=r"None db_well value is not allowed."): + QCPoolMetrics(db_well=None) id = PacBioEntity( run_name="TRACTION-RUN-1140", well_label="C1", plate_number=2 ).hash_product_id() helper = WellWh(session=mlwhdb_test_session) + row = helper.get_mlwh_well_by_product_id(id) with pytest.raises( - Exception, match=r"Partially linked LIMS data or no linked LIMS data" + MissingLimsDataError, match=r"Partially linked LIMS data or no linked LIMS data" ): - helper.get_metrics_by_well_product_id(id) + QCPoolMetrics(db_well=row) From ba41cacda71e4c62a7bd7042eeeab3699b23076e Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Tue, 18 Jun 2024 16:40:51 +0000 Subject: [PATCH 25/46] Fix a testing issue with vue-test-utils and changing .find() behaviour --- frontend/src/components/WellTable.vue | 2 +- frontend/src/views/__tests__/WellsByRun.spec.js | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/frontend/src/components/WellTable.vue b/frontend/src/components/WellTable.vue index 2fcc6ff8..29c06998 100644 --- a/frontend/src/components/WellTable.vue +++ b/frontend/src/components/WellTable.vue @@ -30,7 +30,7 @@ defineEmits(['wellSelected']) Well time complete - {{ wellObj.run_name }} + {{ wellObj.run_name }} { ] ) await wrapper.setProps({runName: ['TRACTION-RUN-211', 'TRACTION-RUN-210']}) + await flushPromises() - 
test('Table now contains wells from both runs', () => { - const table = wrapper.get('table') - expect(table.exists()).toBe(true) + const table = wrapper.get('table') + expect(table.exists()).toBe(true) - expect(table.find('TRACTION-RUN-211').exists()).toBe(true) - expect(table.find('TRACTION-RUN-210').exists()).toBe(true) + console.log(table.html()) + expect(table.find("td#TRACTION-RUN-211").exists()).toBe(true) + expect(table.find("td#TRACTION-RUN-210").exists()).toBe(true) - const rows = table.findAll('tr') - expect(rows.length).toEqual(4) - }) + const rows = table.findAll('tr') + expect(rows.length).toEqual(4) }) }) From 6c7590eeb26271a0ebb3bcab30254d88abb2f182 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Tue, 18 Jun 2024 16:41:10 +0000 Subject: [PATCH 26/46] Implement a helper for loading pool metrics --- frontend/src/utils/__tests__/langqc.spec.js | 3 +++ frontend/src/utils/langqc.js | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/frontend/src/utils/__tests__/langqc.spec.js b/frontend/src/utils/__tests__/langqc.spec.js index a1fc14de..eefe81d0 100644 --- a/frontend/src/utils/__tests__/langqc.spec.js +++ b/frontend/src/utils/__tests__/langqc.spec.js @@ -95,6 +95,9 @@ describe('Example fake remote api call', () => { client.getWellsForRunPromise('blah') expect(fetch.mock.calls[6][0]).toEqual('/api/pacbio/run/blah?page_size=100&page=1') + + client.getPoolMetrics('A12345'); + expect(fetch.mock.calls[7][0]).toEqual('/api/pacbio/products/A12345/seq_level/pool') }); }); diff --git a/frontend/src/utils/langqc.js b/frontend/src/utils/langqc.js index 2e315b59..daf88570 100644 --- a/frontend/src/utils/langqc.js +++ b/frontend/src/utils/langqc.js @@ -118,4 +118,10 @@ export default class LangQc { } ) } + + getPoolMetrics(id_product) { + // Use the product metrics endpoint to get additional metrics + // for a well. 
+ return this.fetchWrapper(this.buildUrl(['products', id_product, 'seq_level', 'pool'])); + } } From d755afd736cfccbdcfafc15f6491e2e2e3e0ca5d Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Tue, 9 Jul 2024 16:24:53 +0000 Subject: [PATCH 27/46] Scale pool stats to gigabases/round to 2dp for presentation --- lang_qc/models/pacbio/qc_data.py | 12 ++++++------ tests/fixtures/sample_data.py | 12 ++++++------ tests/test_pac_bio_qc_data_well.py | 16 ++++++++-------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/lang_qc/models/pacbio/qc_data.py b/lang_qc/models/pacbio/qc_data.py index 6aa6a0a4..ca29192d 100644 --- a/lang_qc/models/pacbio/qc_data.py +++ b/lang_qc/models/pacbio/qc_data.py @@ -173,7 +173,7 @@ class SampleDeplexingStats(BaseModel): tag1_name: str | None tag2_name: str | None deplexing_barcode: str | None - hifi_read_bases: int | None + hifi_read_bases: float | None = Field(title="HiFi read bases (Gb)") hifi_num_reads: int | None hifi_read_length_mean: float | None hifi_bases_percent: float | None @@ -208,7 +208,7 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: if well is None: raise ValueError(f"None {db_well_key_name} value is not allowed.") - cov: float = None + cov: float | None = None sample_stats = [] if well.demultiplex_mode and "Instrument" in well.demultiplex_mode: @@ -227,21 +227,21 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: cov = None else: hifi_reads = [prod.hifi_num_reads for prod in product_metrics] - cov = stdev(hifi_reads) / mean(hifi_reads) * 100 + cov = round(stdev(hifi_reads) / mean(hifi_reads) * 100, 2) - for (i, prod) in enumerate(product_metrics): + for i, prod in enumerate(product_metrics): sample_stats.append( SampleDeplexingStats( id_product=prod.id_pac_bio_product, tag1_name=lib_lims_data[i].tag_identifier, tag2_name=lib_lims_data[i].tag2_identifier, deplexing_barcode=prod.barcode4deplexing, - hifi_read_bases=prod.hifi_read_bases, + hifi_read_bases=convert_to_gigabase(prod, 
"hifi_read_bases"), hifi_num_reads=prod.hifi_num_reads, hifi_read_length_mean=prod.hifi_read_length_mean, hifi_bases_percent=prod.hifi_bases_percent, percentage_total_reads=( - prod.hifi_num_reads / well.hifi_num_reads * 100 + round(prod.hifi_num_reads / well.hifi_num_reads * 100, 2) if (well.hifi_num_reads and prod.hifi_num_reads) else None ), diff --git a/tests/fixtures/sample_data.py b/tests/fixtures/sample_data.py index e86fbb5f..436cc67d 100644 --- a/tests/fixtures/sample_data.py +++ b/tests/fixtures/sample_data.py @@ -59,9 +59,9 @@ def simplex_run(request, mlwhdb_test_session): tags=tag1, ).hash_product_id(), qc=1, - hifi_read_bases=900, + hifi_read_bases=90000000, hifi_num_reads=10, - hifi_read_length_mean=90, + hifi_read_length_mean=9000000, barcode_quality_score_mean=34, hifi_bases_percent=90.001, pac_bio_run_well_metrics=well_metrics_a1, @@ -148,9 +148,9 @@ def multiplexed_run(mlwhdb_test_session): tags=tag1, ).hash_product_id(), qc=1, - hifi_read_bases=900, + hifi_read_bases=90000000, hifi_num_reads=20, - hifi_read_length_mean=45, + hifi_read_length_mean=4500000, barcode_quality_score_mean=34, hifi_bases_percent=90.001, pac_bio_run_well_metrics=well_metrics_b1, @@ -180,9 +180,9 @@ def multiplexed_run(mlwhdb_test_session): tags=tag1_2, ).hash_product_id(), qc=1, - hifi_read_bases=100, + hifi_read_bases=10000000, hifi_num_reads=10, - hifi_read_length_mean=10, + hifi_read_length_mean=1000000, barcode_quality_score_mean=34, hifi_bases_percent=100.00, pac_bio_run_well_metrics=well_metrics_b1, diff --git a/tests/test_pac_bio_qc_data_well.py b/tests/test_pac_bio_qc_data_well.py index 772a9cb6..fb83be2f 100644 --- a/tests/test_pac_bio_qc_data_well.py +++ b/tests/test_pac_bio_qc_data_well.py @@ -132,19 +132,19 @@ def test_pool_metrics_from_well(mlwhdb_test_session, multiplexed_run): for metrics in [metrics_via_db, metrics_direct]: assert ( int(metrics.pool_coeff_of_variance) == 47 - ), "Variance between 20 and 10 is ~47%" + ), "Variance between 20 reads and 10 
reads is ~47%" - assert metrics.products[0].hifi_read_bases == 100 + assert metrics.products[0].hifi_read_bases == 0.01 assert ( - metrics.products[1].hifi_read_bases == 900 - ), "hifi read base counts are faithfully copied" + metrics.products[1].hifi_read_bases == 0.09 + ), "hifi read base counts are scaled to Gigabases" assert ( - int(metrics.products[0].percentage_total_reads) == 33 - ), "10 of 30 reads is 33.3%" + metrics.products[0].percentage_total_reads == 33.33 + ), "10Mb of 30Mb reads is 33.33% (2 d.p.)" assert ( - int(metrics.products[1].percentage_total_reads) == 66 - ), "20 of 30 reads is 66.6%" + metrics.products[1].percentage_total_reads == 66.67 + ), "20Mb of 30Mb reads is 66.67% (2 d.p.)" def test_errors_instantiating_pool_metrics(mlwhdb_test_session): From 389f51fcf569b33e0c348e0dc65e321dba3379e3 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Tue, 9 Jul 2024 16:26:22 +0000 Subject: [PATCH 28/46] App was relying on nested components importing ElToolTip --- frontend/src/App.vue | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/App.vue b/frontend/src/App.vue index ca6dfc25..1e4c61cf 100644 --- a/frontend/src/App.vue +++ b/frontend/src/App.vue @@ -1,7 +1,7 @@ + + diff --git a/frontend/src/components/__tests__/PoolStats.spec.js b/frontend/src/components/__tests__/PoolStats.spec.js new file mode 100644 index 00000000..fe8d33d5 --- /dev/null +++ b/frontend/src/components/__tests__/PoolStats.spec.js @@ -0,0 +1,60 @@ +import { describe, expect, test } from 'vitest' +import { mount } from '@vue/test-utils' +import ElementPlus from 'element-plus' + +import PoolStats from '../PoolStats.vue' + +const wrapper = mount(PoolStats, { + global: { + plugins: [ElementPlus], + }, + props: { + pool: { + pool_coeff_of_variance: 47.2, + products: [{ + id_product: 'A'.repeat(64), + tag1_name: 'TTTTTTTT', + tag2_name: null, + deplexing_barcode: 'bc10--bc10', + hifi_read_bases: 900, + hifi_num_reads: 20, + hifi_read_length_mean: 45, + 
hifi_bases_percent: 90.001, + percentage_total_reads: 66.6 + },{ + id_product: 'B'.repeat(64), + tag1_name: 'GGGGGGGG', + tag2_name: null, + deplexing_barcode: 'bc11--bc11', + hifi_read_bases: 100, + hifi_num_reads: 10, + hifi_read_length_mean: 10, + hifi_bases_percent: 100, + percentage_total_reads: 33.3 + }] + } + } +}) + +describe('Create poolstats table with good data', () => { + test('Component is "folded" by default', () => { + expect(wrapper.getComponent('transition-stub').attributes()['appear']).toEqual('false') + }) + + test('Coefficient of variance showing', async () => { + let topStat = wrapper.find('p') + await topStat.trigger('focus') + expect(topStat.classes('el-tooltip__trigger')).toBeTruthy() + + expect(topStat.text()).toEqual('Coefficient of Variance: 47.2') + }) + + test('Table looks about right', () => { + let rows = wrapper.findAll('tr') + expect(rows.length).toEqual(3) + + // Check tag 1 has been set + expect(rows[1].find('td').text()).toEqual('TTTTTTTT') + expect(rows[2].find('td').text()).toEqual('GGGGGGGG') + }) +}) \ No newline at end of file From 074b2a4dc5015cc1ab898b9d7ed1ada97568bdba Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Wed, 10 Jul 2024 11:34:11 +0000 Subject: [PATCH 31/46] Add PoolStats component to QC View and load data on-demand. 
Fake some HTTP responses in testing --- frontend/src/components/QcView.vue | 33 +++++++++++++++++-- .../src/components/__tests__/QcView.spec.js | 5 +++ .../src/views/__tests__/WellsByRun.spec.js | 6 ++-- 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/frontend/src/components/QcView.vue b/frontend/src/components/QcView.vue index 0ab4c6aa..ce0a84ca 100644 --- a/frontend/src/components/QcView.vue +++ b/frontend/src/components/QcView.vue @@ -3,15 +3,36 @@ * An information view containing run data and metrics useful for QC assessment */ - import { computed } from "vue"; + import { computed, ref, watch } from "vue"; import groupMetrics from "../utils/metrics.js"; import { combineLabelWithPlate } from "../utils/text.js" + import PoolStats from "./PoolStats.vue"; + import LangQc from "../utils/langqc"; + + + const dataClient = new LangQc() const props = defineProps({ // Well object representing one prepared input for the instrument // Expects content in the form of lang_qc/models/pacbio/well.py:PacBioWellFull well: Object, - }); + }) + + const poolStats = ref(null) + watch(() => props.well, () => { + poolStats.value = null // empty in case next well doesn't have a pool + dataClient.getPoolMetrics(props.well.id_product).then( + (response) => { poolStats.value = response } + ).catch((error) => { + if (error.message.match("Conflict")) { + // Nothing to do + } else { + console.log(error) + // make a banner show this error? + } + }) + }, { immediate: true} + ) const slURL = computed(() => { let hostname = props.well.metrics.smrt_link.hostname @@ -97,7 +118,6 @@ } return '' }) -