Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update loading of genomics data #197

Merged
merged 6 commits into from
Jan 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
"editor.formatOnSave": true,
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.codeActionsOnSave": {
"source.fixAll.ruff": true,
"source.organizeImports.ruff": true,
"source.fixAll.ruff": "explicit",
gcroci2 marked this conversation as resolved.
Show resolved Hide resolved
"source.organizeImports.ruff": "explicit"
}
},
"python.linting.prospectorEnabled": false
"python.linting.prospectorEnabled": false,
"python.analysis.completeFunctionParens": true,
"python.defaultInterpreterPath": "",
"python.languageServer": "Pylance",
Expand Down
4 changes: 0 additions & 4 deletions src/nplinker/genomics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
from .utils import add_bgc_to_gcf
from .utils import add_strain_to_bgc
from .utils import generate_mappings_genome_id_bgc_id
from .utils import get_bgcs_from_gcfs
from .utils import get_strains_from_bgcs


logging.getLogger(__name__).addHandler(logging.NullHandler())
Expand All @@ -16,8 +14,6 @@
"BGC",
"GCF",
"generate_mappings_genome_id_bgc_id",
"get_bgcs_from_gcfs",
"get_strains_from_bgcs",
"add_bgc_to_gcf",
"add_strain_to_bgc",
]
88 changes: 55 additions & 33 deletions src/nplinker/genomics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def generate_mappings_genome_id_bgc_id(
logger.info("Generated genome-BGC mappings file: %s", output_file)


def add_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]) -> None:
def add_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]) -> tuple[list[BGC], list[BGC]]:
"""Assign a Strain object to `BGC.strain` for input BGCs.

BGC id is used to find the corresponding Strain object. It's possible that
Expand All @@ -74,65 +74,87 @@ def add_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]) -> None:
strains(StrainCollection): A collection of all strain objects.
bgcs(list[BGC]): A list of BGC objects.

Returns:
tuple(list[BGC], list[BGC]): A tuple of two lists of BGC objects. The
first list contains the BGC objects that were updated with a Strain
object; the second list contains the BGC objects that were not updated
because no matching Strain object was found.

Raises:
ValueError: Strain id not found in the strain collection.
ValueError: Multiple strain objects found for a BGC id.
"""
bgc_with_strain = []
bgc_without_strain = []
for bgc in bgcs:
try:
strain_list = strains.lookup(bgc.bgc_id)
except ValueError as e:
raise ValueError(
f"Strain id '{bgc.bgc_id}' from BGC object '{bgc.bgc_id}' "
"not found in the strain collection."
) from e
except ValueError:
bgc_without_strain.append(bgc)
continue
if len(strain_list) > 1:
raise ValueError(
f"Multiple strain objects found for BGC id '{bgc.bgc_id}'."
f"BGC object accept only one strain."
)
bgc.strain = strain_list[0]
bgc_with_strain.append(bgc)

logger.info(
f"{len(bgc_with_strain)} BGC objects updated with Strain object.\n"
f"{len(bgc_without_strain)} BGC objects not updated with Strain object."
)
return bgc_with_strain, bgc_without_strain


def add_bgc_to_gcf(bgcs: list[BGC], gcfs: list[GCF]) -> None:
"""To add BGC objects to GCF object based on GCF's BGC ids.
def add_bgc_to_gcf(
bgcs: list[BGC], gcfs: list[GCF]
) -> tuple[list[GCF], list[GCF], dict[GCF, set[str]]]:
"""Add BGC objects to GCF object based on GCF's BGC ids.

The `GCF.bgc_ids` attribute contains the ids of BGC objects. These ids
are used to look up BGC objects in the input `bgcs` list. The BGC objects
that are found are added to the `bgcs` attribute of the GCF object. Some
BGC ids may not be found in the input `bgcs` list, in which case their BGC
objects are missing from the GCF object.

This method changes the lists `bgcs` and `gcfs` in place.

Args:
bgcs(list[BGC]): A list of BGC objects.
gcfs(list[GCF]): A list of GCF objects.

Raises:
KeyError: BGC id not found in the list of BGC objects.
Returns:
tuple(list[GCF], list[GCF], dict[GCF, set[str]]):
The first list contains GCF objects that are updated with BGC objects;
the second list contains GCF objects that are not updated with BGC objects
because no BGC objects are found;
the dictionary contains GCF objects as keys and, as values, the set of ids
of the BGC objects missing for each of those GCF objects.
"""
bgc_dict = {bgc.bgc_id: bgc for bgc in bgcs}
gcf_with_bgc = []
gcf_without_bgc = []
gcf_missing_bgc: dict[GCF, set[str]] = {}
for gcf in gcfs:
for bgc_id in gcf.bgc_ids:
try:
bgc = bgc_dict[bgc_id]
except KeyError as e:
raise KeyError(
f"BGC id '{bgc_id}' from GCF object '{gcf.gcf_id}' "
"not found in the list of BGC objects."
) from e
except KeyError:
if gcf not in gcf_missing_bgc:
gcf_missing_bgc[gcf] = {bgc_id}
else:
gcf_missing_bgc[gcf].add(bgc_id)
continue
gcf.add_bgc(bgc)


def get_bgcs_from_gcfs(gcfs: list[GCF]) -> list[BGC]:
"""Get all BGC objects from given GCF objects."""
s = set()
for gcf in gcfs:
s |= gcf.bgcs
return list(s)


def get_strains_from_bgcs(bgcs: list[BGC]) -> StrainCollection:
"""Get all strain objects from given BGC objects."""
sc = StrainCollection()
for bgc in bgcs:
if bgc.strain is not None:
sc.add(bgc.strain)
if gcf.bgcs:
gcf_with_bgc.append(gcf)
else:
logger.warning("Strain is None for BGC %s", bgc.bgc_id)
return sc
gcf_without_bgc.append(gcf)

logger.info(
f"{len(gcf_with_bgc)} GCF objects updated with BGC objects.\n"
f"{len(gcf_without_bgc)} GCF objects not updated with BGC objects.\n"
f"{len(gcf_missing_bgc)} GCF objects have missing BGC objects."
)
return gcf_with_bgc, gcf_without_bgc, gcf_missing_bgc
26 changes: 17 additions & 9 deletions src/nplinker/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from nplinker.genomics import add_bgc_to_gcf
from nplinker.genomics import add_strain_to_bgc
from nplinker.genomics import generate_mappings_genome_id_bgc_id
from nplinker.genomics import get_bgcs_from_gcfs
from nplinker.genomics.antismash import AntismashBGCLoader
from nplinker.genomics.bigscape import BigscapeGCFLoader
from nplinker.genomics.mibig import MibigLoader
Expand Down Expand Up @@ -441,7 +440,16 @@ def _load_metabolomics(self):
return True

def _load_genomics(self):
"""Loads genomics data to BGC and GCF objects."""
"""Loads genomics data to BGC and GCF objects.
The attribute of `self.bgcs` is set to the loaded BGC objects that have the Strain object
added (i.e. `BGC.strain` updated). If a BGC object does not have the Strain object, it is
not added to `self.bgcs`.
The attribute of `self.gcfs` is set to the loaded GCF objects that have the Strain objects
added (i.e. `GCF._strains` updated). This means only BGC objects with updated Strain objects
(i.e. `self.bgcs`) can be added to GCF objects.
"""
logger.debug("\nLoading genomics data starts...")

# Step 1: load all BGC objects
Expand All @@ -456,15 +464,15 @@ def _load_genomics(self):
)
raw_gcfs = BigscapeGCFLoader(bigscape_cluster_file).get_gcfs()

# Step 3: assign Strain object to BGC.strain
add_strain_to_bgc(self.strains, raw_bgcs)
# Step 3: add Strain object to BGC
bgc_with_strain, _ = add_strain_to_bgc(self.strains, raw_bgcs)

# Step 4: assign BGC objects to GCF.bgcs
add_bgc_to_gcf(raw_bgcs, raw_gcfs)
# Step 4: add BGC objects to GCF
gcf_with_bgc, _, _ = add_bgc_to_gcf(bgc_with_strain, raw_gcfs)

# Step 5: get GCF objects and their BGC members
self.gcfs = raw_gcfs
self.bgcs = get_bgcs_from_gcfs(self.gcfs)
# Step 5: set attributes of self.bgcs and self.gcfs with valid objects
self.bgcs = bgc_with_strain
self.gcfs = gcf_with_bgc

logger.debug("Loading genomics data completed\n")
return True
Expand Down
153 changes: 57 additions & 96 deletions tests/genomics/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
from nplinker.genomics import add_bgc_to_gcf
from nplinker.genomics import add_strain_to_bgc
from nplinker.genomics import generate_mappings_genome_id_bgc_id
from nplinker.genomics import get_bgcs_from_gcfs
from nplinker.genomics import get_strains_from_bgcs
from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME
from nplinker.strain import Strain
from nplinker.strain_collection import StrainCollection
Expand Down Expand Up @@ -71,106 +69,69 @@ def test_generate_mappings_genome_id_bgc_id_empty_dir(tmp_path, caplog):


@pytest.fixture
def strain_collection() -> StrainCollection:
"""Return a StrainCollection object."""
sc = StrainCollection()

strain = Strain("STRAIN_01")
strain.add_alias("BGC_01")
sc.add(strain)

strain = Strain("STRAIN_02")
strain.add_alias("BGC_02")
strain.add_alias("BGC_02_1")
sc.add(strain)

strain = Strain("SAMPLE_BGC_03")
sc.add(strain)

return sc


@pytest.fixture
def bgc_list() -> list[BGC]:
def bgcs() -> list[BGC]:
"""Return a list of BGC objects."""
return [BGC("BGC_01", "NPR"), BGC("BGC_02", "Alkaloid"), BGC("SAMPLE_BGC_03", "Polyketide")]


@pytest.fixture
def gcf_list() -> list[GCF]:
"""Return a list of GCF objects."""
gcf1 = GCF("1")
gcf1.bgc_ids |= {"BGC_01"}
gcf2 = GCF("2")
gcf2.bgc_ids |= {"BGC_02", "SAMPLE_BGC_03"}
return [gcf1, gcf2]


@pytest.fixture
def gcf_list_error() -> list[GCF]:
"""Return a list of GCF objects for testing errors."""
gcf1 = GCF("1")
gcf1.bgc_ids |= {"SAMPLE_BGC_03", "BGC_04"}
return [gcf1]
return [BGC("BGC_01", "NPR"), BGC("BGC_02", "Alkaloid"), BGC("BGC_03", "Polyketide")]


def test_add_strain_to_bgc(strain_collection, bgc_list):
def test_add_strain_to_bgc(bgcs):
"""Test add_strain_to_bgc function."""
for bgc in bgc_list:
assert bgc.strain is None
add_strain_to_bgc(strain_collection, bgc_list)
for bgc in bgc_list:
assert bgc.strain is not None
assert bgc_list[0].strain.id == "STRAIN_01"
assert bgc_list[1].strain.id == "STRAIN_02"
assert bgc_list[2].strain.id == "SAMPLE_BGC_03"


def test_add_strain_to_bgc_error(strain_collection):
strain1 = Strain("STRAIN_01")
strain1.add_alias("BGC_01")
strain2 = Strain("STRAIN_02")
strain2.add_alias("BGC_02")
strain2.add_alias("BGC_02_1")
strain3 = Strain("STRAIN_03")
strains = StrainCollection()
strains.add(strain1)
strains.add(strain2)
strains.add(strain3)

bgc_with_strain, bgc_without_strain = add_strain_to_bgc(strains, bgcs)

assert len(bgc_with_strain) == 2
assert len(bgc_without_strain) == 1
assert bgc_with_strain == [bgcs[0], bgcs[1]]
assert bgc_without_strain == [bgcs[2]]
assert bgc_with_strain[0].strain == strain1
assert bgc_with_strain[1].strain == strain2
assert bgc_without_strain[0].strain is None


def test_add_strain_to_bgc_error(bgcs):
"""Test add_strain_to_bgc function error."""
bgcs = [BGC("BGC_04", "NPR")]
strain1 = Strain("STRAIN_01")
strain1.add_alias("BGC_01")
strain2 = Strain("STRAIN_02")
strain2.add_alias("BGC_01")
strains = StrainCollection()
strains.add(strain1)
strains.add(strain2)

with pytest.raises(ValueError) as e:
add_strain_to_bgc(strain_collection, bgcs)
assert "Strain id 'BGC_04' from BGC object 'BGC_04' not found" in e.value.args[0]
add_strain_to_bgc(strains, bgcs)

assert "Multiple strain objects found for BGC id 'BGC_01'" in e.value.args[0]


def test_add_bgc_to_gcf(bgc_list, gcf_list):
def test_add_bgc_to_gcf(bgcs):
"""Test add_bgc_to_gcf function."""
assert gcf_list[0].bgc_ids == {"BGC_01"}
assert gcf_list[1].bgc_ids == {"BGC_02", "SAMPLE_BGC_03"}
assert len(gcf_list[0].bgcs) == 0
assert len(gcf_list[1].bgcs) == 0
add_bgc_to_gcf(bgc_list, gcf_list)
assert gcf_list[0].bgc_ids == {"BGC_01"}
assert gcf_list[1].bgc_ids == {"BGC_02", "SAMPLE_BGC_03"}
assert len(gcf_list[0].bgcs) == 1
assert len(gcf_list[1].bgcs) == 2
assert gcf_list[0].bgcs == set(bgc_list[:1])
assert gcf_list[1].bgcs == set(bgc_list[1:])


def test_add_bgc_to_gcf_error(bgc_list, gcf_list_error):
"""Test add_bgc_to_gcf function error."""
assert gcf_list_error[0].bgc_ids == {"SAMPLE_BGC_03", "BGC_04"}
assert len(gcf_list_error[0].bgcs) == 0
with pytest.raises(KeyError) as e:
add_bgc_to_gcf(bgc_list, gcf_list_error)
assert "BGC id 'BGC_04' from GCF object '1' not found" in e.value.args[0]


def test_get_bgcs_from_gcfs(bgc_list, gcf_list):
"""Test get_bgcs_from_gcfs function."""
add_bgc_to_gcf(bgc_list, gcf_list)
bgcs = get_bgcs_from_gcfs(gcf_list)
assert isinstance(bgcs, list)
assert len(bgcs) == 3
for i in bgcs:
assert isinstance(i, BGC)


def test_get_strains_from_bgcs(strain_collection, bgc_list):
"""Test get_strains_from_bgcs function."""
add_strain_to_bgc(strain_collection, bgc_list)
strains = get_strains_from_bgcs(bgc_list)
assert isinstance(strains, StrainCollection)
assert strains == strain_collection
gcf1 = GCF("1")
gcf1.bgc_ids = {"BGC_01", "BGC_02"}
gcf2 = GCF("2")
gcf2.bgc_ids = {"BGC_03", "BGC_missing_01"}
gcf3 = GCF("3")
gcf3.bgc_ids = {"BGC_missing_02", "BGC_missing_03"}
gcfs = [gcf1, gcf2, gcf3]

gcf_with_bgc, gcf_without_bgc, gcf_missing_bgc = add_bgc_to_gcf(bgcs, gcfs)

assert len(gcf_with_bgc) == 2
assert len(gcf_without_bgc) == 1
assert len(gcf_missing_bgc) == 2
assert gcf_with_bgc == [gcf1, gcf2]
assert gcf_without_bgc == [gcf3]
assert gcf_missing_bgc == {gcf2: {"BGC_missing_01"}, gcf3: {"BGC_missing_02", "BGC_missing_03"}}
assert gcf_with_bgc[0].bgcs == {bgcs[0], bgcs[1]}
assert gcf_with_bgc[1].bgcs == {bgcs[2]}
assert gcf_without_bgc[0].bgcs == set()
Loading