NPLinker · CunliangGeng · Jan 24, 2024 · Dec 11, 2023 · Dec 11, 2023 · Dec 11, 2023
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -9,11 +9,11 @@
     "editor.formatOnSave": true,
     "editor.defaultFormatter": "charliermarsh.ruff",
     "editor.codeActionsOnSave": {
-      "source.fixAll.ruff": true,
-      "source.organizeImports.ruff": true,
+        "source.fixAll.ruff": "explicit",
+        "source.organizeImports.ruff": "explicit"
     }
   },
-  "python.linting.prospectorEnabled": false
+  "python.linting.prospectorEnabled": false,
   "python.analysis.completeFunctionParens": true,
   "python.defaultInterpreterPath": "",
   "python.languageServer": "Pylance",

diff --git a/src/nplinker/genomics/__init__.py b/src/nplinker/genomics/__init__.py
@@ -5,8 +5,6 @@
 from .utils import add_bgc_to_gcf
 from .utils import add_strain_to_bgc
 from .utils import generate_mappings_genome_id_bgc_id
-from .utils import get_bgcs_from_gcfs
-from .utils import get_strains_from_bgcs
 
 
 logging.getLogger(__name__).addHandler(logging.NullHandler())
@@ -16,8 +14,6 @@
     "BGC",
     "GCF",
     "generate_mappings_genome_id_bgc_id",
-    "get_bgcs_from_gcfs",
-    "get_strains_from_bgcs",
     "add_bgc_to_gcf",
     "add_strain_to_bgc",
 ]
diff --git a/src/nplinker/genomics/utils.py b/src/nplinker/genomics/utils.py
@@ -62,7 +62,7 @@ def generate_mappings_genome_id_bgc_id(
     logger.info("Generated genome-BGC mappings file: %s", output_file)
 
 
-def add_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]) -> None:
+def add_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]) -> tuple[list[BGC], list[BGC]]:
     """Assign a Strain object to `BGC.strain` for input BGCs.
 
     BGC id is used to find the corresponding Strain object. It's possible that
@@ -74,65 +74,87 @@ def add_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]) -> None:
         strains(StrainCollection): A collection of all strain objects.
         bgcs(list[BGC]): A list of BGC objects.
 
+    Returns:
+        tuple(list[BGC], list[BGC]): A tuple of two lists of BGC objects. The
+            first list contains the BGC objects that were assigned a Strain
+            object; the second list contains the BGC objects that were left
+            unchanged because no matching Strain object was found.
+
     Raises:
-        ValueError: Strain id not found in the strain collection.
         ValueError: Multiple strain objects found for a BGC id.
     """
+    bgc_with_strain = []
+    bgc_without_strain = []
     for bgc in bgcs:
         try:
             strain_list = strains.lookup(bgc.bgc_id)
-        except ValueError as e:
-            raise ValueError(
-                f"Strain id '{bgc.bgc_id}' from BGC object '{bgc.bgc_id}' "
-                "not found in the strain collection."
-            ) from e
+        except ValueError:
+            bgc_without_strain.append(bgc)
+            continue
         if len(strain_list) > 1:
             raise ValueError(
                 f"Multiple strain objects found for BGC id '{bgc.bgc_id}'."
                 f"BGC object accept only one strain."
             )
         bgc.strain = strain_list[0]
+        bgc_with_strain.append(bgc)
+
+    logger.info(
+        f"{len(bgc_with_strain)} BGC objects updated with Strain object.\n"
+        f"{len(bgc_without_strain)} BGC objects not updated with Strain object."
+    )
+    return bgc_with_strain, bgc_without_strain
 
 
-def add_bgc_to_gcf(bgcs: list[BGC], gcfs: list[GCF]) -> None:
-    """To add BGC objects to GCF object based on GCF's BGC ids.
+def add_bgc_to_gcf(
+    bgcs: list[BGC], gcfs: list[GCF]
+) -> tuple[list[GCF], list[GCF], dict[GCF, set[str]]]:
+    """Add BGC objects to GCF object based on GCF's BGC ids.
+
+    The `GCF.bgc_ids` attribute contains the ids of BGC objects. These ids
+    are used to look up BGC objects in the input `bgcs` list. The BGC objects
+    that are found are added to the `bgcs` attribute of the GCF object. Some
+    BGC ids may not be found in the input `bgcs` list, in which case their
+    BGC objects are missing from the GCF object.
 
     This method changes the lists `bgcs` and `gcfs` in place.
 
     Args:
         bgcs(list[BGC]): A list of BGC objects.
         gcfs(list[GCF]): A list of GCF objects.
 
-    Raises:
-        KeyError: BGC id not found in the list of BGC objects.
+    Returns:
+        tuple(list[GCF], list[GCF], dict[GCF, set[str]]):
+            The first list contains the GCF objects whose `bgcs` attribute is
+            non-empty after the update; the second list contains the GCF
+            objects whose `bgcs` attribute is still empty after the update;
+            the dictionary maps each GCF object that has missing BGC objects
+            to the set of BGC ids that were not found in the input `bgcs`.
     """
     bgc_dict = {bgc.bgc_id: bgc for bgc in bgcs}
+    gcf_with_bgc = []
+    gcf_without_bgc = []
+    gcf_missing_bgc: dict[GCF, set[str]] = {}
     for gcf in gcfs:
         for bgc_id in gcf.bgc_ids:
             try:
                 bgc = bgc_dict[bgc_id]
-            except KeyError as e:
-                raise KeyError(
-                    f"BGC id '{bgc_id}' from GCF object '{gcf.gcf_id}' "
-                    "not found in the list of BGC objects."
-                ) from e
+            except KeyError:
+                if gcf not in gcf_missing_bgc:
+                    gcf_missing_bgc[gcf] = {bgc_id}
+                else:
+                    gcf_missing_bgc[gcf].add(bgc_id)
+                continue
             gcf.add_bgc(bgc)
 
-
-def get_bgcs_from_gcfs(gcfs: list[GCF]) -> list[BGC]:
-    """Get all BGC objects from given GCF objects."""
-    s = set()
-    for gcf in gcfs:
-        s |= gcf.bgcs
-    return list(s)
-
-
-def get_strains_from_bgcs(bgcs: list[BGC]) -> StrainCollection:
-    """Get all strain objects from given BGC objects."""
-    sc = StrainCollection()
-    for bgc in bgcs:
-        if bgc.strain is not None:
-            sc.add(bgc.strain)
+        if gcf.bgcs:
+            gcf_with_bgc.append(gcf)
         else:
-            logger.warning("Strain is None for BGC %s", bgc.bgc_id)
-    return sc
+            gcf_without_bgc.append(gcf)
+
+    logger.info(
+        f"{len(gcf_with_bgc)} GCF objects updated with BGC objects.\n"
+        f"{len(gcf_without_bgc)} GCF objects not updated with BGC objects.\n"
+        f"{len(gcf_missing_bgc)} GCF objects have missing BGC objects."
+    )
+    return gcf_with_bgc, gcf_without_bgc, gcf_missing_bgc
diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py
@@ -7,7 +7,6 @@
 from nplinker.genomics import add_bgc_to_gcf
 from nplinker.genomics import add_strain_to_bgc
 from nplinker.genomics import generate_mappings_genome_id_bgc_id
-from nplinker.genomics import get_bgcs_from_gcfs
 from nplinker.genomics.antismash import AntismashBGCLoader
 from nplinker.genomics.bigscape import BigscapeGCFLoader
 from nplinker.genomics.mibig import MibigLoader
@@ -441,7 +440,16 @@ def _load_metabolomics(self):
         return True
 
     def _load_genomics(self):
-        """Loads genomics data to BGC and GCF objects."""
+        """Loads genomics data to BGC and GCF objects.
+
+        The `self.bgcs` attribute is set to the loaded BGC objects that have a Strain object
+        assigned (i.e. `BGC.strain` updated). A BGC object for which no Strain object is found is
+        not added to `self.bgcs`.
+
+        The `self.gcfs` attribute is set to the loaded GCF objects that have Strain objects
+        added (i.e. `GCF._strains` updated). This means only BGC objects with an assigned Strain
+        object (i.e. `self.bgcs`) can be added to GCF objects.
+        """
         logger.debug("\nLoading genomics data starts...")
 
         # Step 1: load all BGC objects
@@ -456,15 +464,15 @@ def _load_genomics(self):
         )
         raw_gcfs = BigscapeGCFLoader(bigscape_cluster_file).get_gcfs()
 
-        # Step 3: assign Strain object to BGC.strain
-        add_strain_to_bgc(self.strains, raw_bgcs)
+        # Step 3: add Strain object to BGC
+        bgc_with_strain, _ = add_strain_to_bgc(self.strains, raw_bgcs)
 
-        # Step 4: assign BGC objects to GCF.bgcs
-        add_bgc_to_gcf(raw_bgcs, raw_gcfs)
+        # Step 4: add BGC objects to GCF
+        gcf_with_bgc, _, _ = add_bgc_to_gcf(bgc_with_strain, raw_gcfs)
 
-        # Step 5: get GCF objects and their BGC members
-        self.gcfs = raw_gcfs
-        self.bgcs = get_bgcs_from_gcfs(self.gcfs)
+        # Step 5: set attributes of self.bgcs and self.gcfs with valid objects
+        self.bgcs = bgc_with_strain
+        self.gcfs = gcf_with_bgc
 
         logger.debug("Loading genomics data completed\n")
         return True

diff --git a/tests/genomics/test_utils.py b/tests/genomics/test_utils.py
@@ -6,8 +6,6 @@
 from nplinker.genomics import add_bgc_to_gcf
 from nplinker.genomics import add_strain_to_bgc
 from nplinker.genomics import generate_mappings_genome_id_bgc_id
-from nplinker.genomics import get_bgcs_from_gcfs
-from nplinker.genomics import get_strains_from_bgcs
 from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME
 from nplinker.strain import Strain
 from nplinker.strain_collection import StrainCollection
@@ -71,106 +69,69 @@ def test_generate_mappings_genome_id_bgc_id_empty_dir(tmp_path, caplog):
 
 
 @pytest.fixture
-def strain_collection() -> StrainCollection:
-    """Return a StrainCollection object."""
-    sc = StrainCollection()
-
-    strain = Strain("STRAIN_01")
-    strain.add_alias("BGC_01")
-    sc.add(strain)
-
-    strain = Strain("STRAIN_02")
-    strain.add_alias("BGC_02")
-    strain.add_alias("BGC_02_1")
-    sc.add(strain)
-
-    strain = Strain("SAMPLE_BGC_03")
-    sc.add(strain)
-
-    return sc
-
-
-@pytest.fixture
-def bgc_list() -> list[BGC]:
+def bgcs() -> list[BGC]:
     """Return a list of BGC objects."""
-    return [BGC("BGC_01", "NPR"), BGC("BGC_02", "Alkaloid"), BGC("SAMPLE_BGC_03", "Polyketide")]
-
-
-@pytest.fixture
-def gcf_list() -> list[GCF]:
-    """Return a list of GCF objects."""
-    gcf1 = GCF("1")
-    gcf1.bgc_ids |= {"BGC_01"}
-    gcf2 = GCF("2")
-    gcf2.bgc_ids |= {"BGC_02", "SAMPLE_BGC_03"}
-    return [gcf1, gcf2]
-
-
-@pytest.fixture
-def gcf_list_error() -> list[GCF]:
-    """Return a list of GCF objects for testing errors."""
-    gcf1 = GCF("1")
-    gcf1.bgc_ids |= {"SAMPLE_BGC_03", "BGC_04"}
-    return [gcf1]
+    return [BGC("BGC_01", "NPR"), BGC("BGC_02", "Alkaloid"), BGC("BGC_03", "Polyketide")]
 
 
-def test_add_strain_to_bgc(strain_collection, bgc_list):
+def test_add_strain_to_bgc(bgcs):
     """Test add_strain_to_bgc function."""
-    for bgc in bgc_list:
-        assert bgc.strain is None
-    add_strain_to_bgc(strain_collection, bgc_list)
-    for bgc in bgc_list:
-        assert bgc.strain is not None
-    assert bgc_list[0].strain.id == "STRAIN_01"
-    assert bgc_list[1].strain.id == "STRAIN_02"
-    assert bgc_list[2].strain.id == "SAMPLE_BGC_03"
-
-
-def test_add_strain_to_bgc_error(strain_collection):
+    strain1 = Strain("STRAIN_01")
+    strain1.add_alias("BGC_01")
+    strain2 = Strain("STRAIN_02")
+    strain2.add_alias("BGC_02")
+    strain2.add_alias("BGC_02_1")
+    strain3 = Strain("STRAIN_03")
+    strains = StrainCollection()
+    strains.add(strain1)
+    strains.add(strain2)
+    strains.add(strain3)
+
+    bgc_with_strain, bgc_without_strain = add_strain_to_bgc(strains, bgcs)
+
+    assert len(bgc_with_strain) == 2
+    assert len(bgc_without_strain) == 1
+    assert bgc_with_strain == [bgcs[0], bgcs[1]]
+    assert bgc_without_strain == [bgcs[2]]
+    assert bgc_with_strain[0].strain == strain1
+    assert bgc_with_strain[1].strain == strain2
+    assert bgc_without_strain[0].strain is None
+
+
+def test_add_strain_to_bgc_error(bgcs):
     """Test add_strain_to_bgc function error."""
-    bgcs = [BGC("BGC_04", "NPR")]
+    strain1 = Strain("STRAIN_01")
+    strain1.add_alias("BGC_01")
+    strain2 = Strain("STRAIN_02")
+    strain2.add_alias("BGC_01")
+    strains = StrainCollection()
+    strains.add(strain1)
+    strains.add(strain2)
+
     with pytest.raises(ValueError) as e:
-        add_strain_to_bgc(strain_collection, bgcs)
-    assert "Strain id 'BGC_04' from BGC object 'BGC_04' not found" in e.value.args[0]
+        add_strain_to_bgc(strains, bgcs)
+
+    assert "Multiple strain objects found for BGC id 'BGC_01'" in e.value.args[0]
 
 
-def test_add_bgc_to_gcf(bgc_list, gcf_list):
+def test_add_bgc_to_gcf(bgcs):
     """Test add_bgc_to_gcf function."""
-    assert gcf_list[0].bgc_ids == {"BGC_01"}
-    assert gcf_list[1].bgc_ids == {"BGC_02", "SAMPLE_BGC_03"}
-    assert len(gcf_list[0].bgcs) == 0
-    assert len(gcf_list[1].bgcs) == 0
-    add_bgc_to_gcf(bgc_list, gcf_list)
-    assert gcf_list[0].bgc_ids == {"BGC_01"}
-    assert gcf_list[1].bgc_ids == {"BGC_02", "SAMPLE_BGC_03"}
-    assert len(gcf_list[0].bgcs) == 1
-    assert len(gcf_list[1].bgcs) == 2
-    assert gcf_list[0].bgcs == set(bgc_list[:1])
-    assert gcf_list[1].bgcs == set(bgc_list[1:])
-
-
-def test_add_bgc_to_gcf_error(bgc_list, gcf_list_error):
-    """Test add_bgc_to_gcf function error."""
-    assert gcf_list_error[0].bgc_ids == {"SAMPLE_BGC_03", "BGC_04"}
-    assert len(gcf_list_error[0].bgcs) == 0
-    with pytest.raises(KeyError) as e:
-        add_bgc_to_gcf(bgc_list, gcf_list_error)
-    assert "BGC id 'BGC_04' from GCF object '1' not found" in e.value.args[0]
-
-
-def test_get_bgcs_from_gcfs(bgc_list, gcf_list):
-    """Test get_bgcs_from_gcfs function."""
-    add_bgc_to_gcf(bgc_list, gcf_list)
-    bgcs = get_bgcs_from_gcfs(gcf_list)
-    assert isinstance(bgcs, list)
-    assert len(bgcs) == 3
-    for i in bgcs:
-        assert isinstance(i, BGC)
-
-
-def test_get_strains_from_bgcs(strain_collection, bgc_list):
-    """Test get_strains_from_bgcs function."""
-    add_strain_to_bgc(strain_collection, bgc_list)
-    strains = get_strains_from_bgcs(bgc_list)
-    assert isinstance(strains, StrainCollection)
-    assert strains == strain_collection
+    gcf1 = GCF("1")
+    gcf1.bgc_ids = {"BGC_01", "BGC_02"}
+    gcf2 = GCF("2")
+    gcf2.bgc_ids = {"BGC_03", "BGC_missing_01"}
+    gcf3 = GCF("3")
+    gcf3.bgc_ids = {"BGC_missing_02", "BGC_missing_03"}
+    gcfs = [gcf1, gcf2, gcf3]
+
+    gcf_with_bgc, gcf_without_bgc, gcf_missing_bgc = add_bgc_to_gcf(bgcs, gcfs)
+
+    assert len(gcf_with_bgc) == 2
+    assert len(gcf_without_bgc) == 1
+    assert len(gcf_missing_bgc) == 2
+    assert gcf_with_bgc == [gcf1, gcf2]
+    assert gcf_without_bgc == [gcf3]
+    assert gcf_missing_bgc == {gcf2: {"BGC_missing_01"}, gcf3: {"BGC_missing_02", "BGC_missing_03"}}
+    assert gcf_with_bgc[0].bgcs == {bgcs[0], bgcs[1]}
+    assert gcf_with_bgc[1].bgcs == {bgcs[2]}
+    assert gcf_without_bgc[0].bgcs == set()