Skip to content

Commit

Permalink
fixed issue with incorrect parsing of reaction times from dir names
Browse files Browse the repository at this point in the history
  • Loading branch information
haeussma committed Oct 9, 2024
1 parent 533ad42 commit 3f0c653
Show file tree
Hide file tree
Showing 12 changed files with 246 additions and 277 deletions.
95 changes: 44 additions & 51 deletions chromatopy/readers/abstractreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ def _parse_time_and_unit(

try:
filenames = [Path(f) for f in data["file_paths"]]
if len(filenames) == 0:
raise KeyError
except KeyError:
path = Path(data["dirpath"])
if not path.exists():
Expand All @@ -101,71 +103,62 @@ def _parse_time_and_unit(

        # get all filenames of regular files in the directory, excluding hidden files

filenames = [
f for f in path.iterdir() if f.is_file() and not f.name.startswith(".")
]
filenames = [f for f in path.iterdir() if not f.name.startswith(".")]

pattern = r"(\d+(\.\d+)?)\s*[_-]?\s*(min|minutes?|sec|seconds?|hours?)"
pattern = r".*?(\d+(\.\d+)?)\s*[_-]?\s*(min|minutes?|sec|seconds?|hours?)"

# check if all filenames contain a reaction time and unit
if all(re.search(pattern, f.name) for f in filenames):
# extract all reaction times and units from the filenames
rctn_time_path_dict: dict[float, str] = {}
units = []
# extract all reaction times and units from the filenames or parent directories
rctn_time_path_dict: dict[float, str] = {}
units = []

for file in filenames:
match = re.search(pattern, file.name)
assert (
match is not None
), f"Could not parse reaction time from '{file.name}'."
for file in filenames:
match = re.search(pattern, file.name)
if not match:
match = re.search(pattern, file.parent.name)
if match:
reaction_time = float(match.group(1))
units.append(match.group(3))
unit = match.group(3)
if reaction_time in rctn_time_path_dict:
logger.debug(
f"Duplicate reaction time '{reaction_time}' found in directory '{data['dirpath']}'."
)
raise MetadataExtractionError(
f"Reaction times in directory '{data['dirpath']}' are not unique."
)
rctn_time_path_dict[reaction_time] = str(file.absolute())

if not len(list(rctn_time_path_dict.keys())) == len(filenames):
logger.debug(
f"Reaction times in directory '{data['dirpath']}' are not unique."
)
units.append(unit)
else:
logger.debug(f"Could not parse reaction time from '{file}'.")
raise MetadataExtractionError(
f"Reaction times in directory '{data['dirpath']}' are not unique."
f"Could not parse reaction time from '{file}'."
)

# check if all units are the same
if not all(unit == units[0] for unit in units):
logger.debug(
f"Units of reaction times in directory '{data['dirpath']}' are not consistent: {units}"
)
raise UnitConsistencyError(
f"Units of reaction times in directory '{data['dirpath']}' are not consistent: {units}"
)

try:
unit_definition = AbstractReader._map_unit_str_to_UnitDefinition(
units[0]
)
except ValueError:
logger.debug(
f"Unit {units[0]} from directory '{data['dirpath']}' not recognized."
)
raise MetadataExtractionError(
f"Unit {units[0]} from directory '{data['dirpath']}' not recognized."
)

data["file_paths"] = []
data["reaction_times"] = []
for time, full_path in sorted(rctn_time_path_dict.items()):
data["reaction_times"].append(time)
data["file_paths"].append(full_path)
data["time_unit"] = unit_definition
# check if all units are the same
if not all(unit == units[0] for unit in units):
logger.debug(
f"Units of reaction times in directory '{data['dirpath']}' are not consistent: {units}"
)
raise UnitConsistencyError(
f"Units of reaction times in directory '{data['dirpath']}' are not consistent: {units}"
)

else:
try:
unit_definition = AbstractReader._map_unit_str_to_UnitDefinition(units[0])
except ValueError:
logger.debug(
f"Reaction times and units could not be parsed from filenames in directory '{data['dirpath']}'."
f"Unit {units[0]} from directory '{data['dirpath']}' not recognized."
)
raise MetadataExtractionError(
f"Reaction times and units could not be parsed from filenames in directory '{data['dirpath']}'."
f"Unit {units[0]} from directory '{data['dirpath']}' not recognized."
)

data["file_paths"] = []
data["reaction_times"] = []
for time, full_path in sorted(rctn_time_path_dict.items()):
data["reaction_times"].append(time)
data["file_paths"].append(full_path)
data["time_unit"] = unit_definition

@staticmethod
def _map_unit_str_to_UnitDefinition(
unit_str: str,
Expand Down
47 changes: 3 additions & 44 deletions chromatopy/readers/agilent_csv.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,25 @@
from pathlib import Path
from typing import Any

import pandas as pd
from loguru import logger

from chromatopy.model import Chromatogram, Measurement, Peak
from chromatopy.readers.abstractreader import AbstractReader


class AgilentCSVReader(AbstractReader):
def model_post_init(self, __context: Any) -> None:
if not self.reaction_times or not self.time_unit or not self.file_paths:
logger.debug(
"Collecting file paths without reaction time and unit parsing."
)
self._get_file_paths()

def read(self) -> list[Measurement]:
"""Reads chromatographic data from the specified Agilent CSV files.
Returns:
list[Measurement]: A list of Measurement objects representing the chromatographic data.
"""
csv_paths = self._get_file_paths()

assert len(self.reaction_times) == len(csv_paths), f"""
assert len(self.reaction_times) == len(self.file_paths), f"""
The number of reaction times {len(self.reaction_times)} does not match the number of
'RESULTS.CSV' files {len(csv_paths)}.
'RESULTS.CSV' files {len(self.file_paths)}.
"""

measurements = []
for path_idx, (csv_path, reaction_time) in enumerate(
zip(csv_paths, self.reaction_times)
zip(self.file_paths, self.reaction_times)
):
peaks = self._read_peaks_from_csv(csv_path)
chromatogram = Chromatogram(peaks=peaks)
Expand All @@ -53,35 +41,6 @@ def read(self) -> list[Measurement]:

return measurements

def _get_file_paths(self) -> list[str]:
"""Collects the file paths of the Agilent CSV files."""
directory = Path(self.dirpath)
target_paths = []

if directory.is_dir():
dirs = sorted(directory.iterdir())
found_count = 0

for folder in dirs:
if (
folder.is_dir()
and folder.name.endswith(".D")
and not folder.name.startswith(".")
):
for file in folder.iterdir():
if file.name == "RESULTS.CSV" and not file.name.startswith("."):
found_count += 1
target_paths.append(str(file.absolute()))

if found_count == 0:
raise FileNotFoundError(
f"No 'RESULTS.CSV' file found in '{self.dirpath}'."
)
else:
target_paths = [self.dirpath]

return sorted(target_paths)

def _read_peaks_from_csv(self, path: str, skiprows: int = 6) -> list[Peak]:
"""Reads peaks from an Agilent CSV file."""
peaks = []
Expand Down
19 changes: 15 additions & 4 deletions chromatopy/tools/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,14 +465,15 @@ def read_agilent(
from chromatopy.readers.agilent_txt import AgilentTXTReader

directory = Path(path)

txt_paths = [
str(f.absolute())
for f in directory.rglob("*.TXT")
for f in directory.rglob("Report.TXT")
if f.parent.parent == directory
]
csv_paths = [
str(f.absolute())
for f in directory.rglob("*.csv")
for f in directory.rglob("RESULTS.CSV")
if f.parent.parent == directory
]

Expand All @@ -491,12 +492,14 @@ def read_agilent(
if data["reaction_times"] is None:
data.pop("reaction_times")

try:
if not csv_paths and txt_paths:
data["file_paths"] = txt_paths # type: ignore
measurements = AgilentTXTReader(**data).read() # type: ignore
except FileNotFoundError:
elif csv_paths and not txt_paths:
data["file_paths"] = csv_paths # type: ignore
measurements = AgilentCSVReader(**data).read() # type: ignore
else:
raise IOError(f"No 'REPORT.TXT' or 'RESULTS.CSV' files found in '{path}'.")

if id is None:
id = path
Expand Down Expand Up @@ -1153,3 +1156,11 @@ def visualize_spectra(self, dark_mode: bool = False) -> go.Figure:
)

return fig


if __name__ == "__main__":
    # NOTE(review): ad-hoc debug driver committed with the fix — hard-coded
    # absolute path to a local dataset; will fail on any other machine.
    # Consider removing before release or guarding behind an env variable.
    path = "/Users/max/Documents/GitHub/eyring-kinetics/data/hetero/RAU-R503"

    # Read an Agilent results directory and print the parsed reaction time of
    # each measurement, to manually verify the dir-name time parsing fix.
    ana = ChromAnalyzer.read_agilent(path, ph=7.4, temperature=37)
    for meas in ana.measurements:
        print(meas.reaction_time)
Binary file removed docs/examples/data/asm.zip
Binary file not shown.
Loading

0 comments on commit 3f0c653

Please sign in to comment.