Add bigtools bigbed writer
nvictus committed Oct 18, 2024
1 parent 43bda60 commit 1eebe94
Showing 2 changed files with 169 additions and 141 deletions.
95 changes: 58 additions & 37 deletions bioframe/io/bed.py
@@ -597,50 +597,15 @@ def parse_bed_schema(schema: str) -> tuple[int, bool]:
return n, extended


def to_bed(
def to_bed_dataframe(
df: pd.DataFrame,
path: str | pathlib.Path | None = None,
*,
schema: str = "infer",
validate_fields: bool = True,
require_sorted: bool = False,
chromsizes: dict | pd.Series | None = None,
strict_score: bool = False,
replace_na: bool = True,
na_rep: str = "nan",
) -> str | None:
"""Write a DataFrame to a BED file.
Parameters
----------
df : pd.DataFrame
DataFrame to write.
path : str or Path, optional
Path to write the BED file to. If ``None``, the serialized BED file is
returned as a string.
schema : str, optional [default: "infer"]
BED schema to use. If ``"infer"``, the schema is inferred from the
DataFrame's columns.
validate_fields : bool, optional [default: True]
Whether to validate the fields of the BED file.
require_sorted : bool, optional [default: False]
Whether to require the BED file to be sorted.
chromsizes : dict or pd.Series, optional
Chromosome sizes to validate against.
strict_score : bool, optional [default: False]
Whether to strictly enforce validation of the score field (0-1000).
replace_na : bool, optional [default: True]
Whether to replace null values of standard BED fields with
compliant uninformative values.
na_rep : str, optional [default: "nan"]
String representation of null values if written.
Returns
-------
str or None:
The serialized BED file as a string if ``path`` is ``None``, otherwise
``None``.
"""
) -> pd.DataFrame:
if schema == "infer":
n, extended = infer_bed_schema(df)
else:
@@ -712,4 +677,60 @@ def to_bed(
if col in custom_cols:
bed[col] = df[col]

return bed

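A minimal usage sketch of the new to_bed_dataframe helper above, assuming this version of bioframe is importable via the module path of the file being edited (bioframe/io/bed.py); the interval data are illustrative.

import pandas as pd
from bioframe.io.bed import to_bed_dataframe

# Three intervals with only the mandatory BED3 columns; the schema is inferred.
df = pd.DataFrame({
    "chrom": ["chr1", "chr1", "chr2"],
    "start": [0, 100, 0],
    "end": [50, 200, 80],
})

bed = to_bed_dataframe(df)           # validated DataFrame with columns in BED order
print(bed.columns.tolist())          # expected: ['chrom', 'start', 'end']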

def to_bed(
df: pd.DataFrame,
path: str | pathlib.Path | None = None,
*,
schema: str = "infer",
validate_fields: bool = True,
require_sorted: bool = False,
chromsizes: dict | pd.Series | None = None,
strict_score: bool = False,
replace_na: bool = True,
na_rep: str = "nan",
) -> str | None:
"""Write a DataFrame to a BED file.
Parameters
----------
df : pd.DataFrame
DataFrame to write.
path : str or Path, optional
Path to write the BED file to. If ``None``, the serialized BED file is
returned as a string.
schema : str, optional [default: "infer"]
BED schema to use. If ``"infer"``, the schema is inferred from the
DataFrame's columns.
validate_fields : bool, optional [default: True]
Whether to validate the fields of the BED file.
require_sorted : bool, optional [default: False]
Whether to require the BED file to be sorted.
chromsizes : dict or pd.Series, optional
Chromosome sizes to validate against.
strict_score : bool, optional [default: False]
Whether to strictly enforce validation of the score field (0-1000).
replace_na : bool, optional [default: True]
Whether to replace null values of standard BED fields with
compliant uninformative values.
na_rep : str, optional [default: "nan"]
String representation of null values if written.
Returns
-------
str or None:
The serialized BED file as a string if ``path`` is ``None``, otherwise
``None``.
"""
bed = to_bed_dataframe(
df,
schema=schema,
validate_fields=validate_fields,
require_sorted=require_sorted,
chromsizes=chromsizes,
strict_score=strict_score,
replace_na=replace_na,
)
return bed.to_csv(path, sep="\t", na_rep=na_rep, index=False, header=False)
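A hedged sketch of the refactored to_bed, which now delegates validation to to_bed_dataframe and serializes with DataFrame.to_csv; with path=None the BED text is returned as a string. The import path and data below are illustrative.

import pandas as pd
from bioframe.io.bed import to_bed

df = pd.DataFrame({
    "chrom": ["chr1", "chr1"],
    "start": [0, 100],
    "end": [50, 200],
    "name": ["a", "b"],
})

bed_text = to_bed(df)                # schema inferred (BED4 here); returns a string
print(bed_text)

to_bed(df, "example.bed")            # or write a tab-delimited BED file to disk
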
215 changes: 111 additions & 104 deletions bioframe/io/fileops.py
@@ -245,22 +245,21 @@ def read_alignments(fp, chrom=None, start=None, end=None):
raise ImportError("pysam is required to use `read_alignments`") from None

ext = os.path.splitext(fp)[1]
if ext == '.sam':
mode = 'r'
elif ext == '.bam':
mode = 'rb'
elif ext == '.cram':
mode = 'rc'
if ext == ".sam":
mode = "r"
elif ext == ".bam":
mode = "rb"
elif ext == ".cram":
mode = "rc"
else:
raise ValueError(f'{ext} is not a supported filetype')
raise ValueError(f"{ext} is not a supported filetype")

with closing(pysam.AlignmentFile(fp, mode)) as f:
records = []
for s in f.fetch(chrom, start, end):
# Needed because array.array is not json serializable
tags = [
(k, v.tolist() if isinstance(v, array.array) else v)
for k, v in s.tags
(k, v.tolist() if isinstance(v, array.array) else v) for k, v in s.tags
]
records.append(
(
@@ -487,9 +486,38 @@ def read_bigbed(path, chrom, start=None, end=None, engine="auto"):
return df

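For context, a hedged example of querying a BigBed with read_bigbed, whose signature appears in the hunk header above; the file path is hypothetical and engine="auto" is assumed to pick whichever supported backend is installed.

from bioframe.io.fileops import read_bigbed

# Intervals overlapping chr1:0-100,000 in a local BigBed file (path is illustrative).
df = read_bigbed("example.bb", "chr1", start=0, end=100_000)
print(df.head())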

def to_bigwig(df, chromsizes, outpath, value_field=None, engine='ucsc', path_to_binary=None):
"""
Save a bedGraph-like dataframe as a binary BigWig track.
def _find_ucsc_binary(path, cmd):
if path is None:
try:
assert shutil.which(cmd) is not None
except Exception:
raise ValueError(
f"{cmd} is not present in the current environment. "
f"Pass it as 'path_to_binary' parameter to bioframe.to_bigwig or "
f"install it with, for example, conda install -y -c bioconda "
f"ucsc-{cmd.lower()} "
) from None
elif path.endswith(cmd):
if not (os.path.isfile(path) and os.access(path, os.X_OK)):
raise ValueError(
f"{cmd} is absent in the provided path or cannot be "
f"executed: {path}. "
)
cmd = path
else:
cmd = os.path.join(path, cmd)
if not (os.path.isfile(cmd) and os.access(cmd, os.X_OK)):
raise ValueError(
f"{cmd} is absent in the provided path or cannot be "
f"executed: {path}. "
)
return cmd

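An illustrative call of the new _find_ucsc_binary helper: with path=None it looks the UCSC tool up on $PATH, otherwise it accepts either a full path ending in the tool name or a directory to join with it, raising ValueError when the binary is missing or not executable. The sketch below catches that error, since the binary may not be installed.

from bioframe.io.fileops import _find_ucsc_binary

try:
    cmd = _find_ucsc_binary(None, "bedGraphToBigWig")   # resolve from $PATH
    print("resolved:", cmd)
except ValueError as err:
    print(err)                                          # not installed or not executable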

def to_bigwig(
df, chromsizes, outpath, value_field=None, engine="ucsc", path_to_binary=None
):
"""Save a bedGraph-like dataframe as a binary BigWig file.
Parameters
----------
@@ -507,7 +535,6 @@ def to_bigwig(df, chromsizes, outpath, value_field=None, engine='ucsc', path_to_
Provide system path to the bedGraphToBigWig binary.
engine : {'ucsc', 'bigtools'}, optional
Engine to use for creating the BigWig file.
"""

is_bedgraph = True
@@ -528,43 +555,21 @@ def to_bigwig(df, chromsizes, outpath, value_field=None, engine='ucsc', path_to_
bg["chrom"] = bg["chrom"].astype(str)
bg = bg.sort_values(["chrom", "start", "end"])

if chromsizes is None:
chromsizes = df.groupby('chrom')['end'].max()

if engine.lower() == 'ucsc':
if path_to_binary is None:
cmd = "bedGraphToBigWig"
try:
assert shutil.which(cmd) is not None
except Exception:
raise ValueError(
"bedGraphToBigWig is not present in the current environment. "
"Pass it as 'path_to_binary' parameter to bioframe.to_bigwig or "
"install it with, for example, conda install -y -c bioconda "
"ucsc-bedgraphtobigwig "
) from None
elif path_to_binary.endswith("bedGraphToBigWig"):
if not os.path.isfile(path_to_binary) and os.access(path_to_binary, os.X_OK):
raise ValueError(
f"bedGraphToBigWig is absent in the provided path or cannot be "
f"fexecuted: {path_to_binary}. "
)
cmd = path_to_binary
else:
cmd = os.path.join(path_to_binary, "bedGraphToBigWig")
if not os.path.isfile(cmd) and os.access(cmd, os.X_OK):
raise ValueError(
f"bedGraphToBigWig is absent in the provided path or cannot be "
f"executed: {path_to_binary}. "
)
if engine.lower() == "ucsc":
cmd = _find_ucsc_binary(path_to_binary, "bedGraphToBigWig")

with tempfile.NamedTemporaryFile(suffix=".bg") as f, \
tempfile.NamedTemporaryFile("wt", suffix=".chrom.sizes") as cs: # fmt: skip
chromsizes.to_csv(cs, sep="\t", header=False)
tempfile.NamedTemporaryFile("wt", suffix=".chrom.sizes") as cs: # fmt: skip # noqa: E501
pd.Series(chromsizes).to_csv(cs, sep="\t", header=False)
cs.flush()

bg.to_csv(
f.name, sep="\t", columns=columns, index=False, header=False, na_rep="nan"
f.name,
sep="\t",
columns=columns,
index=False,
header=False,
na_rep="nan",
)

p = subprocess.run(
Expand All @@ -573,21 +578,27 @@ def to_bigwig(df, chromsizes, outpath, value_field=None, engine='ucsc', path_to_
)
return p

elif engine.lower() == 'bigtools':
import pybigtools
elif engine.lower() == "bigtools":
try:
import pybigtools
except ImportError:
raise ImportError(
"pybigtools is required to use engine='bigtools'"
) from None

f = pybigtools.open(outpath, "w")
if issubclass(type(chromsizes), pd.Series):
chromsizes = chromsizes.astype(int).to_dict()

bg = bg.astype({'chrom':str, "start": int, "end": int, value_field: float})
bg = bg.astype({"chrom": str, "start": int, "end": int, value_field: float})
f.write(chroms=chromsizes, vals=bg.itertuples(index=False))
f.close()

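A hedged end-to-end sketch of to_bigwig with the bigtools engine; it assumes the pybigtools package is installed (engine="ucsc" would instead require the bedGraphToBigWig binary), and the data, chromosome sizes, and output path are illustrative.

import pandas as pd
from bioframe.io.fileops import to_bigwig

bg = pd.DataFrame({
    "chrom": ["chr1", "chr1"],
    "start": [0, 100],
    "end": [100, 200],
    "value": [1.0, 2.5],
})
chromsizes = pd.Series({"chr1": 1000})

# Writes a BigWig from the "value" column using pybigtools.
to_bigwig(bg, chromsizes, "example.bw", value_field="value", engine="bigtools")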

def to_bigbed(df, chromsizes, outpath, schema="bed6", path_to_binary=None):
"""
Save a bedGraph-like dataframe as a binary BigWig track.
def to_bigbed(
df, chromsizes, outpath, schema="infer", engine="ucsc", path_to_binary=None
):
"""Save a BED-like dataframe as a binary BigBed file.
Parameters
----------
@@ -602,63 +613,59 @@ def to_bigbed(df, chromsizes, outpath, schema="bed6", path_to_binary=None):
Select the column label of the data frame to generate the track. Default
is to use the fourth column.
path_to_binary : str, optional
Provide system path to the bedGraphToBigWig binary.
Provide system path to the bedToBigBed binary.
"""
from bioframe.io.bed import infer_bed_schema, parse_bed_schema, to_bed_dataframe

if path_to_binary is None:
cmd = "bedToBigBed"
try:
assert shutil.which(cmd) is not None
except Exception:
raise ValueError(
"bedToBigBed is not present in the current environment. "
"Pass it as 'path_to_binary' parameter to bioframe.to_bigbed or "
"install it with, for example, conda install -y -c bioconda "
"ucsc-bedtobigbed "
) from None
elif path_to_binary.endswith("bedToBigBed"):
if not os.path.isfile(path_to_binary) and os.access(path_to_binary, os.X_OK):
raise ValueError(
f"bedToBigBed is absent in the provided path or cannot be "
f"executed: {path_to_binary}. "
)
cmd = path_to_binary
if schema == "infer":
n, _ = infer_bed_schema(df)
else:
cmd = os.path.join(path_to_binary, "bedGraphToBigWig")
if not os.path.isfile(cmd) and os.access(cmd, os.X_OK):
raise ValueError(
f"bedToBigBed is absent in the provided path or cannot be "
f"executed: {path_to_binary}. "
n, _ = parse_bed_schema(schema)

bed = to_bed_dataframe(df, schema=schema)
m = len(bed.columns) - n
schema = f"bed{n}+{m}" if m > 0 else f"bed{n}"

if engine.lower() == "ucsc":
if path_to_binary is None:
cmd = _find_ucsc_binary(path_to_binary, "bedToBigBed")

with tempfile.NamedTemporaryFile(suffix=".bed") as f, \
tempfile.NamedTemporaryFile("wt", suffix=".chrom.sizes") as cs: # fmt: skip # noqa: E501
pd.Series(chromsizes).to_csv(cs, sep="\t", header=False)
cs.flush()

bed.to_csv(
f.name,
sep="\t",
columns=bed.columns,
index=False,
header=False,
na_rep="nan",
)

is_bed6 = True
for col in ["chrom", "start", "end", "name", "score", "strand"]:
if col not in df.columns:
is_bed6 = False
if len(df.columns) < 6:
is_bed6 = False

if not is_bed6:
raise ValueError(f"A bed6-like DataFrame is required, got {df.columns}")

columns = ["chrom", "start", "end", "name", "score", "strand"]
bed = df[columns].copy()
bed["chrom"] = bed["chrom"].astype(str)
bed = bed.sort_values(["chrom", "start", "end"])

with tempfile.NamedTemporaryFile(suffix=".bed") as f, tempfile.NamedTemporaryFile(
"wt", suffix=".chrom.sizes"
) as cs:
chromsizes.to_csv(cs, sep="\t", header=False)
cs.flush()

bed.to_csv(
f.name, sep="\t", columns=columns, index=False, header=False, na_rep="nan"
)
p = subprocess.run(
[cmd, f"-type={schema}", f.name, cs.name, outpath],
capture_output=True,
)
return p

p = subprocess.run(
[cmd, f"-type={schema}", f.name, cs.name, outpath],
capture_output=True,
elif engine.lower() == "bigtools":
try:
import pybigtools
except ImportError:
raise ImportError(
"pybigtools is required to use engine='bigtools'"
) from None

f = pybigtools.open(outpath, "w")
if issubclass(type(chromsizes), pd.Series):
chromsizes = chromsizes.astype(int).to_dict()

bed = bed.astype({"chrom": str, "start": int, "end": int})
record_iter = (
(row[0], row[1], row[2], "\t".join(str(x) for x in row[3:]))
for row in bed.itertuples(index=False)
)
return p
f.write(chroms=chromsizes, vals=record_iter)
f.close()

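Finally, a hedged sketch of the updated to_bigbed. The BED schema is inferred from the columns (per the f-string above, a bed6 frame with m extra columns is written as bed6+m); engine="bigtools" needs pybigtools, while the default engine="ucsc" needs the bedToBigBed binary. Data and paths are illustrative.

import pandas as pd
from bioframe.io.fileops import to_bigbed

bed = pd.DataFrame({
    "chrom": ["chr1", "chr1"],
    "start": [0, 100],
    "end": [100, 200],
    "name": ["a", "b"],
    "score": [0, 500],
    "strand": ["+", "-"],
})
chromsizes = {"chr1": 1000}

# Inferred as plain bed6; written via pybigtools without a temporary BED file.
to_bigbed(bed, chromsizes, "example.bb", engine="bigtools")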