Add bigtools bigbed writer
nvictus committed Oct 18, 2024
1 parent 43bda60 commit 1eebe94
Showing 2 changed files with 169 additions and 141 deletions.
95 changes: 58 additions & 37 deletions bioframe/io/bed.py
@@ -597,50 +597,15 @@ def parse_bed_schema(schema: str) -> tuple[int, bool]:
return n, extended


def to_bed(
def to_bed_dataframe(
df: pd.DataFrame,
path: str | pathlib.Path | None = None,
*,
schema: str = "infer",
validate_fields: bool = True,
require_sorted: bool = False,
chromsizes: dict | pd.Series | None = None,
strict_score: bool = False,
replace_na: bool = True,
na_rep: str = "nan",
) -> str | None:
"""Write a DataFrame to a BED file.
Parameters
----------
df : pd.DataFrame
DataFrame to write.
path : str or Path, optional
Path to write the BED file to. If ``None``, the serialized BED file is
returned as a string.
schema : str, optional [default: "infer"]
BED schema to use. If ``"infer"``, the schema is inferred from the
DataFrame's columns.
validate_fields : bool, optional [default: True]
Whether to validate the fields of the BED file.
require_sorted : bool, optional [default: False]
Whether to require the BED file to be sorted.
chromsizes : dict or pd.Series, optional
Chromosome sizes to validate against.
strict_score : bool, optional [default: False]
Whether to strictly enforce validation of the score field (0-1000).
replace_na : bool, optional [default: True]
Whether to replace null values of standard BED fields with
compliant uninformative values.
na_rep : str, optional [default: "nan"]
String representation of null values if written.
Returns
-------
str or None:
The serialized BED file as a string if ``path`` is ``None``, otherwise
``None``.
"""
) -> pd.DataFrame:
if schema == "infer":
n, extended = infer_bed_schema(df)
else:
@@ -712,4 +677,60 @@ def to_bed(
if col in custom_cols:
bed[col] = df[col]

return bed

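A minimal usage sketch of the new to_bed_dataframe helper above, assuming this version of bioframe is importable via the module path of the file being edited (bioframe/io/bed.py); the interval data are illustrative.

import pandas as pd
from bioframe.io.bed import to_bed_dataframe

# Three intervals with only the mandatory BED3 columns; the schema is inferred.
df = pd.DataFrame({
    "chrom": ["chr1", "chr1", "chr2"],
    "start": [0, 100, 0],
    "end": [50, 200, 80],
})

bed = to_bed_dataframe(df)           # validated DataFrame with columns in BED order
print(bed.columns.tolist())          # expected: ['chrom', 'start', 'end']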

def to_bed(
df: pd.DataFrame,
path: str | pathlib.Path | None = None,
*,
schema: str = "infer",
validate_fields: bool = True,
require_sorted: bool = False,
chromsizes: dict | pd.Series | None = None,
strict_score: bool = False,
replace_na: bool = True,
na_rep: str = "nan",
) -> str | None:
"""Write a DataFrame to a BED file.
Parameters
----------
df : pd.DataFrame
DataFrame to write.
path : str or Path, optional
Path to write the BED file to. If ``None``, the serialized BED file is
returned as a string.
schema : str, optional [default: "infer"]
BED schema to use. If ``"infer"``, the schema is inferred from the
DataFrame's columns.
validate_fields : bool, optional [default: True]
Whether to validate the fields of the BED file.
require_sorted : bool, optional [default: False]
Whether to require the BED file to be sorted.
chromsizes : dict or pd.Series, optional
Chromosome sizes to validate against.
strict_score : bool, optional [default: False]
Whether to strictly enforce validation of the score field (0-1000).
replace_na : bool, optional [default: True]
Whether to replace null values of standard BED fields with
compliant uninformative values.
na_rep : str, optional [default: "nan"]
String representation of null values if written.
Returns
-------
str or None:
The serialized BED file as a string if ``path`` is ``None``, otherwise
``None``.
"""
bed = to_bed_dataframe(
df,
schema=schema,
validate_fields=validate_fields,
require_sorted=require_sorted,
chromsizes=chromsizes,
strict_score=strict_score,
replace_na=replace_na,
)
return bed.to_csv(path, sep="\t", na_rep=na_rep, index=False, header=False)
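A hedged sketch of the refactored to_bed, which now delegates validation to to_bed_dataframe and serializes with DataFrame.to_csv; with path=None the BED text is returned as a string. The import path and data below are illustrative.

import pandas as pd
from bioframe.io.bed import to_bed

df = pd.DataFrame({
    "chrom": ["chr1", "chr1"],
    "start": [0, 100],
    "end": [50, 200],
    "name": ["a", "b"],
})

bed_text = to_bed(df)                # schema inferred (BED4 here); returns a string
print(bed_text)

to_bed(df, "example.bed")            # or write a tab-delimited BED file to disk
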
215 changes: 111 additions & 104 deletions bioframe/io/fileops.py
@@ -245,22 +245,21 @@ def read_alignments(fp, chrom=None, start=None, end=None):
raise ImportError("pysam is required to use `read_alignments`") from None

ext = os.path.splitext(fp)[1]
if ext == '.sam':
mode = 'r'
elif ext == '.bam':
mode = 'rb'
elif ext == '.cram':
mode = 'rc'
if ext == ".sam":
mode = "r"
elif ext == ".bam":
mode = "rb"
elif ext == ".cram":
mode = "rc"
else:
raise ValueError(f'{ext} is not a supported filetype')
raise ValueError(f"{ext} is not a supported filetype")

with closing(pysam.AlignmentFile(fp, mode)) as f:
records = []
for s in f.fetch(chrom, start, end):
# Needed because array.array is not json serializable
tags = [
(k, v.tolist() if isinstance(v, array.array) else v)
for k, v in s.tags
(k, v.tolist() if isinstance(v, array.array) else v) for k, v in s.tags
]
records.append(
(
@@ -487,9 +486,38 @@ def read_bigbed(path, chrom, start=None, end=None, engine="auto"):
return df

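For context, a hedged example of querying a BigBed with read_bigbed, whose signature appears in the hunk header above; the file path is hypothetical and engine="auto" is assumed to pick whichever supported backend is installed.

from bioframe.io.fileops import read_bigbed

# Intervals overlapping chr1:0-100,000 in a local BigBed file (path is illustrative).
df = read_bigbed("example.bb", "chr1", start=0, end=100_000)
print(df.head())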

def to_bigwig(df, chromsizes, outpath, value_field=None, engine='ucsc', path_to_binary=None):
"""
Save a bedGraph-like dataframe as a binary BigWig track.
def _find_ucsc_binary(path, cmd):
if path is None:
try:
assert shutil.which(cmd) is not None
except Exception:
raise ValueError(
f"{cmd} is not present in the current environment. "
f"Pass it as 'path_to_binary' parameter to bioframe.to_bigwig or "
f"install it with, for example, conda install -y -c bioconda "
f"ucsc-{cmd.lower()} "
) from None
elif path.endswith(cmd):
if not (os.path.isfile(path) and os.access(path, os.X_OK)):
raise ValueError(
f"{cmd} is absent in the provided path or cannot be "
f"executed: {path}. "
)
cmd = path
else:
cmd = os.path.join(path, cmd)
if not (os.path.isfile(cmd) and os.access(cmd, os.X_OK)):
raise ValueError(
f"{cmd} is absent in the provided path or cannot be "
f"executed: {path}. "
)
return cmd

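An illustrative call of the new _find_ucsc_binary helper: with path=None it looks the UCSC tool up on $PATH, otherwise it accepts either a full path ending in the tool name or a directory to join with it, raising ValueError when the binary is missing or not executable. The sketch below catches that error, since the binary may not be installed.

from bioframe.io.fileops import _find_ucsc_binary

try:
    cmd = _find_ucsc_binary(None, "bedGraphToBigWig")   # resolve from $PATH
    print("resolved:", cmd)
except ValueError as err:
    print(err)                                          # not installed or not executable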

def to_bigwig(
df, chromsizes, outpath, value_field=None, engine="ucsc", path_to_binary=None
):
"""Save a bedGraph-like dataframe as a binary BigWig file.
Parameters
----------
@@ -507,7 +535,6 @@ def to_bigwig(df, chromsizes, outpath, value_field=None, engine='ucsc', path_to_
Provide system path to the bedGraphToBigWig binary.
engine : {'ucsc', 'bigtools'}, optional
Engine to use for creating the BigWig file.
"""

is_bedgraph = True
@@ -528,43 +555,21 @@ def to_bigwig(df, chromsizes, outpath, value_field=None, engine='ucsc', path_to_
bg["chrom"] = bg["chrom"].astype(str)
bg = bg.sort_values(["chrom", "start", "end"])

if chromsizes is None:
chromsizes = df.groupby('chrom')['end'].max()

if engine.lower() == 'ucsc':
if path_to_binary is None:
cmd = "bedGraphToBigWig"
try:
assert shutil.which(cmd) is not None
except Exception:
raise ValueError(
"bedGraphToBigWig is not present in the current environment. "
"Pass it as 'path_to_binary' parameter to bioframe.to_bigwig or "
"install it with, for example, conda install -y -c bioconda "
"ucsc-bedgraphtobigwig "
) from None
elif path_to_binary.endswith("bedGraphToBigWig"):
if not os.path.isfile(path_to_binary) and os.access(path_to_binary, os.X_OK):
raise ValueError(
f"bedGraphToBigWig is absent in the provided path or cannot be "
f"fexecuted: {path_to_binary}. "
)
cmd = path_to_binary
else:
cmd = os.path.join(path_to_binary, "bedGraphToBigWig")
if not os.path.isfile(cmd) and os.access(cmd, os.X_OK):
raise ValueError(
f"bedGraphToBigWig is absent in the provided path or cannot be "
f"executed: {path_to_binary}. "
)
if engine.lower() == "ucsc":
cmd = _find_ucsc_binary(path_to_binary, "bedGraphToBigWig")

with tempfile.NamedTemporaryFile(suffix=".bg") as f, \
tempfile.NamedTemporaryFile("wt", suffix=".chrom.sizes") as cs: # fmt: skip
chromsizes.to_csv(cs, sep="\t", header=False)
tempfile.NamedTemporaryFile("wt", suffix=".chrom.sizes") as cs: # fmt: skip # noqa: E501
pd.Series(chromsizes).to_csv(cs, sep="\t", header=False)
cs.flush()

bg.to_csv(
f.name, sep="\t", columns=columns, index=False, header=False, na_rep="nan"
f.name,
sep="\t",
columns=columns,
index=False,
header=False,
na_rep="nan",
)

p = subprocess.run(
Expand All @@ -573,21 +578,27 @@ def to_bigwig(df, chromsizes, outpath, value_field=None, engine='ucsc', path_to_
)
return p

elif engine.lower() == 'bigtools':
import pybigtools
elif engine.lower() == "bigtools":
try:
import pybigtools
except ImportError:
raise ImportError(
"pybigtools is required to use engine='bigtools'"
) from None

f = pybigtools.open(outpath, "w")
if issubclass(type(chromsizes), pd.Series):
chromsizes = chromsizes.astype(int).to_dict()

bg = bg.astype({'chrom':str, "start": int, "end": int, value_field: float})
bg = bg.astype({"chrom": str, "start": int, "end": int, value_field: float})
f.write(chroms=chromsizes, vals=bg.itertuples(index=False))
f.close()

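A hedged end-to-end sketch of to_bigwig with the bigtools engine; it assumes the pybigtools package is installed (engine="ucsc" would instead require the bedGraphToBigWig binary), and the data, chromosome sizes, and output path are illustrative.

import pandas as pd
from bioframe.io.fileops import to_bigwig

bg = pd.DataFrame({
    "chrom": ["chr1", "chr1"],
    "start": [0, 100],
    "end": [100, 200],
    "value": [1.0, 2.5],
})
chromsizes = pd.Series({"chr1": 1000})

# Writes a BigWig from the "value" column using pybigtools.
to_bigwig(bg, chromsizes, "example.bw", value_field="value", engine="bigtools")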

def to_bigbed(df, chromsizes, outpath, schema="bed6", path_to_binary=None):
"""
Save a bedGraph-like dataframe as a binary BigWig track.
def to_bigbed(
df, chromsizes, outpath, schema="infer", engine="ucsc", path_to_binary=None
):
"""Save a BED-like dataframe as a binary BigBed file.
Parameters
----------
@@ -602,63 +613,59 @@ def to_bigbed(df, chromsizes, outpath, schema="bed6", path_to_binary=None):
Select the column label of the data frame to generate the track. Default
is to use the fourth column.
path_to_binary : str, optional
Provide system path to the bedGraphToBigWig binary.
Provide system path to the bedToBigBed binary.
"""
from bioframe.io.bed import infer_bed_schema, parse_bed_schema, to_bed_dataframe

if path_to_binary is None:
cmd = "bedToBigBed"
try:
assert shutil.which(cmd) is not None
except Exception:
raise ValueError(
"bedToBigBed is not present in the current environment. "
"Pass it as 'path_to_binary' parameter to bioframe.to_bigbed or "
"install it with, for example, conda install -y -c bioconda "
"ucsc-bedtobigbed "
) from None
elif path_to_binary.endswith("bedToBigBed"):
if not os.path.isfile(path_to_binary) and os.access(path_to_binary, os.X_OK):
raise ValueError(
f"bedToBigBed is absent in the provided path or cannot be "
f"executed: {path_to_binary}. "
)
cmd = path_to_binary
if schema == "infer":
n, _ = infer_bed_schema(df)
else:
cmd = os.path.join(path_to_binary, "bedGraphToBigWig")
if not os.path.isfile(cmd) and os.access(cmd, os.X_OK):
raise ValueError(
f"bedToBigBed is absent in the provided path or cannot be "
f"executed: {path_to_binary}. "
n, _ = parse_bed_schema(schema)

bed = to_bed_dataframe(df, schema=schema)
m = len(bed.columns) - n
schema = f"bed{n}+{m}" if m > 0 else f"bed{n}"

if engine.lower() == "ucsc":
if path_to_binary is None:
cmd = _find_ucsc_binary(path_to_binary, "bedToBigBed")

with tempfile.NamedTemporaryFile(suffix=".bed") as f, \
tempfile.NamedTemporaryFile("wt", suffix=".chrom.sizes") as cs: # fmt: skip # noqa: E501
pd.Series(chromsizes).to_csv(cs, sep="\t", header=False)
cs.flush()

bed.to_csv(
f.name,
sep="\t",
columns=bed.columns,
index=False,
header=False,
na_rep="nan",
)

is_bed6 = True
for col in ["chrom", "start", "end", "name", "score", "strand"]:
if col not in df.columns:
is_bed6 = False
if len(df.columns) < 6:
is_bed6 = False

if not is_bed6:
raise ValueError(f"A bed6-like DataFrame is required, got {df.columns}")

columns = ["chrom", "start", "end", "name", "score", "strand"]
bed = df[columns].copy()
bed["chrom"] = bed["chrom"].astype(str)
bed = bed.sort_values(["chrom", "start", "end"])

with tempfile.NamedTemporaryFile(suffix=".bed") as f, tempfile.NamedTemporaryFile(
"wt", suffix=".chrom.sizes"
) as cs:
chromsizes.to_csv(cs, sep="\t", header=False)
cs.flush()

bed.to_csv(
f.name, sep="\t", columns=columns, index=False, header=False, na_rep="nan"
)
p = subprocess.run(
[cmd, f"-type={schema}", f.name, cs.name, outpath],
capture_output=True,
)
return p

p = subprocess.run(
[cmd, f"-type={schema}", f.name, cs.name, outpath],
capture_output=True,
elif engine.lower() == "bigtools":
try:
import pybigtools
except ImportError:
raise ImportError(
"pybigtools is required to use engine='bigtools'"
) from None

f = pybigtools.open(outpath, "w")
if issubclass(type(chromsizes), pd.Series):
chromsizes = chromsizes.astype(int).to_dict()

bed = bed.astype({"chrom": str, "start": int, "end": int})
record_iter = (
(row[0], row[1], row[2], "\t".join(str(x) for x in row[3:]))
for row in bed.itertuples(index=False)
)
return p
f.write(chroms=chromsizes, vals=record_iter)
f.close()

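Finally, a hedged sketch of the updated to_bigbed. The BED schema is inferred from the columns (per the f-string above, a bed6 frame with m extra columns is written as bed6+m); engine="bigtools" needs pybigtools, while the default engine="ucsc" needs the bedToBigBed binary. Data and paths are illustrative.

import pandas as pd
from bioframe.io.fileops import to_bigbed

bed = pd.DataFrame({
    "chrom": ["chr1", "chr1"],
    "start": [0, 100],
    "end": [100, 200],
    "name": ["a", "b"],
    "score": [0, 500],
    "strand": ["+", "-"],
})
chromsizes = {"chr1": 1000}

# Inferred as plain bed6; written via pybigtools without a temporary BED file.
to_bigbed(bed, chromsizes, "example.bb", engine="bigtools")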