From ba2be42c714fffb6b62ea13e7ecfbecb5f4a4d9a Mon Sep 17 00:00:00 2001 From: agalitsyna Date: Mon, 12 Feb 2024 19:02:56 -0500 Subject: [PATCH] Cross score sandbox fixes (#497) * Make cross_score work via CLI and small style fixes (#474) * Cross-score fix of edge case fit bin-wise annotation of cross-scores * Black and parameter naming issue --------- Co-authored-by: Aleksandr --- cooltools/sandbox/cross_score.py | 256 ++++++++++++++++--------------- 1 file changed, 129 insertions(+), 127 deletions(-) diff --git a/cooltools/sandbox/cross_score.py b/cooltools/sandbox/cross_score.py index 2b42f4c4..99b9be46 100644 --- a/cooltools/sandbox/cross_score.py +++ b/cooltools/sandbox/cross_score.py @@ -11,58 +11,58 @@ import pandas as pd - parser = argparse.ArgumentParser( description="""Calculate distance-dependent contact marginals of a Hi-C map. - + These contact marginals are analogous to per-bin contact-frequency-vs-distance curves, but over a small number of distance bins. Such marginals are useful to reveal genomic sites participating in frequent long-distance contacts, e.g. anchors of loops and stripes. - """) - -parser.add_argument('COOL_URI', metavar='COOL_URI', - type=str, - help='input cooler URI') + """ +) -parser.add_argument('--distbins', - default='1e3,3e4,1e6,3e7', - help='a comma-separated list of distance bins') +parser.add_argument("COOL_URI", metavar="COOL_URI", type=str, help="input cooler URI") -parser.add_argument('--clr-weight-name', - type=str, - default='weight', - help='Name of the column to use for data balancing') +parser.add_argument( + "--dist-bins", + default="1e3,3e4,1e6,3e7", + help="a comma-separated list of distance bins", +) -parser.add_argument('--ignore-diags', - type=int, - default=2, - help='How many diagonals to ignore') +parser.add_argument( + "--clr-weight-name", + type=str, + default="weight", + help="Name of the column to use for data balancing", +) -parser.add_argument('--outfolder', - type=str, - default='./', - help='The output folder') +parser.add_argument( + "--ignore-diags", type=int, default=2, help="How many diagonals to ignore" +) -parser.add_argument('--prefix', - type=str, - default=None, - help='The prefix for output files') +parser.add_argument("--outfolder", type=str, default="./", help="The output folder") -parser.add_argument('--format', - type=str, - default='bigwig', - help='Comma-separated list of the output formats. Possible values: bigwig, bedgraph.',) +parser.add_argument( + "--prefix", type=str, default=None, help="The prefix for output files" +) -parser.add_argument('--nproc', - type=int, - default=None, - help='The number of processes. By default, use all available cores.') +parser.add_argument( + "--format", + type=str, + default="bigwig", + help="Comma-separated list of the output formats. Possible values: bigwig, bedgraph.", +) -parser.add_argument('--chunksize', - default=1e6, - help='The number of pixels per processed chunk.') +parser.add_argument( + "--nproc", + type=int, + default=None, + help="The number of processes. By default, use all available cores.", +) +parser.add_argument( + "--chunksize", default=1e6, help="The number of pixels per processed chunk." +) def get_dist_margs(clr_path, lo, hi, dist_bins, weight_name, ignore_diags): @@ -72,125 +72,127 @@ def get_dist_margs(clr_path, lo, hi, dist_bins, weight_name, ignore_diags): chunk = clr.pixels()[lo:hi] res = clr.binsize - chunk = chunk[chunk['bin2_id']-chunk['bin1_id']>=ignore_diags] + chunk = chunk[chunk["bin2_id"] - chunk["bin1_id"] >= ignore_diags] chunk = cooler.annotate(chunk, bins) - chunk['balanced'] = np.nan_to_num(chunk['count'] * chunk[f'{weight_name}1'] * chunk[f'{weight_name}2']) + chunk["balanced"] = np.nan_to_num( + chunk["count"] * chunk[f"{weight_name}1"] * chunk[f"{weight_name}2"] + ) chunk = chunk[chunk.chrom1 == chunk.chrom2] - del(clr) + del clr return _dist_margs(chunk, dist_bins, res) -def _dist_margs(chunk, dist_bins, res): - min_bin_id = chunk['bin1_id'].values[0] +def _dist_margs(chunk, dist_bins, res): + min_bin_id = chunk["bin1_id"].values[0] - dists = (chunk['bin2_id'] - chunk['bin1_id']) * res - dist_bin_id = np.searchsorted(dist_bins, dists, 'right') + dists = (chunk["bin2_id"] - chunk["bin1_id"]) * res + dist_bin_id = np.searchsorted(dist_bins, dists, "right") n_dist_bins = len(dist_bins) margs_down_loc = np.bincount( - (chunk['bin1_id'].values - min_bin_id) * n_dist_bins + dist_bin_id, - weights=np.nan_to_num(chunk['balanced'].values) - ) + (chunk["bin1_id"].values - min_bin_id) * n_dist_bins + dist_bin_id, + weights=np.nan_to_num(chunk["balanced"].values), + ) margs_up_loc = np.bincount( - (chunk['bin2_id'].values - min_bin_id) * n_dist_bins + dist_bin_id, - weights=np.nan_to_num(chunk['balanced'].values) - ) + (chunk["bin2_id"].values - min_bin_id) * n_dist_bins + dist_bin_id, + weights=np.nan_to_num(chunk["balanced"].values), + ) return min_bin_id, margs_down_loc, margs_up_loc - def drop_resolution(clrname): - name_parts = clrname.split('.') - if name_parts[-1] in ['cool', 'mcool']: + name_parts = clrname.split(".") + if name_parts[-1] in ["cool", "mcool"]: name_parts = name_parts[:-1] if name_parts[-1].isnumeric(): name_parts = name_parts[:-1] - return '.'.join(name_parts) + return ".".join(name_parts) args = parser.parse_args() logging.basicConfig(level=logging.INFO) +if __name__ == "__main__": + mp.freeze_support() + chunksize = int(float(args.chunksize)) + clr = cooler.Cooler(args.COOL_URI) + bins = clr.bins()[:] + n_pixels = clr.pixels().shape[0] + dist_bins = np.array([int(float(i)) for i in args.dist_bins.split(",")]) + weight_name = args.clr_weight_name + ignore_diags = args.ignore_diags + formats = args.format.split(",") -chunksize = int(float(args.chunksize)) -clr = cooler.Cooler(args.COOL_URI) -bins = clr.bins()[:] -n_pixels = clr.pixels().shape[0] -dist_bins = np.array([int(float(i)) for i in args.distbins.split(',')]) -weight_name = args.clr_weight_name -ignore_diags = args.ignore_diags -formats = args.format.split(',') + n_dist_bins = len(dist_bins) -n_dist_bins = len(dist_bins) + chunk_spans = np.r_[np.arange(0, n_pixels, chunksize), n_pixels] -chunk_spans = np.r_[np.arange(0, n_pixels, chunksize), n_pixels] + nproc = mp.cpu_count() if args.nproc is None else args.nproc -nproc = mp.cpu_count() if args.nproc is None else args.nproc + logging.info(f"Calculating marginals for {args.COOL_URI} using {nproc} processes") + with mp.Pool(nproc) as pool: + out = pool.starmap( + get_dist_margs, + [ + (args.COOL_URI, lo, hi, dist_bins, weight_name, ignore_diags) + for lo, hi in zip(chunk_spans[:-1], chunk_spans[1:]) + ], + ) -logging.info(f'Calculating marginals for {args.COOL_URI} using {nproc} processes') -with mp.Pool(nproc) as pool: - out = pool.starmap(get_dist_margs, - [ - (args.COOL_URI, lo, hi, dist_bins, weight_name, ignore_diags) - for lo, hi in zip(chunk_spans[:-1], chunk_spans[1:]) - ] - ) + margs_up = np.zeros(len(bins) * n_dist_bins + 1) + margs_down = np.zeros(len(bins) * n_dist_bins + 1) + + for min_bin_id, margs_down_loc, margs_up_loc in out: + margs_down[ + min_bin_id * n_dist_bins : min_bin_id * n_dist_bins + len(margs_down_loc) + ] += margs_down_loc + + margs_up[ + min_bin_id * n_dist_bins : min_bin_id * n_dist_bins + len(margs_up_loc) + ] += margs_up_loc + + margs_down = margs_down[:-1].reshape((len(bins), n_dist_bins)).T + margs_up = margs_up[:-1].reshape((len(bins), n_dist_bins)).T + + out_folder = pathlib.Path(args.outfolder) + clr_name = pathlib.Path(args.COOL_URI.split(":")[0]).name + clr_name = drop_resolution(clr_name) + + prefix = clr_name if args.prefix is None else args.prefix + res = clr.binsize -margs_up = np.zeros(len(bins) * n_dist_bins) -margs_down = np.zeros(len(bins) * n_dist_bins) - -for min_bin_id, margs_down_loc, margs_up_loc in out: - margs_down[ - min_bin_id * n_dist_bins: - min_bin_id * n_dist_bins + len(margs_down_loc) - ] += margs_down_loc - - margs_up[ - min_bin_id * n_dist_bins: - min_bin_id * n_dist_bins + len(margs_up_loc) - ] += margs_up_loc - - -margs_down = margs_down.reshape((len(bins), n_dist_bins)).T -margs_up = margs_up.reshape((len(bins), n_dist_bins)).T - - -out_folder = pathlib.Path(args.outfolder) -clr_name = pathlib.Path(args.COOL_URI.split(':')[0]).name -clr_name = drop_resolution(clr_name) - -prefix = clr_name if args.prefix is None else args.prefix -res = clr.binsize - -for dist_bin_id in range(n_dist_bins): - lo = np.r_[0, dist_bins][dist_bin_id] - hi = np.r_[0, dist_bins][dist_bin_id+1] - - for dir_str, margs in [('up', margs_up), ('down', margs_down), ('both', margs_up+margs_down)]: - out_df = bins[['chrom', 'start', 'end']].copy() - out_df['marg'] = margs[dist_bin_id] - out_df['marg'] = out_df['marg'].mask(bins[weight_name].isnull(), np.nan) - - if 'bigwig' in formats: - file_name = f'{prefix}.{res}.cross.{dir_str}.{lo}-{hi}.bw' - logging.info(f'Write output into {file_name}') - bioframe.io.to_bigwig( - out_df, - chromsizes=clr.chromsizes, - outpath=str(out_folder / file_name)) - - if 'bedgraph' in formats: - file_name = f'{prefix}.{res}.cross.{dir_str}.{lo}-{hi}.bg.gz' - logging.info(f'Write output into {file_name}') - out_df.to_csv( - str(out_folder / file_name), - sep='\t', - index=False, - header=False, - ) - - + for dist_bin_id in range(n_dist_bins): + lo = np.r_[0, dist_bins][dist_bin_id] + hi = np.r_[0, dist_bins][dist_bin_id + 1] + + for dir_str, margs in [ + ("up", margs_up), + ("down", margs_down), + ("both", margs_up + margs_down), + ]: + out_df = bins[["chrom", "start", "end"]].copy() + out_df["marg"] = margs[dist_bin_id] + out_df["marg"] = out_df["marg"].mask(bins[weight_name].isnull(), np.nan) + + if "bigwig" in formats: + file_name = f"{prefix}.{res}.cross.{dir_str}.{lo}-{hi}.bw" + logging.info(f"Write output into {file_name}") + bioframe.io.to_bigwig( + out_df, + chromsizes=clr.chromsizes, + outpath=str(out_folder / file_name), + ) + + if "bedgraph" in formats: + file_name = f"{prefix}.{res}.cross.{dir_str}.{lo}-{hi}.bg.gz" + logging.info(f"Write output into {file_name}") + out_df.to_csv( + str(out_folder / file_name), + sep="\t", + index=False, + header=False, + )