Skip to content

Commit

Permalink
multiple updates to CLI (#331)
Browse files (browse the repository at this point in the history)
* multiple updates to CLI

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
ChristopherMancuso and pre-commit-ci[bot] authored Oct 31, 2024
1 parent 69c4d43 commit 84325af
Showing 1 changed file with 77 additions and 40 deletions.
117 changes: 77 additions & 40 deletions geneplexus/cli.py
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
"""Command line interface for the GenePlexus pipeline."""
import argparse
import atexit
import json
import os
import os.path as osp
import pathlib
Expand Down Expand Up @@ -37,7 +38,7 @@ def parse_args() -> argparse.Namespace:
"--input_file",
metavar="",
required=True,
help="Input gene list (.txt) file (one gene per line).",
help="Input gene list (.txt) file.",
)

parser.add_argument(
Expand All @@ -50,6 +51,15 @@ def parse_args() -> argparse.Namespace:
"tabs. Other generic separator are also supported, e.g. ', '.",
)

parser.add_argument(
"-dd",
"--data_dir",
default=None,
metavar="",
help="Directory in which the data are stored, if set to None, then use "
"the default data directory ~/.data/geneplexus",
)

parser.add_argument(
"-n",
"--network",
Expand All @@ -72,24 +82,31 @@ def parse_args() -> argparse.Namespace:
"--sp_trn",
default="Human",
metavar="",
help="Species of training data {format_choices(config.ALL_SPECIES}",
help=f"Species of training data {format_choices(config.ALL_SPECIES)}",
)

parser.add_argument(
"-s2",
"--sp_tst",
default="Human",
default="Mouse",
metavar="",
help=f"Species of test data {format_choices(config.ALL_SPECIES)}",
)

parser.add_argument(
"-g1",
"--gsc_trn",
default="GO",
metavar="",
help="Species of test data {format_choices(config.ALL_SPECIES}",
help=f"Geneset collection used to generate negatives. {format_choices(config.ALL_GSCS)}",
)

parser.add_argument(
"-g",
"--gsc",
"-g2",
"--gsc_tst",
default="GO",
metavar="",
help="Geneset collection used to generate negatives and the model"
f"similarities. {format_choices(config.ALL_GSCS)}",
help=f"Geneset collection used for model similarities. {format_choices(config.ALL_GSCS)}",
)

parser.add_argument(
Expand All @@ -101,15 +118,6 @@ def parse_args() -> argparse.Namespace:
help="Number of nodes in the small edgelist.",
)

parser.add_argument(
"-dd",
"--data_dir",
default=None,
metavar="",
help="Directory in which the data are stored, if set to None, then use "
"the default data directory ~/.data/geneplexus",
)

parser.add_argument(
"-od",
"--output_dir",
Expand All @@ -126,6 +134,13 @@ def parse_args() -> argparse.Namespace:
help=f"Logging level. {format_choices(config.LOG_LEVELS)}",
)

parser.add_argument(
"-ad",
"--auto_download_off",
action="store_true",
help="Turns off autodownloader which is on by default.",
)

parser.add_argument(
"-q",
"--quiet",
Expand Down Expand Up @@ -155,15 +170,19 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--skip-mdl-sim",
action="store_true",
help="Skip model similarity computation. This computation is not yet "
"available when using custom networks due to the lack of pretrained "
"models for comparison.",
help="Skip model similarity computation",
)

parser.add_argument(
"--skip-sm-edgelist",
action="store_true",
help="Skip making small edgelist.",
)

return parser.parse_args()


def run_pipeline(gp: GenePlexus, num_nodes: int, skip_mdl_sim: bool):
def run_pipeline(gp: GenePlexus, num_nodes: int, skip_mdl_sim: bool, skip_sm_edgelist: bool):
"""Run the full GenePlexus pipeline.
Args:
Expand All @@ -175,12 +194,15 @@ def run_pipeline(gp: GenePlexus, num_nodes: int, skip_mdl_sim: bool):
"""
gp.fit_and_predict()
gp.make_small_edgelist(num_nodes=num_nodes)
gp.alter_validation_df()
if not skip_mdl_sim:
gp.make_sim_dfs()
else:
logger.info("Skipping model similarity computation.")
if not skip_sm_edgelist:
gp.make_small_edgelist(num_nodes=num_nodes)
else:
logger.info("Skipping making small edgelist.")
gp.alter_validation_df()


def df_to_tsv(df: pd.DataFrame, root: str, name: str):
Expand All @@ -195,29 +217,37 @@ def df_to_tsv(df: pd.DataFrame, root: str, name: str):
df.to_csv(osp.join(root, name), sep="\t", index=False)


def save_results(gp, outdir, zip_output, overwrite, skip_mdl_sim):
def save_results(gp, outdir, zip_output, overwrite, skip_mdl_sim, skip_sm_edgelist):
"""Save all results generated by the GenePlexus pipeline.
Args:
outdir: Output directory.
zip_output: Whether or not to zip the output directory into a zip file.
overwrite: Whether or not to overwrite existing results.
skip_mdl_sim: Whether or not to skip the computation of model
similarities with GO and Mondo. This option is not yet available
for custom networks.
similarities with GO, Monarch and/or Mondo.
skip_sm_edgelist: Whether or not to skip making the small edgelist.
"""
zip_outpath = _suffix_fn(f"{outdir}.zip", overwrite=overwrite)
outdir = _suffix_dir(outdir, overwrite=overwrite, mktmp=zip_output)

np.savetxt(osp.join(outdir, "cross_validation.txt"), gp.avgps, fmt="%.18f")
df_to_tsv(gp.df_convert_out, outdir, "df_convert_out.tsv")
np.savetxt(osp.join(outdir, "pos_genes_in_net.txt"), gp.pos_genes_in_net, fmt="%s")
np.savetxt(osp.join(outdir, "negative_genes.txt"), gp.negative_genes, fmt="%s")
np.savetxt(osp.join(outdir, "net_genes.txt"), gp.net_genes, fmt="%s")
with open(osp.join(outdir, "neutral_gene_info.json"), "w") as f:
json.dump(gp.neutral_gene_info, f)
np.savetxt(osp.join(outdir, "avgps.txt"), gp.avgps, fmt="%.18f")
np.savetxt(osp.join(outdir, "mdl_weights.txt"), gp.mdl_weights, fmt="%.18f")
df_to_tsv(gp.df_probs, outdir, "df_probs.tsv")
df_to_tsv(gp.df_edge, outdir, "df_edge.tsv")
df_to_tsv(gp.df_edge_sym, outdir, "df_edge_sym.tsv")
df_to_tsv(gp.df_convert_out_subset, outdir, "df_convert_out_subset.tsv")
if not skip_mdl_sim:
df_to_tsv(gp.df_sim, outdir, "df_sim.tsv")
if not skip_mdl_sim:
df_to_tsv(gp.df_edge, outdir, "df_edge.tsv")
df_to_tsv(gp.df_edge_sym, outdir, "df_edge_sym.tsv")
np.savetxt(osp.join(outdir, "isolated_genes.txt"), gp.isolated_genes, fmt="%s")
np.savetxt(osp.join(outdir, "isolated_genes_sym.txt"), gp.isolated_genes_sym, fmt="%s")
df_to_tsv(gp.df_convert_out_subset, outdir, "df_convert_out_subset.tsv")

# Dump config, close file handler and move run log to result directory
gp.dump_config(outdir)
Expand Down Expand Up @@ -302,18 +332,23 @@ def main():
"""Command line interface."""
args = parse_args()
log_level = "CRITICAL" if args.quiet else args.log_level
if args.auto_download_off:
auto_download = False
else:
auto_download = True

clear_data(args)

# Create geneplexus object and auto download data files
gp = GenePlexus(
args.data_dir,
args.network,
args.feature,
args.sp_trn,
args.sp_tst,
args.gsc,
auto_download=True,
file_loc=args.data_dir,
net_type=args.network,
features=args.feature,
sp_trn=args.sp_trn,
sp_tst=args.sp_tst,
gsc_trn=args.gsc_trn,
gsc_tst=args.gsc_tst,
auto_download=auto_download,
log_level=log_level,
)

Expand All @@ -323,8 +358,10 @@ def main():
# Save config

# Run pipeline and save results
run_pipeline(gp, args.small_edgelist_num_nodes, args.skip_mdl_sim)
save_results(gp, normexpand(args.output_dir), args.zip_output, args.overwrite, args.skip_mdl_sim)
run_pipeline(gp, args.small_edgelist_num_nodes, args.skip_mdl_sim, args.skip_sm_edgelist)
save_results(
gp, normexpand(args.output_dir), args.zip_output, args.overwrite, args.skip_mdl_sim, args.skip_sm_edgelist
)


if __name__ == "__main__":
Expand Down

0 comments on commit 84325af

Please sign in to comment.