
Commit

Merge branch 'dev'
Hoeze committed Oct 27, 2018
2 parents b4a335f + 947e8b0 commit 9807c8b
Showing 28 changed files with 7,735 additions and 1,030 deletions.
168 changes: 168 additions & 0 deletions batchglm/benchmark/nb_glm/base.py
@@ -0,0 +1,168 @@
import os

from collections import OrderedDict
import itertools

import pandas as pd
import yaml

from batchglm.api.models.nb_glm import Simulator, Estimator


def init_benchmark(
root_dir: str,
sim: Simulator,
config_file="config.yml",
**kwargs
):
os.makedirs(root_dir, exist_ok=True)

config = {
"sim_data": "sim_data.h5",
"plot_dir": "plot_dir",
}

os.makedirs(os.path.join(root_dir, config["plot_dir"]), exist_ok=True)
sim.save(os.path.join(root_dir, config["sim_data"]))

benchmark_samples = dict()

kvlist = [
(key, val) if isinstance(val, tuple) or isinstance(val, list) else (key, [val]) for key, val in kwargs.items()
]
od = OrderedDict(sorted(kvlist))
cart = list(itertools.product(*od.values()))

df = pd.DataFrame(cart, columns=od.keys())
df.reset_index(inplace=True)
df["working_dir"] = ["idx_%i" % i for i in df["index"]]

for idx, row in df.iterrows():
name = row["working_dir"]
benchmark_samples[name] = prepare_benchmark_sample(
root_dir=root_dir,
**row.to_dict()
)
config["benchmark_samples"] = benchmark_samples

config_file = os.path.join(root_dir, config_file)
with open(config_file, mode="w") as f:
yaml.dump(config, f, default_flow_style=False)


def prepare_benchmark_sample(
root_dir: str,
working_dir: str,
batch_size: int,
save_checkpoint_steps=25,
save_summaries_steps=25,
export_steps=25,
learning_rate: float = 0.05,
convergence_criteria="step",
stopping_criteria=5000,
train_mu: bool = None,
train_r: bool = None,
use_batching=True,
optim_algo="gradient_descent",
**kwargs
):
os.makedirs(os.path.join(root_dir, working_dir), exist_ok=True)

sample_config = {
"working_dir": working_dir,
}

setup_args = {
"batch_size": batch_size,
"extended_summary": True,
"init_a": "standard",
"init_b": "standard",
}
sample_config["setup_args"] = setup_args

init_args = {
# "working_dir": working_dir,
"save_checkpoint_steps": save_checkpoint_steps,
"save_summaries_steps": save_summaries_steps,
"export_steps": export_steps,
"export": ["a", "b", "loss", "gradient", "full_loss", "full_gradient", "batch_log_probs"],
}

sample_config["init_args"] = init_args

training_args = {
"learning_rate": learning_rate,
"convergence_criteria": convergence_criteria,
"stopping_criteria": stopping_criteria,
"train_mu": train_mu,
"train_r": train_r,
"use_batching": use_batching,
"optim_algo": optim_algo,
}

sample_config["training_args"] = training_args

return sample_config


def get_benchmark_samples(root_dir: str, config_file="config.yml"):
config_file = os.path.join(root_dir, config_file)
with open(config_file, mode="r") as f:
config = yaml.load(f)
return list(config["benchmark_samples"].keys())


def touch(path):
with open(path, 'a'):
os.utime(path, None)


def run_benchmark(root_dir: str, sample: str, config_file="config.yml"):
config_file = os.path.join(root_dir, config_file)
with open(config_file, mode="r") as f:
config = yaml.load(f)

sim_data_file = os.path.join(root_dir, config["sim_data"])

sample_config = config["benchmark_samples"][sample]

working_dir = os.path.join(root_dir, sample_config["working_dir"])

# working space locking
if os.path.exists(os.path.join(working_dir, "ready")):
print("benchmark sample '%s' was already estimated" % sample)
return
if os.path.exists(os.path.join(working_dir, "lock")):
print((
"benchmark sample '%s' is locked. " +
"Maybe there is already another instance estimating this sample?"
) % sample)
return

print("locking dir...", end="", flush=True)
touch(os.path.join(working_dir, "lock"))
print("\t[OK]")

print("loading data...", end="", flush=True)
sim = Simulator()
sim.load(sim_data_file)
print("\t[OK]")

setup_args = sample_config["setup_args"]
setup_args["input_data"] = sim.input_data

init_args = sample_config["init_args"]
init_args["working_dir"] = working_dir

training_args = sample_config["training_args"]

print("starting estimation of benchmark sample '%s'..." % sample)
estimator = Estimator(**setup_args)
estimator.initialize(**init_args)
estimator.train(**training_args)
print("estimation of benchmark sample '%s' ready" % sample)

print("unlocking dir and finalizing...", end="", flush=True)
os.remove(os.path.join(working_dir, "lock"))
touch(os.path.join(working_dir, "ready"))
print("\t[OK]")
139 changes: 17 additions & 122 deletions batchglm/benchmark/nb_glm/convergence.py
@@ -8,125 +8,12 @@
import xarray as xr
import yaml

-# import pkg_constants
-from batchglm.api.models.nb_glm import Simulator, Estimator
+from .base import init_benchmark, get_benchmark_samples, run_benchmark, Simulator

import batchglm.utils.stats as stat_utils


-def init_benchmark(
-        root_dir: str,
-        sim: Simulator,
-        batch_size,
-        stop_at_step=10000,
-        learning_rates=(5, 0.5, 0.05, 0.005),
-        save_checkpoint_steps=None,
-        save_summaries_steps=None,
-        export_steps=1,
-        config_file="config.yml"
-):
-    os.makedirs(root_dir, exist_ok=True)
-
-    config = {
-        "sim_data": "sim_data.h5",
-        "plot_dir": "plot_dir",
-    }
-
-    os.makedirs(os.path.join(root_dir, config["plot_dir"]), exist_ok=True)
-    sim.save(os.path.join(root_dir, config["sim_data"]))
-
-    benchmark_samples = dict()
-    for lr in learning_rates:
-        benchmark_samples["lr%s" % lr] = prepare_benchmark_sample(
-            root_dir=root_dir,
-            working_dir="lr%s" % lr,
-            batch_size=batch_size,
-            stop_at_step=stop_at_step,
-            learning_rate=lr,
-            save_checkpoint_steps=save_checkpoint_steps,
-            save_summaries_steps=save_summaries_steps,
-            export_steps=export_steps,
-        )
-    config["benchmark_samples"] = benchmark_samples
-
-    config_file = os.path.join(root_dir, config_file)
-    with open(config_file, mode="w") as f:
-        yaml.dump(config, f, default_flow_style=False)
-
-
-def prepare_benchmark_sample(
-        root_dir: str,
-        working_dir: str,
-        batch_size: int,
-        stop_at_step: int,
-        learning_rate: float = 0.05,
-        stop_below_loss_change: float = None,
-        save_checkpoint_steps=25,
-        save_summaries_steps=25,
-        export_steps=25,
-        **kwargs
-
-):
-    os.makedirs(os.path.join(root_dir, working_dir), exist_ok=True)
-
-    sample_config = {
-        "working_dir": working_dir,
-        "learning_rate": learning_rate,
-        "batch_size": batch_size,
-    }
-
-    init_args = {
-        # "working_dir": working_dir,
-        "stop_at_step": stop_at_step,
-        "stop_below_loss_change": stop_below_loss_change,
-        "save_checkpoint_steps": save_checkpoint_steps,
-        "save_summaries_steps": save_summaries_steps,
-        "export_steps": export_steps,
-        "export": ["a", "b", "loss", "gradient", "full_loss", "full_gradient", "batch_log_probs"],
-        **kwargs
-    }
-
-    sample_config["init_args"] = init_args
-
-    return sample_config
-
-
-def get_benchmark_samples(root_dir: str, config_file="config.yml"):
-    config_file = os.path.join(root_dir, config_file)
-    with open(config_file, mode="r") as f:
-        config = yaml.load(f)
-    return list(config["benchmark_samples"].keys())
-
-
-def run_benchmark(root_dir: str, sample: str, config_file="config.yml"):
-    config_file = os.path.join(root_dir, config_file)
-    with open(config_file, mode="r") as f:
-        config = yaml.load(f)
-
-    sim_data_file = os.path.join(root_dir, config["sim_data"])
-
-    sample_config = config["benchmark_samples"][sample]
-
-    working_dir = os.path.join(root_dir, sample_config["working_dir"])
-    batch_size = sample_config["batch_size"]
-    learning_rate = sample_config["learning_rate"]
-
-    init_args = sample_config["init_args"]
-    init_args["working_dir"] = working_dir
-
-    print("loading data...", end="", flush=True)
-    sim = Simulator()
-    sim.load(sim_data_file)
-    print("\t[OK]")
-
-    print("starting estimation of benchmark sample '%s'..." % sample)
-    estimator = Estimator(sim.input_data, batch_size=batch_size, extended_summary=True)
-    estimator.initialize(**init_args)
-    estimator.train(learning_rate=learning_rate)
-    print("estimation of benchmark sample '%s' ready" % sample)
-
-
-def load_benchmark_dataset(root_dir: str, config_file="config.yml") -> Tuple[Simulator, xr.Dataset]:
+def load_benchmark_dataset(root_dir: str, config_file="config.yml") -> Tuple[Simulator, xr.Dataset, dict]:
config_file = os.path.join(root_dir, config_file)
with open(config_file, mode="r") as f:
config = yaml.load(f)
@@ -150,7 +37,7 @@ def load_benchmark_dataset(root_dir: str, config_file="config.yml") -> Tuple[Sim
benchmark_data.append(data)
benchmark_data = xr.auto_combine(benchmark_data, concat_dim="benchmark", coords="all")

-    return sim, benchmark_data
+    return sim, benchmark_data, benchmark_samples


def plot_benchmark(root_dir: str, config_file="config.yml"):
@@ -163,7 +50,7 @@ def plot_benchmark(root_dir: str, config_file="config.yml"):
plot_dir = os.path.join(root_dir, config["plot_dir"])

print("loading data...", end="", flush=True)
-    sim, benchmark_data = load_benchmark_dataset(root_dir)
+    sim, benchmark_data, benchmark_sample_config = load_benchmark_dataset(root_dir)
benchmark_data.coords["time_elapsed"] = benchmark_data.time_elapsed.cumsum("step")
print("\t[OK]")

@@ -291,10 +178,17 @@ def main():
act_init.add_argument('--batch_size', help='batch size to use for mini-batch SGD', type=int, default=500)
act_init.add_argument('--num_batches', help='number of batches to simulate', type=int, default=4)
act_init.add_argument('--num_conditions', help='number of conditions to simulate', type=int, default=2)
-    # act_init.add_argument('--learning_rate', help='learning rate to use for all optimizers', type=float, default=0.05)
-    act_init.add_argument('--stop_at_step', help='number of steps to run', type=int, default=5000)
+    act_init.add_argument('--learning_rate', help='learning rate to use for all optimizers',
+                          type=float,
+                          nargs='+',
+                          default=0.05)
+    act_init.add_argument('--stopping_criteria', help='number of steps to run', type=int, default=5000)
    act_init.add_argument('--save_checkpoint_steps', help='number of steps to run', type=int, default=100)
    act_init.add_argument('--save_summaries_steps', help='number of steps to run', type=int, default=1)
+    act_init.add_argument('--optim_algo', help='optimization algorithm',
+                          type=str,
+                          nargs='+',
+                          default="gradient_descent")
act_init.add_argument('--export_steps', help='number of steps to run', type=int, default=1)

act_run = subparsers.add_parser('run', help='run a benchmark')
Expand All @@ -317,18 +211,19 @@ def main():
action = args.action
if action == "init":
sim = Simulator(num_observations=args.num_observations, num_features=args.num_features)
-        sim.generate_sample_description(num_batches=args.num_batches, num_confounders=args.num_conditions)
+        sim.generate_sample_description(num_batches=args.num_batches, num_conditions=args.num_conditions)
sim.generate()

init_benchmark(
root_dir=root_dir,
sim=sim,
batch_size=args.batch_size,
-            stop_at_step=args.stop_at_step,
-            # learning_rate=args.learning_rate,
+            stopping_criteria=args.stopping_criteria,
+            learning_rate=args.learning_rate,
save_checkpoint_steps=args.save_checkpoint_steps if args.save_checkpoint_steps > 0 else None,
save_summaries_steps=args.save_summaries_steps if args.save_summaries_steps > 0 else None,
export_steps=args.export_steps if args.export_steps > 0 else None,
+            optim_algo=args.optim_algo
)
elif action == "run":
if args.benchmark_sample is not None:
(diffs for the remaining 26 changed files are not shown)
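
The reworked command-line interface of convergence.py now accepts --learning_rate and --optim_algo with nargs='+', so several values can be handed to init_benchmark at once and turned into one benchmark sample per combination. A small self-contained sketch of that grid expansion, mirroring the logic in base.py (the parameter values are illustrative, not taken from the commit):

import itertools
from collections import OrderedDict

import pandas as pd

# mirrors the cartesian-product expansion inside init_benchmark
kwargs = {"batch_size": [500, 1000], "learning_rate": [0.5, 0.05], "optim_algo": "gradient_descent"}
kvlist = [(k, v) if isinstance(v, (tuple, list)) else (k, [v]) for k, v in kwargs.items()]
od = OrderedDict(sorted(kvlist))
df = pd.DataFrame(list(itertools.product(*od.values())), columns=list(od.keys()))
df.reset_index(inplace=True)
df["working_dir"] = ["idx_%i" % i for i in df["index"]]
print(df[["working_dir", "batch_size", "learning_rate", "optim_algo"]])
# four rows idx_0 .. idx_3, one benchmark sample per (batch_size, learning_rate) combination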
