Skip to content

Commit

Permalink
Merge branch 'det-tst' of https://github.com/gidden/cycamore into gid…
Browse files Browse the repository at this point in the history
…den-det-tst
  • Loading branch information
scopatz committed Feb 9, 2014
2 parents 1e9431e + 726403f commit cf68e22
Show file tree
Hide file tree
Showing 5 changed files with 265 additions and 29 deletions.
12 changes: 12 additions & 0 deletions tests/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,15 @@ Finally, feel free to clean up after yourself
.. code-block:: bash
$ rm *.h5
Nondeterministic Analysis
==========================

An `analysis` python module can assist in analyzing the determinism of Cyclus
output. It does so by running the regression tests some number of times and
analyzing the frequency of nondeterminism of output tables and columns within
those tables. See the module's help:

.. code-block:: bash

    $ python analysis.py -h
122 changes: 122 additions & 0 deletions tests/analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
from __future__ import print_function
from __future__ import division

import sys
import time
import argparse as ap
import subprocess
from collections import defaultdict
from multiprocessing import Pool, Manager, cpu_count

import test_regression as tst

# markers emitted on stdout by cyclus_tools when a deterministic
# comparison finds a difference
diff_tbl = """table is different"""
diff_col = """Column"""

def collect(args):
    """Runs one deterministic regression-test pass and records differences.

    Parameters
    ----------
    args : tuple of (tbl_freq, col_freq)
        tbl_freq : mutable mapping of table name -> number of runs in
            which that table differed (shared Manager dict)
        col_freq : mutable list of (table name, column name) pairs, one
            appended per differing-column report (shared Manager list)

    The regression suite is re-run in a child interpreter with
    deterministic checking enabled; its stdout is scanned for the
    difference markers above.
    """
    tbl_freq, col_freq = args

    # sys.executable guarantees the same interpreter even where no bare
    # "python" is on PATH; universal_newlines decodes stdout to str so the
    # line splitting below works on both py2 and py3
    rtn = subprocess.Popen(
        [sys.executable, "-c",
         "import test_regression as t; " +
         "t.setup(); t.test_regression(check_deterministic=True)"],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        universal_newlines=True)
    out, err = rtn.communicate()

    # tracks the most recently reported differing table; column reports
    # are attributed to it, so guard against a column line arriving first
    tbl_name = None
    for line in out.split("\n"):
        line = line.strip()
        if diff_tbl in line:
            tbl_name = line.split()[0]
            tbl_freq[tbl_name] = \
                tbl_freq[tbl_name] + 1 if tbl_name in tbl_freq else 1
        if diff_col in line and tbl_name is not None:
            col_name = line.split()[1]
            col_freq.append((tbl_name, col_name))

def proxy_lst_to_dict(lst):
    """Fold (table, column) occurrence pairs into nested count dicts.

    Parameters
    ----------
    lst : iterable of (str, str)
        (table name, column name) pairs, one per observed difference

    Returns
    -------
    counts : defaultdict
        counts[table][column] -> number of occurrences; missing keys
        read as zero-count entries
    """
    counts = defaultdict(lambda: defaultdict(int))
    for tbl_name, col_name in lst:
        counts[tbl_name][col_name] += 1
    return counts

def determ_analysis(niter=1000, fname="report"):
    """
    Calls deterministic regression tests for a number of iterations and reports
    findings of nondeterminism to a file.

    Parameters
    ----------
    niter : int
        The number of times to run regression tests
    fname : str
        The output filename to report to
    """
    m = Manager()

    # proxies shared with the worker processes running collect()
    tbl_freq = m.dict()
    col_freq = m.list()

    # leave one core free on multicore machines
    nproc = cpu_count()
    count = nproc if nproc == 1 else nproc - 1
    pool = Pool(count)

    print("Beginning iterations on " + str(nproc) + " processors.")
    args = ((tbl_freq, col_freq) for i in range(niter))
    jobs = pool.map_async(collect, args)
    while not jobs.ready():
        # NOTE(review): _number_left is an undocumented Pool internal that
        # counts pending task *chunks*, so this percentage is approximate
        print('{0:.1%} of jobs left to start.'.format(
            jobs._number_left / niter))
        time.sleep(5.0)
    pool.close()
    pool.join()
    print("Finished iterations.")

    # convert from manager proxies to plain local structures
    col_freq = proxy_lst_to_dict(col_freq)
    tbl_freq = {item[0]: item[1] for item in tbl_freq.items()}

    # normalize counts to frequency strings; .items() works on both py2
    # and py3, unlike the py2-only .iteritems() (this module already
    # imports __future__ print/division for py3 compatibility)
    for tbl, dic in col_freq.items():
        for col, freq in list(dic.items()):
            dic[col] = "{0:.2f}".format(float(freq) / tbl_freq[tbl])
    for k, v in list(tbl_freq.items()):
        tbl_freq[k] = "{0:.2f}".format(float(v) / niter)

    # report
    lines = []
    lines.append("Table values are reported as percent nondeterministic" +
                 " of total runs.\n\n")
    lines.append("Column values are reported as percent nondeterministic" +
                 " of all table nondeterminism occurrences.\n\n")
    if len(tbl_freq) == 0:
        lines.append("No nondeterminism found.")
    for tbl, freq in tbl_freq.items():
        lines.append(tbl + " " + freq + "\n")
        for col, cfreq in col_freq[tbl].items():
            lines.append(" " + col + " " + cfreq + "\n")
    with open(fname, "w") as f:
        f.writelines(lines)

def main():
    """Command-line entry point: parse options and run the analysis."""
    parser = ap.ArgumentParser(
        description="A module for analyzing the determinism of Cyclus output.")
    parser.add_argument(
        '-n', '--niterations', type=int, default=100,
        help='the number of regression test runs to perform')
    parser.add_argument(
        '--report', default='report',
        help='the file to write the report to')
    args = parser.parse_args()
    determ_analysis(args.niterations, args.report)

if __name__ == "__main__":
    main()
120 changes: 112 additions & 8 deletions tests/cyclus_tools.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from tools import check_cmd

from numpy import array_equal
import numpy as np
import tables

import visitors
Expand All @@ -14,13 +14,117 @@ def run_cyclus(cyclus, cwd, in_path, out_path):
cmd = [cyclus, "-o", out_path, "--input-file", in_path]
check_cmd(cmd, cwd, holdsrtn)

def db_comparator(path1, path2):
"""Compares two Cyclus HDF5 databases
Returns:
True or False. In case of False, it prints out the names
and differences in the compared databases.
"""
def compare_nondeterm(path1, path2):
    """Compares two Cyclus HDF5 databases, ignoring the nondeterministic
    assignment of AgentIDs and TransactionIDs.

    Returns
    -------
    rtn : bool
        True if the databases are equivalent once nondeterministic id
        assignments are accounted for.
    """
    walk_one = visitors.HDF5RegressionVisitor(path1).walk()
    walk_two = visitors.HDF5RegressionVisitor(path2).walk()
    return walk_one == walk_two

def compare_determ(path1, path2, verbose=False):
    """Compares two Cyclus HDF5 databases assuming deterministic AgentIDs and
    TransactionIDs.

    Parameters
    ----------
    path1 : str
        path to the first HDF5 database
    path2 : str
        path to the second HDF5 database
    verbose : bool
        if True, print a description of every difference found

    Returns
    -------
    rtn : bool
        True if both databases are identical other than their SimIDs
    """
    db_one = tables.open_file(path1, mode="r")
    db_two = tables.open_file(path2, mode="r")
    try:
        path_one = [node._v_pathname
                    for node in db_one.walk_nodes(classname="Table")]
        path_two = [node._v_pathname
                    for node in db_two.walk_nodes(classname="Table")]

        # Check if databases contain the same tables
        if not np.all(path_one == path_two):
            if verbose:
                print("The number or names of tables in databases are not the same.")
                print(path_one)
                print(path_two)
            return False

        dbs_same = True
        for path in path_one:
            data_one = db_one.get_node(path)[:]
            data_two = db_two.get_node(path)[:]
            # SimIDs legitimately differ between runs, so drop that column
            names = [name for name in data_one.dtype.names if name != "SimID"]
            data_one = data_one[names]
            data_two = data_two[names]

            if np.all(data_one == data_two):
                continue

            dbs_same = False
            if verbose:
                msg = ""
                msg += path.replace("/", "")
                msg += " table is different in the databases.\n"
                msg += determ_err_msg(names, data_one, data_two)
                print(msg)
        return dbs_same
    finally:
        # always release the file handles, even if comparison raises
        db_one.close()
        db_two.close()

def determ_err_msg(names, data_one, data_two):
    """Returns a string describing the deterministic difference between two
    databases.

    Parameters
    ----------
    names : list of str
        column names to compare
    data_one : numpy structured array
        table data from the first database
    data_two : numpy structured array
        table data from the second database

    Returns
    -------
    msg : str
        human-readable description of the differences; empty when every
        listed column compares equal
    """
    msg = ""
    # Investigation of the differences
    # check if the lengths are different
    if len(data_one) != len(data_two):
        msg += "Length mismatch: " + str(len(data_one)) + ", " + str(len(data_two))
    else:
        for name in names:
            column_one = data_one[name]
            column_two = data_two[name]
            # check if data types are the same
            if column_one.dtype != column_two.dtype:
                msg += "Datatypes in column " + name + " are different."
                msg += str(column_one.dtype)
                msg += str(column_two.dtype)
            elif not np.all(column_one == column_two):
                msg += "Column " + name
                diff = np.equal(column_one, column_two)
                # np.where returns a *tuple* of index arrays; the row
                # indices of the mismatches are its first element.  Using
                # len() on the tuple itself would always yield 1, which
                # made the "completely different" branch unreachable and
                # the percentage below meaningless.
                indices = np.where(diff == False)[0]
                # check if the whole column is different
                if len(indices) == len(column_one):
                    msg += " is completely different"
                else:
                    # provide mismatch percentage
                    mismatch = 100 * float(len(indices)) / len(column_one)
                    msg += " has a mismatch of"
                    msg += " {0:.2f}".format(mismatch) + "% \n"
                    msg += "Indices of different objects are:\n"
                    msg += str(indices) + "\n"
                    msg += "The different elements on these indices: \n"
                    msg += str(column_one[indices]) + "\n"
                    msg += str(column_two[indices]) + "\n"
    return msg
26 changes: 19 additions & 7 deletions tests/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
import json
import hashlib
import urllib
import uuid
from nose.tools import assert_true
from cyclus_tools import run_cyclus, db_comparator
from cyclus_tools import run_cyclus, compare_determ, compare_nondeterm

sim_files = {}
fetchdir = "fetch"
Expand All @@ -18,7 +19,9 @@ def setup():
refs = json.load(f)
cyclus_ref = refs[-1]["cyclus-ref"]
cycamore_ref = refs[-1]["cycamore-ref"]
refs = [r for r in refs if r["cyclus-ref"] == cyclus_ref and r["cycamore-ref"] == cycamore_ref]
refs = [r for r in refs
if r["cyclus-ref"] == cyclus_ref
and r["cycamore-ref"] == cycamore_ref]
base_url = "http://regtests.fuelcycle.org/"
for r in refs:
fpath = os.path.join(fetchdir, r["fname"])
Expand All @@ -31,22 +34,31 @@ def setup():
raise RuntimeError("They tooks our data!!! All our rackspace are belong to them.")
sim_files[r["input-file"]] = fpath

def test_regression():
def test_regression(check_deterministic=False):
    """Test for all inputs in sim_files. Checks if reference and current cyclus
    output is the same.

    Parameters
    ----------
    check_deterministic : bool
        If True, also test deterministic equality of simulations

    WARNING: the tests require cyclus executable to be included in PATH
    """
    for root, dirs, files in os.walk("../input"):
        for f in files:
            if f not in sim_files:
                continue

            # unique output name so concurrent runs cannot clobber each
            # other's databases (stale merge leftovers removed: the dead
            # "tmp.h5" assignment and the call to the no-longer-imported
            # db_comparator)
            tmp_file = str(uuid.uuid4()) + ".h5"
            run_cyclus("cyclus", os.getcwd(), os.path.join(root, f), tmp_file)

            if os.path.isfile(tmp_file):
                nondeterm = compare_nondeterm(sim_files[f], tmp_file)
                if check_deterministic:
                    determ = compare_determ(sim_files[f], tmp_file,
                                            verbose=True)
                # clean up before asserting so a failure does not leak
                # the temporary database
                os.remove(tmp_file)

                assert_true(nondeterm)
                if check_deterministic:
                    assert_true(determ)
14 changes: 0 additions & 14 deletions tests/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,17 +95,3 @@ def skip_then_continue(msg=""):
to this function.
"""
raise SkipTest(msg)

#
# Here there be Hackons!
#

# hack to make sure that we are actually in the tests dir when we start running
# tests. This works because this file is imported by many of the other test
# files.
# NOTE(review): this is an import-time side effect — merely importing this
# module changes the process's working directory; confirm no caller relies
# on the previous cwd.
_fdir = os.path.dirname(__file__)
if os.getcwd() != _fdir:
    os.chdir(_fdir)
# drop the temporary name so it does not leak into the module namespace
del _fdir


0 comments on commit cf68e22

Please sign in to comment.