diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 000000000..efb193b59
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,69 @@
+# Python CircleCI 2.0 configuration file
+#
+# Check https://circleci.com/docs/2.0/language-python/ for more details
+#
+general:
+ branches:
+ ignore:
+ - gh-pages
+
+version: 2
+jobs:
+ build:
+ docker:
+ # specify the version you desire here
+ # use `-browsers` suffix for selenium tests, e.g. `3.6.1-browsers`
+ - image: circleci/python:3.6.1
+
+ # Specify service dependencies here if necessary
+ # CircleCI maintains a library of pre-built images
+ # documented at https://circleci.com/docs/2.0/circleci-images/
+ # - image: circleci/postgres:9.4
+
+ working_directory: ~/repo
+
+ steps:
+ - checkout
+ - restore_cache:
+ key: deps1-{{ .Branch }}-{{ checksum "requirements.txt" }}
+
+ - run:
+ name: install dependencies
+ command: |
+ python3 -m venv venv
+ . venv/bin/activate
+ pip install "numpy>=1.12"
+ pip install -r requirements.txt
+ - save_cache:
+ key: deps1-{{ .Branch }}-{{ checksum "requirements.txt" }}
+ paths:
+ - "venv"
+ - run:
+ name: install fftw
+ command: |
+ sudo apt-get update
+ sudo apt-get install -y libfftw3-dev
+ - run:
+ name: install EQcorrscan
+ command: |
+ . venv/bin/activate
+ python setup.py install
+ python setup.py build_ext --inplace
+ # run tests!
+ - run:
+ name: run tests
+ command: |
+ . venv/bin/activate
+ py.test -m "network" -n 2
+ py.test eqcorrscan/doc/tutorials/*.rst eqcorrscan/doc/submodules/*.rst --cov-append
+
+ - run:
+ name: Upload to codecov
+ command: |
+ . venv/bin/activate
+ ls -a
+ codecov
+
+ - store_artifacts:
+ path: test-reports
+ destination: test-reports
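For context on the `-m "network"` selection in the test step above: pytest selects tests by marker expression, so only tests tagged `network` run on CircleCI. A minimal sketch of how such a marker is attached (the marker name matches the CI invocation; the test body itself is hypothetical, the real ones live under eqcorrscan/tests):

    import pytest

    @pytest.mark.network  # selected on CircleCI by `py.test -m "network" -n 2`
    def test_fetch_remote_waveforms():
        # a network-bound test would download and check data here
        assert True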
diff --git a/.travis.yml b/.travis.yml
index b5104b93b..46b6216b4 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -15,6 +15,7 @@ matrix:
env: PYTHON_VERSION=3.6
- os: osx
+ osx_image: xcode8
env:
- PYTHON_VERSION=3.5
@@ -24,8 +25,9 @@ install:
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
export OS="MacOSX";
export py=$PYTHON_VERSION;
- export CC=gcc-4.9;
- export CXX=g++-4.9;
+ brew install gcc6;
+ export CC=gcc-6;
+ export CXX=g++-6;
brew install fftw;
else
export OS="Linux";
@@ -57,7 +59,7 @@ install:
- conda create -q -n test-environment python=$PYTHON_VERSION colorama numpy scipy matplotlib obspy flake8 mock coverage bottleneck pyproj
- source activate test-environment
- conda install h5py
- - pip install pep8-naming pytest pytest-cov pytest-pep8 pytest-xdist codecov Cython
+ - pip install pep8-naming pytest pytest-cov pytest-pep8 pytest-xdist pytest-rerunfailures codecov Cython
- pip freeze
- conda list
# done installing dependencies
@@ -66,7 +68,9 @@ install:
- python setup.py build_ext --inplace
script:
- - py.test --runslow
+ - py.test --runslow -m "not serial and not network" -n 2
+ - py.test -m "serial and not network" --cov-append
+# - py.test -m "network" --cov-append -n 2
after_success:
# Check how much code is actually tested and send this report to codecov
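The pytest-rerunfailures plugin added to the pip installs above (and to appveyor.yml below) lets flaky, timeout-prone tests be retried automatically. A minimal sketch of the marker the plugin provides; where EQcorrscan actually applies it is not shown in this diff:

    import pytest

    @pytest.mark.flaky(reruns=2)  # re-run up to twice before reporting failure
    def test_occasionally_times_out():
        assert True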
diff --git a/CHANGES.md b/CHANGES.md
index 07bc02a1a..4ba33bc2e 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,38 @@
+## Current
+* Added the ability to change the correlation functions used in detection
+ methods through the parameter xcorr_func of match_filter, Template.detect
+ and Tribe.detect, or using the set_xcorr context manager in
+ the utils.correlate module (a usage sketch follows this file's diff).
+ Supported options are:
+ * numpy
+ * fftw
+ * time-domain
+ * or passing a function that implements the xcorr interface.
+* Added the ability to change the concurrency strategy of xcorr functions
+ using the parameter concurrency of match_filter, Template.detect
+ and Tribe.detect. Supported options are:
+ * None - for single-threaded execution in a single process
+ * multithread - for multi-threaded execution
+ * multiprocess - for multiprocess execution
+ * concurrent - allows functions to describe their own preferred concurrency
+ methods, defaults to multithread
+* Changed debug printing output; it should be a little quieter;
+* Speed-up time-domain correlations using a threaded C-routine - separate
+ from the frequency-domain C-routines;
+* Expose useful parallel options for all correlation routines;
+* Expose cores argument for match-filter objects to allow limits to be placed
+ on how much of your machine is used;
+* Limit number of workers created during pre-processing to never be more than
+ the number of traces in the stream being processed;
+* Implement openMP parallelisation of cross-correlation sum routines - memory
+ consumption reduced by using shared memory, and by computing the
+ cross-correlation sums rather than individual channel cross-correlations.
+ This also leads to a speed-up. This routine is the default concurrent
+ correlation routine;
+* Test examples in rst doc files to ensure they are up-to-date;
+* Tests that were prone to timeout issues have been migrated to run on
+ CircleCI, to allow quick re-starting of failures that are not due to code
+ errors.
+
+
## 0.2.5
* Fix bug with \_group_process that resulted in stalled processes.
* Force NumPy version
@@ -17,19 +52,19 @@ template;
option (previously only used for debugging in development);
* Increase test coverage in lag_calc;
* Speed-up tests for brightness;
-* Increase test coverage for match_filter including testing io of
+* Increase test coverage for match_filter including testing io of
detections;
* Increase subspace test coverage for edge cases;
* Speed-up catalog_to_dd_tests;
* Lag-calc will pick S-picks on channels ending E, N, 1 and 2, change
from only picking on E and N before; warning added to docs;
* Add full tests for pre-processing;
-* Run tests in parallel on ci, speed-up tests dramatically;
-* Rename singular-value decomposition functions (with depreciation
+* Run tests in parallel on ci, speed-up tests dramatically;
+* Rename singular-value decomposition functions (with depreciation
warnings);
* Rename SVD_moments to lower-case and add depreciation warning;
* Increase test coverage in utils.mag_calc;
-* Add Template, Tribe, Family, Party objects and rename DETECTION to
+* Add Template, Tribe, Family, Party objects and rename DETECTION to
Detection;
* Template objects maintain meta-data associated with their creation
to stream-line processing of data (e.g. reduce chance of using the
@@ -45,10 +80,10 @@ Detection;
* The Party object is a container for many Family objects.
* Family objects are containers for detections from the same
Template.
- * Family and Party objects have a lag_calc method which computes
+ * Family and Party objects have a lag_calc method which computes
the cross-correlation pick-refinements.
* The upshot of this is that it is possible to, in one line,
- generate a Tribe of templates, compute their matched-filter
+ generate a Tribe of templates, compute their matched-filter
detections, and generate cross-correlation pick refinements, which
output Event objects, which can be written to a catalog:
Tribe.construct(method, **kwargs).detect(st, **kwargs).lag_calc(**kwargs).write()
@@ -81,33 +116,33 @@ fewer traces than template;
## 0.1.6
* Fix bug introduced in version 0.1.5 for match_filter where looping
-through multiple templates did not correctly match image and template
+through multiple templates did not correctly match image and template
data: 0.1.5 fix did not work;
* Bug-fix in catalog_to_dd for events without magnitudes;
* Amend match-filter to not edit the list of template names in place.
-Previously, if a template was not used (due to no matching continuous
-data) then the name of the template was removed: this now copies the
+Previously, if a template was not used (due to no matching continuous
+data) then the name of the template was removed: this now copies the
list of template_names internally and does not change the external list.
## 0.1.5
* Migrate coverage to codecov;
* Fix bug introduced in version 0.1.5 for match_filter where looping
-through multiple templates did not correctly match image and template
+through multiple templates did not correctly match image and template
data.
## 0.1.4
* Bug-fix in plot_repicked removed where data were not normalized
properly;
-* Bug-fix in lag_calc where data were missing in the continuous data
+* Bug-fix in lag_calc where data were missing in the continuous data
fixed (this led to incorrect picks, **major bug!**);
* Output cross-channel correlation sum in lag-calc output;
* Add id to DETECTION objects, which is consistent with the events
within DETECTION objects and catalog output, and used in lag_calc to
allow linking of detections to catalog events;
-* Add lots of logging and error messages to lag-calc to ensure user
+* Add lots of logging and error messages to lag-calc to ensure user
understands limits;
* Add error to day-proc to ensure user is aware of risks of padding;
-* Change utils.pre_processing.process to accept different length of
+* Change utils.pre_processing.process to accept different length of
data enforcement, not just full day (allow for overlap in processing,
which might be useful for reducing day start and end effects);
* Bug-fix in mag_calc.amp_pick_event, broke loop if data were missing;
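The usage sketch referenced in the changelog above: selecting a registered correlation backend and a concurrency strategy through match_filter. This mirrors the doctest added to match_filter.py later in this diff; 'fftw' and 'multithread' are the option names listed in the changelog, and the threshold values are illustrative only:

    import numpy as np
    import obspy
    from eqcorrscan.core.match_filter import match_filter

    random = np.random.RandomState(42)
    template = obspy.read()  # obspy's bundled demo stream
    stream = obspy.read()
    for tr in stream:  # embed the template within synthetic noise
        data = tr.data
        tr.data = random.randn(6000) * 5
        tr.data[100: 100 + len(data)] = data
    detections = match_filter(
        template_names=['demo'], template_list=[template], st=stream,
        threshold=0.5, threshold_type='absolute', trig_int=1, plotvar=False,
        xcorr_func='fftw', concurrency='multithread')

Per the first changelog entry, the same backend can instead be selected for a whole block of calls with the set_xcorr context manager in utils.correlate.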
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index aa6755f67..663f8df42 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -1,5 +1,7 @@
-Calum John Chamberlain
-Chet Hopp
-Emily Warren-Smith
-Konstantinos Michailos
-Shanna Chu
+* Calum John Chamberlain
+* Chet Hopp
+* Emily Warren-Smith
+* Konstantinos Michailos
+* Shanna Chu
+* Derrick Chambers
+* Chris Scott
diff --git a/MANIFEST.in b/MANIFEST.in
index f2946ad13..17bdee047 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -5,6 +5,7 @@ include README.md
include LICENCE.txt
include CHANGES.md
include CONTRIBUTORS.md
+include requirements.txt
# include eqcorrscan/test_data/tutorial_data.tgz
# include eqcorrscan/tutorial.py
recursive-include eqcorrscan/utils *.py
@@ -16,6 +17,7 @@ recursive-include eqcorrscan/core *.py
recursive-include eqcorrscan/tests *.py
include eqcorrscan/lib/libutils.def
include eqcorrscan/lib/multi_corr.c
+include eqcorrscan/lib/time_corr.c
# exclude rules
# global-exclude *.pyc *.ms *.tgz
diff --git a/README.md b/README.md
index d824bd196..c082d426c 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,26 @@
# EQcorrscan
## A python package for the detection and analysis of repeating and near-repeating earthquakes.
-[![Join the chat at https://gitter.im/eqcorrscan/EQcorrscan](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/eqcorrscan/EQcorrscan?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
-[![TravisCIStatus](https://travis-ci.org/eqcorrscan/EQcorrscan.svg?branch=master)](https://travis-ci.org/eqcorrscan/EQcorrscan)
-[![Build status](https://ci.appveyor.com/api/projects/status/b0924mp0uwwyap3d/branch/master?svg=true)](https://ci.appveyor.com/project/calum-chamberlain/eqcorrscan-jsycv/branch/master)
-[![codecov](https://codecov.io/gh/eqcorrscan/EQcorrscan/branch/master/graph/badge.svg)](https://codecov.io/gh/eqcorrscan/EQcorrscan)
-[![DOI](https://zenodo.org/badge/35918157.svg)](https://zenodo.org/badge/latestdoi/35918157)
-[![DocumentationStatus](http://readthedocs.org/projects/eqcorrscan/badge/?version=latest)](http://eqcorrscan.readthedocs.org/en/latest/?badge=latest)
-[![Dependency Status](https://dependencyci.com/github/eqcorrscan/EQcorrscan/badge)](https://dependencyci.com/github/eqcorrscan/EQcorrscan)
-[![Stories in Ready](https://badge.waffle.io/eqcorrscan/EQcorrscan.png?label=ready&title=Ready)](http://waffle.io/eqcorrscan/EQcorrscan)
+# Citation:
+We have a manuscript in review; if you make use of EQcorrscan please cite the following paper:
+
+Chamberlain, C. J., Hopp, C. J., Boese, C. M., Warren-Smith, E., Chambers, D., Chu, S. X., Michailos, K., Townend, J., EQcorrscan: Repeating and near-repeating earthquake detection and analysis in Python. Seismological Research Letters, *in review*.
+
+You should also cite the version number you used: [![DOI](https://zenodo.org/badge/35918157.svg)](https://zenodo.org/badge/latestdoi/35918157)
+
+# Test status
+Note that tests for Travis and AppVeyor are run daily on master as cron jobs, and may reflect time-out issues.
+
+| Service tests | Badge |
+|---------------|-------|
+| OSX & Linux | [![TravisCIStatus](https://travis-ci.org/eqcorrscan/EQcorrscan.svg?branch=master)](https://travis-ci.org/eqcorrscan/EQcorrscan)
+| Windows | [![Build status](https://ci.appveyor.com/api/projects/status/b0924mp0uwwyap3d/branch/master?svg=true)](https://ci.appveyor.com/project/calum-chamberlain/eqcorrscan-jsycv/branch/master)
+| Code coverage | [![codecov](https://codecov.io/gh/eqcorrscan/EQcorrscan/branch/master/graph/badge.svg)](https://codecov.io/gh/eqcorrscan/EQcorrscan)
+| Documentation | [![DocumentationStatus](http://readthedocs.org/projects/eqcorrscan/badge/?version=latest)](http://eqcorrscan.readthedocs.org/en/latest/?badge=latest)
+| Dependency status | [![Dependency Status](https://dependencyci.com/github/eqcorrscan/EQcorrscan/badge)](https://dependencyci.com/github/eqcorrscan/EQcorrscan)
+| Network tests | [![CircleCI](https://circleci.com/gh/eqcorrscan/EQcorrscan/tree/master.svg?style=svg)](https://circleci.com/gh/eqcorrscan/EQcorrscan/tree/master)
+| Issues ready | [![Stories in Ready](https://badge.waffle.io/eqcorrscan/EQcorrscan.png?label=ready&title=Ready)](http://waffle.io/eqcorrscan/EQcorrscan)
+| Chat on gitter | [![Join the chat at https://gitter.im/eqcorrscan/EQcorrscan](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/eqcorrscan/EQcorrscan?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
# Installation
Installation has been tested on both OSX and Linux (Ubuntu), and
@@ -17,6 +29,9 @@ Note that, although we support Windows, EQcorrscan is optimized for
linux style distributions, and the developers are not extensive Windows
users.
+*OSX with gcc-4.9 from homebrew doesn't appear to compile properly; all other gcc versions
+ seem to work.*
+
Instructions for installing EQcorrscan and the required dependency, fftw
are linked from the
[docs](http://eqcorrscan.readthedocs.io/en/latest/intro.html#installation)
@@ -79,9 +94,7 @@ LGPL GNU License, Copyright EQcorrscan developers 2015, 2016, 2017.
# Contributing
Please fork this project and work on it there then create a pull request to
-merge back to this main repository. If you are working on a bug-fix then
-use the *develop* branch, otherwise, create a feature branch and work
-on your addition there.
+merge back to this main repository. Please create a branch from *develop*.
When you make changes please run the tests in the test directory to ensure
everything merges with minimum effort. If there is not yet a test to cope
diff --git a/appveyor.yml b/appveyor.yml
index 921a9b976..7970ddf66 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -54,7 +54,7 @@ install:
# Install other pip dependancies
- "pip install -U future"
- - "pip install pytest pytest-pep8 pytest-cov pytest-xdist codecov"
+ - "pip install pytest pytest-pep8 pytest-cov pytest-xdist pytest-rerunfailures codecov"
# list package versions
- "conda list"
# Copy the windows fftw header file
@@ -66,11 +66,13 @@ build: false
test_script:
- "%CMD_IN_ENV% python setup.py build_ext --inplace"
- "%CMD_IN_ENV% python setup.py install"
- - "%CMD_IN_ENV% py.test -n4"
+ - "%CMD_IN_ENV% py.test -n4 -m \"not serial and not network\""
+ - "%CMD_IN_ENV% py.test -m \"serial and not network\" --cov-append"
+# - "%CMD_IN_ENV% py.test -n4 -m \"network\" --cov-append"
after_test:
# - "coverage combine"
- - "powershell copy-item .coverage ..\\.coverage.empty"
- - "cd .."
- - "coverage combine"
+ # - "powershell copy-item .coverage ..\\.coverage.empty"
+ # - "cd .."
+ # - "coverage combine"
- "codecov"
diff --git a/conftest.py b/conftest.py
index 67fe04dd3..17071e019 100644
--- a/conftest.py
+++ b/conftest.py
@@ -1,3 +1,113 @@
+import os
+import shutil
+from os.path import join, dirname
+
+import pytest
+
+# ------------------------- Path constants
+# These are added to the pytest namespace for convenience in finding things,
+# e.g. test data
+
+PKG_PATH = join(dirname(__file__), 'eqcorrscan')
+TEST_PATH = join(PKG_PATH, 'tests')
+TEST_DATA_PATH = join(TEST_PATH, 'test_data')
+
+
+# --------------- pytest configuration
+
+
+def pytest_addoption(parser):
+ parser.addoption("--runslow", action="store_true",
+ help="run slow tests")
+
+
+# ------------------ session fixtures
+
+
+@pytest.fixture(scope='session', autouse=True)
+def clean_up_test_files():
+ """ Clean up the file droppings produced by doc tests.
+ This fixture will run after all other tests have finished. """
+ files_to_kill = [
+ 'test_csv_write.csv',
+ 'test_family.tgz',
+ 'test_quakeml.ml',
+ 'test_tar_write.tgz',
+ 'test_template.tgz',
+ 'test_template_read.tgz',
+ 'test_tribe.tgz',
+ 'test_waveforms.ms',
+ 'mag_calc.out',
+ 'station.dat',
+ 'test_waveform.ms',
+ '01-0410-35L.S201309',
+ '04-0007-55R.S201601',
+ '04-0045-52L.S201601',
+ 'dt.cc',
+ 'dt.cc2',
+ 'dt.ct',
+ 'dt.ct2',
+ 'phase.dat'
+ ]
+
+ yield
+
+ # remove files
+ for fi in files_to_kill:
+ if os.path.isfile(fi):
+ os.remove(fi)
+
+
+@pytest.fixture(scope='session', autouse=True)
+def clean_up_test_directories():
+ """ Clean up the directory droppings produced by doc tests.
+ This fixture will run after all other tests have finished. """
+ directories_to_kill = [
+ 'temp1',
+ 'temp2',
+ 'test_family',
+ 'test_party_out',
+ 'test_tar_write',
+ 'tmp1',
+ ]
+
+ yield
+
+ # remove directories
+ for directory in directories_to_kill:
+ if os.path.isdir(directory):
+ shutil.rmtree(directory)
+
+
+# ------------- add objects to global pytest scope
+
+
+def append_name(list_like):
+ """
+ Decorator to append a function name to an object with an append method.
+
+ :param list_like: Any object with append method
+ :return: func
+ """
+
+ def _append_func_name(func):
+ list_like.append(func.__name__)
+ return func
+
+ return _append_func_name
+
+
+# Add key variables to the pytest namespace. All of these can now be accessed
+# within any test by using getattr notation on the pytest package itself,
+# e.g. pytest.test_path is bound to TEST_PATH
+def pytest_namespace():
+ odict = {'test_path': TEST_PATH,
+ 'test_data_path': TEST_DATA_PATH,
+ 'pkg_path': PKG_PATH,
+ 'append_name': append_name,
+ }
+ return odict
+
# Over-ride the -n auto in travis as this goes of the wall
# import sys
# import os
@@ -11,8 +121,3 @@
# else:
# args[:] = args + ["-n2"]
# print(args)
-
-
-def pytest_addoption(parser):
- parser.addoption("--runslow", action="store_true",
- help="run slow tests")
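As the comment above pytest_namespace notes, tests can reach these constants and helpers by attribute access on the pytest package itself. A short sketch of a consuming test (a hypothetical file; the attribute names come from the dict returned by pytest_namespace in the conftest.py above):

    import os
    import pytest

    def test_namespace_is_wired_up():
        # test_data_path and append_name were injected by conftest.py
        assert os.path.isdir(pytest.test_data_path)
        registry = []
        pytest.append_name(registry)(test_namespace_is_wired_up)
        assert registry == ['test_namespace_is_wired_up']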
diff --git a/eqcorrscan/__init__.py b/eqcorrscan/__init__.py
index 4093ca7c3..41ab960f6 100755
--- a/eqcorrscan/__init__.py
+++ b/eqcorrscan/__init__.py
@@ -7,15 +7,13 @@
GNU Lesser General Public License, Version 3
(https://www.gnu.org/copyleft/lesser.html)
"""
-import sys
import importlib
+import sys
import warnings
__all__ = ['core', 'utils', 'tutorials']
-
-__version__ = '0.2.5'
-
+__version__ = '0.2.6'
# Cope with changes to name-space to remove most of the camel-case
_import_map = {
@@ -35,6 +33,7 @@ class EQcorrscanRestructureAndLoad(object):
"""
Path finder and module loader for transitioning
"""
+
def find_module(self, fullname, path=None):
# Compatibility with namespace paths.
if hasattr(path, "_path"):
@@ -87,4 +86,5 @@ def load_module(self, name):
if __name__ == '__main__':
import doctest
+
doctest.testmod(exclude_empty=True)
diff --git a/eqcorrscan/core/bright_lights.py b/eqcorrscan/core/bright_lights.py
index b10b77d01..f13a17551 100644
--- a/eqcorrscan/core/bright_lights.py
+++ b/eqcorrscan/core/bright_lights.py
@@ -631,6 +631,7 @@ def brightness(stations, nodes, lags, stream, threshold, thresh_type,
import matplotlib.pyplot as plt
plt.ioff()
from eqcorrscan.utils import plotting
+ from eqcorrscan.utils.debug_log import debug_print
# Check that we actually have the correct stations
realstations = []
for station in stations:
@@ -750,9 +751,8 @@ def brightness(stations, nodes, lags, stream, threshold, thresh_type,
# Generate a catalog of detections
# detections_cat = Catalog()
for j, detection in enumerate(detections):
- if debug > 3:
- print('Converting for detection ' + str(j) + ' of ' +
- str(len(detections)))
+ debug_print('Converting for detection %i of %i'
+ % (j, len(detections)), 3, debug)
# Create an event for each detection
event = Event()
# Set up some header info for the event
@@ -791,8 +791,7 @@ def brightness(stations, nodes, lags, stream, threshold, thresh_type,
time=tr.stats.starttime + detect_lag +
detection.detect_time + pre_pick,
onset='emergent', evalutation_mode='automatic'))
- if debug > 0:
- print('Generating template for detection: ' + str(j))
+ debug_print('Generating template for detection: %i' % j, 0, debug)
template = template_gen(
picks=event.picks, st=copy_of_stream, length=template_length,
swin='all')
@@ -811,17 +810,15 @@ def brightness(stations, nodes, lags, stream, threshold, thresh_type,
print('---------------------------------coherence LEVEL: ' +
str(temp_coher))
coherent = True
- elif debug > 0:
- print('Template was incoherent, coherence level: ' +
- str(temp_coher))
+ debug_print('Template was incoherent, coherence level: ' +
+ str(temp_coher), 0, debug)
coherent = False
del copy_of_stream, tr, template
if coherent:
templates.append(obsread(template_name))
nodesout += [node]
good_detections.append(detection)
- elif debug > 0:
- print('No template for you')
+ debug_print('No template for you', 0, debug)
# detections_cat += event
if plotvar:
good_detections = [(cum_net_trace[-1].stats.starttime +
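The edits in this file (and in lag_calc.py and match_filter.py below) replace inline `if debug > n: print(...)` blocks with debug_print from the new eqcorrscan.utils.debug_log module. Judging from the replaced call sites, the helper behaves roughly like this sketch; the real implementation is not part of this diff:

    def debug_print(string, debug_level, print_level):
        """ Print string only when the run-time debug level (print_level)
        exceeds the level assigned to this message (debug_level). """
        if print_level > debug_level:
            print(string)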
diff --git a/eqcorrscan/core/lag_calc.py b/eqcorrscan/core/lag_calc.py
index d242a7af0..473045eee 100644
--- a/eqcorrscan/core/lag_calc.py
+++ b/eqcorrscan/core/lag_calc.py
@@ -26,6 +26,7 @@
from obspy.core.event import ResourceIdentifier, Comment
from eqcorrscan.utils.plotting import plot_repicked, detection_multiplot
+from eqcorrscan.utils.debug_log import debug_print
class LagCalcError(Exception):
@@ -158,8 +159,8 @@ def _channel_loop(detection, template, min_cc, detection_id, interpolate, i,
temp_net = tr.stats.network
temp_sta = tr.stats.station
temp_chan = tr.stats.channel
- if debug > 3:
- print('Working on: %s.%s.%s' % (temp_net, temp_sta, temp_chan))
+ debug_print('Working on: %s.%s.%s' % (temp_net, temp_sta, temp_chan),
+ 3, debug)
image = detection.select(station=temp_sta, channel=temp_chan)
if len(image) == 0:
print('No match in image.')
@@ -193,13 +194,11 @@ def _channel_loop(detection, template, min_cc, detection_id, interpolate, i,
cc_max = np.amax(ccc)
picktime = image[0].stats.starttime + (
np.argmax(ccc) * image[0].stats.delta)
- if debug > 3:
- print('Maximum cross-corr=%s' % cc_max)
+ debug_print('Maximum cross-corr=%s' % cc_max, 3, debug)
checksum += cc_max
used_chans += 1
if cc_max < min_cc:
- if debug > 3:
- print('Correlation below threshold, not used')
+ debug_print('Correlation below threshold, not used', 3, debug)
continue
cccsum += cc_max
# Perhaps weight each pick by the cc val or cc val^2?
@@ -209,9 +208,8 @@ def _channel_loop(detection, template, min_cc, detection_id, interpolate, i,
# Only take the S-pick with the best correlation
elif temp_chan[-1] in horizontal_chans:
phase = 'S'
- if debug > 4:
- print('Making S-pick on: %s.%s.%s' %
- (temp_net, temp_sta, temp_chan))
+ debug_print('Making S-pick on: %s.%s.%s' %
+ (temp_net, temp_sta, temp_chan), 4, debug)
if temp_sta not in s_stachans.keys():
s_stachans[temp_sta] = ((temp_chan, np.amax(ccc),
picktime))
@@ -298,8 +296,7 @@ def _day_loop(detection_streams, template, min_cc, detections,
num_cores = len(detection_streams)
if parallel:
pool = Pool(processes=num_cores)
- if debug > 4:
- print('Made pool of %i workers' % num_cores)
+ debug_print('Made pool of %i workers' % num_cores, 4, debug)
# Parallel generation of events for each detection:
# results will be a list of (i, event class)
results = [pool.apply_async(
@@ -577,8 +574,8 @@ def lag_calc(detections, detect_data, template_names, templates,
template_detections = [detection for detection in detections
if detection.template_name == template[0]]
t_delays = [d for d in delays if d[0] == template[0]][0][1]
- if debug > 2:
- print('There are %i detections' % len(template_detections))
+ debug_print(
+ 'There are %i detections' % len(template_detections), 2, debug)
detect_streams = _prepare_data(
detect_data=detect_data, detections=template_detections,
template=template, delays=t_delays, shift_len=shift_len,
diff --git a/eqcorrscan/core/match_filter.py b/eqcorrscan/core/match_filter.py
index 0e332654c..0271fbf2f 100644
--- a/eqcorrscan/core/match_filter.py
+++ b/eqcorrscan/core/match_filter.py
@@ -18,32 +18,43 @@
from __future__ import print_function
from __future__ import unicode_literals
-import numpy as np
-
-import warnings
import ast
-import os
-import time
+import contextlib
import copy
import getpass
+import glob
+import os
import re
-import tarfile
import shutil
+import tarfile
import tempfile
-import glob
-
-from multiprocessing import cpu_count
+import time
+import warnings
from collections import Counter
+from os.path import join
+
+import numpy as np
from obspy import Trace, Catalog, UTCDateTime, Stream, read, read_events
-from obspy.core.event import Event, Pick, CreationInfo, ResourceIdentifier
from obspy.core.event import Comment, WaveformStreamID
+from obspy.core.event import Event, Pick, CreationInfo, ResourceIdentifier
+from eqcorrscan.core import template_gen
+from eqcorrscan.core.lag_calc import lag_calc
+from eqcorrscan.utils.catalog_utils import _get_origin
+from eqcorrscan.utils.correlate import get_array_xcorr, get_stream_xcorr
+from eqcorrscan.utils.debug_log import debug_print
from eqcorrscan.utils.findpeaks import find_peaks2_short, decluster
from eqcorrscan.utils.plotting import cumulative_detections
from eqcorrscan.utils.pre_processing import dayproc, shortproc
-from eqcorrscan.utils.catalog_utils import _get_origin
-from eqcorrscan.core import template_gen
-from eqcorrscan.core.lag_calc import lag_calc
+
+
+@contextlib.contextmanager
+def temporary_directory():
+    """ Make a temporary directory, yield its name, then clean up on exit. """
+    dir_name = tempfile.mkdtemp()
+    try:
+        yield dir_name
+    finally:
+        if os.path.exists(dir_name):
+            shutil.rmtree(dir_name)
def _spike_test(stream, percent=0.99, multiplier=1e6):
@@ -75,6 +86,7 @@ class MatchFilterError(Exception):
"""
Default error for match-filter errors.
"""
+
def __init__(self, value):
"""
Raise error.
@@ -111,6 +123,7 @@ class Party(object):
"""
Container for multiple Family objects.
"""
+
def __init__(self, families=None):
"""Instantiate the Party object."""
self.families = []
@@ -511,7 +524,7 @@ def decluster(self, trig_int, timing='detect', metric='avg_cor'):
detect_info = [(d[1], _total_microsec(d[0].datetime, min_det.datetime))
for d in detect_info]
peaks_out, inds_out = decluster(
- peaks=detect_info, trig_int=trig_int * 10**6, return_ind=True)
+ peaks=detect_info, trig_int=trig_int * 10 ** 6, return_ind=True)
# Trig_int must be converted from seconds to micro-seconds
declustered_detections = [all_detections[ind] for ind in inds_out]
# Convert this list into families
@@ -591,26 +604,26 @@ def write(self, filename, format='tar'):
for detection in family.detections:
detection.write(fname=filename, append=True)
elif format.lower() == 'tar':
- if os.path.isdir(filename) or os.path.isfile(filename):
+ if os.path.exists(filename):
raise IOError('Will not overwrite existing file: %s'
% filename)
- os.makedirs(filename)
- Tribe([f.template for f in self.families]).write(
- filename=filename, compress=False)
- all_cat = Catalog()
- for family in self.families:
- all_cat += family.catalog
- if not len(all_cat) == 0:
- all_cat.write(filename + os.sep + 'catalog.xml',
- format='QUAKEML')
- for i, family in enumerate(self.families):
- print('Writing family %i' % i)
- _write_family(
- family=family, filename=filename + os.sep +
- family.template.name + '_detections.csv')
- with tarfile.open(filename + '.tgz', "w:gz") as tar:
- tar.add(filename, arcname=os.path.basename(filename))
- shutil.rmtree(filename)
+ with temporary_directory() as temp_dir:
+ Tribe([f.template for f in self.families]).write(
+ filename=temp_dir, compress=False)
+ all_cat = Catalog()
+ for family in self.families:
+ all_cat += family.catalog
+ if not len(all_cat) == 0:
+ all_cat.write(join(temp_dir, 'catalog.xml'),
+ format='QUAKEML')
+ for i, family in enumerate(self.families):
+ print('Writing family %i' % i)
+ name = family.template.name + '_detections.csv'
+ name_to_write = join(temp_dir, name)
+ _write_family(family=family, filename=name_to_write)
+ with tarfile.open(filename + '.tgz', "w:gz") as tar:
+ tar.add(temp_dir, arcname=os.path.basename(filename))
else:
warnings.warn('Writing only the catalog component, metadata '
'will not be preserved')
@@ -646,10 +659,8 @@ def read(self, filename=None):
all_cat = read_events(party_dir + os.sep + 'catalog.xml')
else:
all_cat = Catalog()
- for family_file in glob.glob(party_dir + os.sep + '*_detections.csv'):
- template = [t for t in tribe
- if t.name == family_file.split(os.sep)[-1].
- split('_detections.csv')[0]]
+ for family_file in glob.glob(join(party_dir, '*_detections.csv')):
+ template = [t for t in tribe if _templates_match(t, family_file)]
if len(template) == 0:
raise MatchFilterError(
'Missing template for detection file: ' + family_file)
@@ -754,7 +765,7 @@ def lag_calc(self, stream, pre_processed, shift_len=0.2, min_cc=0.4,
new_group = [master.template.copy()]
new_det_group = copy.deepcopy(master.detections)
for slave in self.families:
- if master.template.same_processing(slave.template) and\
+ if master.template.same_processing(slave.template) and \
master.template != slave.template:
slave_chans = [
(tr.stats.station,
@@ -799,7 +810,7 @@ def lag_calc(self, stream, pre_processed, shift_len=0.2, min_cc=0.4,
temp_cat = lag_calc(
detections=det_group, detect_data=processed_stream,
template_names=[t.name for t in group],
- templates=[t.st for t in group], shift_len=shift_len,
+ templates=[t.st for t in group], shift_len=shift_len,
min_cc=min_cc, horizontal_chans=horizontal_chans,
vertical_chans=vertical_chans, cores=cores,
interpolate=interpolate, plot=plot, parallel=parallel,
@@ -874,6 +885,7 @@ class Family(object):
:param catalog:
Catalog of detections, with information for the individual detections.
"""
+
def __init__(self, template, detections=None, catalog=None):
"""Instantiation of Family object."""
self.template = template
@@ -1397,6 +1409,7 @@ class Template(object):
Template object holder. Contains waveform data and metadata parameters
used to generate the template.
"""
+
def __init__(self, name=None, st=None, lowcut=None, highcut=None,
samp_rate=None, filt_order=None, process_length=None,
prepick=None, event=None):
@@ -1423,11 +1436,11 @@ def __init__(self, name=None, st=None, lowcut=None, highcut=None,
self.prepick = prepick
if event is not None:
if "eqcorrscan_template_" + temp_name not in \
- [c.text for c in event.comments]:
+ [c.text for c in event.comments]:
event.comments.append(Comment(
- text="eqcorrscan_template_" + temp_name,
- creation_info=CreationInfo(agency='eqcorrscan',
- author=getpass.getuser())))
+ text="eqcorrscan_template_" + temp_name,
+ creation_info=CreationInfo(agency='eqcorrscan',
+ author=getpass.getuser())))
self.event = event
def __repr__(self):
@@ -1516,8 +1529,9 @@ def __eq__(self, other):
"""
for key in self.__dict__.keys():
if key == 'st':
- if isinstance(self.st, Stream) and \
- isinstance(other.st, Stream):
+ self_is_stream = isinstance(self.st, Stream)
+ other_is_stream = isinstance(other.st, Stream)
+ if self_is_stream and other_is_stream:
for tr, oth_tr in zip(self.st.sort(),
other.st.sort()):
if not np.array_equal(tr.data, oth_tr.data):
@@ -1528,24 +1542,20 @@ def __eq__(self, other):
'calib']:
if tr.stats[trkey] != oth_tr.stats[trkey]:
return False
- elif isinstance(
- self.st, Stream) and not isinstance(other.st, Stream):
+ elif self_is_stream and not other_is_stream:
return False
- elif not isinstance(
- self.st, Stream) and isinstance(other.st, Stream):
+ elif not self_is_stream and other_is_stream:
return False
elif key == 'event':
- if isinstance(
- self.event, Event) and isinstance(other.event, Event):
+ self_is_event = isinstance(self.event, Event)
+ other_is_event = isinstance(other.event, Event)
+ if self_is_event and other_is_event:
if not _test_event_similarity(
self.event, other.event, verbose=False):
return False
- elif isinstance(
- self.event, Event) and not isinstance(
- other.event, Event):
+ elif self_is_event and not other_is_event:
return False
- elif not isinstance(
- self.event, Event) and isinstance(other.event, Event):
+ elif not self_is_event and other_is_event:
return False
elif not self.__dict__[key] == other.__dict__[key]:
return False
@@ -1683,6 +1693,7 @@ def read(self, filename):
def detect(self, stream, threshold, threshold_type, trig_int, plotvar,
pre_processed=False, daylong=False, parallel_process=True,
+ xcorr_func=None, concurrency=None, cores=None,
ignore_length=False, overlap="calculate", debug=0):
"""
Detect using a single template within a continuous stream.
@@ -1720,6 +1731,18 @@ def detect(self, stream, threshold, threshold_type, trig_int, plotvar,
over other methods.
:type parallel_process: bool
:param parallel_process:
+ :type xcorr_func: str or callable
+ :param xcorr_func:
+ A str of a registered xcorr function or a callable for implementing
+ a custom xcorr function. For more details see
+ :func:`eqcorrscan.utils.correlate.register_array_xcorr`.
+ :type concurrency: str
+ :param concurrency:
+ The type of concurrency to apply to the xcorr function. Options are
+ 'multithread', 'multiprocess', 'concurrent'. For more details see
+ :func:`eqcorrscan.utils.correlate.get_stream_xcorr`.
+ :type cores: int
+ :param cores: Number of workers for processing and detection.
:type ignore_length: bool
:param ignore_length:
If using daylong=True, then dayproc will try check that the data
@@ -1798,11 +1821,12 @@ def detect(self, stream, threshold, threshold_type, trig_int, plotvar,
See tutorials for example.
"""
party = _group_detect(
- templates=[self], stream=stream.copy(), threshold=threshold,
- threshold_type=threshold_type, trig_int=trig_int,
- plotvar=plotvar, pre_processed=pre_processed, daylong=daylong,
- parallel_process=parallel_process,
- ignore_length=ignore_length, overlap=overlap, debug=debug)
+ templates=[self], stream=stream.copy(), threshold=threshold,
+ threshold_type=threshold_type, trig_int=trig_int,
+ plotvar=plotvar, pre_processed=pre_processed, daylong=daylong,
+ parallel_process=parallel_process, xcorr_func=xcorr_func,
+ concurrency=concurrency, cores=cores, ignore_length=ignore_length,
+ overlap=overlap, debug=debug)
return party[0]
def construct(self, method, name, lowcut, highcut, samp_rate, filt_order,
@@ -1897,6 +1921,7 @@ def construct(self, method, name, lowcut, highcut, samp_rate, filt_order,
class Tribe(object):
"""Holder for multiple templates."""
+
def __init__(self, templates=None):
self.templates = []
if isinstance(templates, Template):
@@ -2130,8 +2155,8 @@ def _par_write(self, dirname):
:type dirname: str
:param dirname: Directory to write the parameter file to.
"""
- with open(dirname + '/' +
- 'template_parameters.csv', 'w') as parfile:
+ filename = dirname + '/' + 'template_parameters.csv'
+ with open(filename, 'w') as parfile:
for template in self.templates:
for key in template.__dict__.keys():
if key not in ['st', 'event']:
@@ -2224,7 +2249,8 @@ def cluster(self, method, **kwargs):
return tribes
def detect(self, stream, threshold, threshold_type, trig_int, plotvar,
- daylong=False, parallel_process=True, ignore_length=False,
+ daylong=False, parallel_process=True, xcorr_func=None,
+ concurrency=None, cores=None, ignore_length=False,
group_size=None, overlap="calculate", debug=0):
"""
Detect using a Tribe of templates within a continuous stream.
@@ -2255,6 +2281,18 @@ def detect(self, stream, threshold, threshold_type, trig_int, plotvar,
over other methods.
:type parallel_process: bool
:param parallel_process:
+ :type xcorr_func: str or callable
+ :param xcorr_func:
+ A str of a registered xcorr function or a callable for implementing
+ a custom xcorr function. For more information see:
+ :func:`eqcorrscan.utils.correlate.register_array_xcorr`
+ :type concurrency: str
+ :param concurrency:
+ The type of concurrency to apply to the xcorr function. Options are
+ 'multithread', 'multiprocess', 'concurrent'. For more details see
+ :func:`eqcorrscan.utils.correlate.get_stream_xcorr`
+ :type cores: int
+ :param cores: Number of workers for processing and detection.
:type ignore_length: bool
:param ignore_length:
If using daylong=True, then dayproc will try check that the data
@@ -2284,6 +2322,10 @@ def detect(self, stream, threshold, threshold_type, trig_int, plotvar,
.. Note::
`stream` must not be pre-processed.
+ .. warning::
+ Picks included in the output Party.get_catalog() will not be
+ corrected for pre-picks in the template.
+
.. warning::
Plotting within the match-filter routine uses the Agg backend
with interactive plotting turned off. This is because the function
@@ -2371,6 +2413,7 @@ def detect(self, stream, threshold, threshold_type, trig_int, plotvar,
threshold_type=threshold_type, trig_int=trig_int,
plotvar=plotvar, group_size=group_size, pre_processed=False,
daylong=daylong, parallel_process=parallel_process,
+ xcorr_func=xcorr_func, concurrency=concurrency, cores=cores,
ignore_length=ignore_length, overlap=overlap, debug=debug)
party += group_party
for family in party:
@@ -2381,7 +2424,8 @@ def detect(self, stream, threshold, threshold_type, trig_int, plotvar,
def client_detect(self, client, starttime, endtime, threshold,
threshold_type, trig_int, plotvar, daylong=False,
- parallel_process=True, ignore_length=False,
+ parallel_process=True, xcorr_func=None,
+ concurrency=None, cores=None, ignore_length=False,
group_size=None, debug=0, return_stream=False):
"""
Detect using a Tribe of templates within a continuous stream.
@@ -2416,6 +2460,18 @@ def client_detect(self, client, starttime, endtime, threshold,
over other methods.
:type parallel_process: bool
:param parallel_process:
+ :type xcorr_func: str or callable
+ :param xcorr_func:
+ A str of a registered xcorr function or a callable for implementing
+ a custom xcorr function. For more information see:
+ :func:`eqcorrscan.utils.correlate.register_array_xcorr`
+ :type concurrency: str
+ :param concurrency:
+ The type of concurrency to apply to the xcorr function. Options are
+ 'multithread', 'multiprocess', 'concurrent'. For more details see
+ :func:`eqcorrscan.utils.correlate.get_stream_xcorr`
+ :type cores: int
+ :param cores: Number of workers for processing and detection.
:type ignore_length: bool
:param ignore_length:
If using daylong=True, then dayproc will try check that the data
@@ -2439,6 +2495,10 @@ def client_detect(self, client, starttime, endtime, threshold,
:class:`eqcorrscan.core.match_filter.Party` of Families of
detections.
+ .. warning::
+ Picks included in the output Party.get_catalog() will not be
+ corrected for pre-picks in the template.
+
.. Note::
Ensures that data overlap between loops, which will lead to no
missed detections at data start-stop points (see note for
@@ -2498,25 +2558,25 @@ def client_detect(self, client, starttime, endtime, threshold,
for template in self.templates:
for tr in template.st:
if tr.stats.network not in [None, '']:
- chan_id = (tr.stats.network, )
+ chan_id = (tr.stats.network,)
else:
- chan_id = ('*', )
+ chan_id = ('*',)
if tr.stats.station not in [None, '']:
- chan_id += (tr.stats.station, )
+ chan_id += (tr.stats.station,)
else:
- chan_id += ('*', )
+ chan_id += ('*',)
if tr.stats.location not in [None, '']:
- chan_id += (tr.stats.location, )
+ chan_id += (tr.stats.location,)
else:
- chan_id += ('*', )
+ chan_id += ('*',)
if tr.stats.channel not in [None, '']:
if len(tr.stats.channel) == 2:
chan_id += (tr.stats.channel[0] + '?' +
- tr.stats.channel[-1], )
+ tr.stats.channel[-1],)
else:
- chan_id += (tr.stats.channel, )
+ chan_id += (tr.stats.channel,)
else:
- chan_id += ('*', )
+ chan_id += ('*',)
template_channel_ids.append(chan_id)
template_channel_ids = list(set(template_channel_ids))
if return_stream:
@@ -2541,7 +2601,8 @@ def client_detect(self, client, starttime, endtime, threshold,
stream=st, threshold=threshold,
threshold_type=threshold_type, trig_int=trig_int,
plotvar=plotvar, daylong=daylong,
- parallel_process=parallel_process,
+ parallel_process=parallel_process, xcorr_func=xcorr_func,
+ concurrency=concurrency, cores=cores,
ignore_length=ignore_length, group_size=group_size,
overlap=None, debug=debug)
if return_stream:
@@ -2589,6 +2650,11 @@ def construct(self, method, lowcut, highcut, samp_rate, filt_order,
Methods: `from_contbase`, `from_sfile` and `from_sac` are not
supported by Tribe.construct and must use Template.construct.
+ .. Note::
+ The method `multi_template_gen` is not supported because the
+ processing parameters for the stream are not known. Use
+ `from_meta_file` instead.
+
.. Note:: Templates will be named according to their start-time.
"""
if method in ['from_contbase', 'from_sfile', 'from_sac']:
@@ -2611,7 +2677,7 @@ def construct(self, method, lowcut, highcut, samp_rate, filt_order,
print('Empty Template')
continue
t.st = template
- t.name = template.sort(['starttime'])[0].\
+ t.name = template.sort(['starttime'])[0]. \
stats.starttime.strftime('%Y_%m_%dt%H_%M_%S')
t.lowcut = lowcut
t.highcut = highcut
@@ -2713,19 +2779,16 @@ def __str__(self):
def __eq__(self, other):
for key in self.__dict__.keys():
+ self_is_event = isinstance(self.event, Event)
+ other_is_event = isinstance(other.event, Event)
if key == 'event':
- if isinstance(self.event, Event) and \
- isinstance(other.event, Event):
+ if self_is_event and other_is_event:
if not _test_event_similarity(
self.event, other.event, verbose=False):
return False
- elif isinstance(
- self.event, Event) and not isinstance(
- other.event, Event):
+ elif self_is_event and not other_is_event:
return False
- elif not isinstance(
- self.event, Event) and isinstance(
- other.event, Event):
+ elif not self_is_event and other_is_event:
return False
elif self.__dict__[key] != other.__dict__[key]:
return False
@@ -2814,7 +2877,18 @@ def _total_microsec(t1, t2):
-31536000000000
"""
td = t1 - t2
- return (td.seconds + td.days * 24 * 3600) * 10**6 + td.microseconds
+ return (td.seconds + td.days * 24 * 3600) * 10 ** 6 + td.microseconds
+
+
+def _templates_match(t, family_file):
+ """
+ Return True if a template's name matches a family file path.
+
+ :type t: Template
+ :type family_file: str
+ :return: bool
+ """
+ return t.name == family_file.split(os.sep)[-1].split('_detections.csv')[0]
def _test_event_similarity(event_1, event_2, verbose=False):
@@ -2864,11 +2938,11 @@ def _test_event_similarity(event_1, event_2, verbose=False):
return False
if arr_1["distance"] and round(
arr_1["distance"]) != round(arr_2["distance"]):
- if verbose:
- print('%s does not match %s for key %s' %
- (arr_1[arr_key], arr_2[arr_key],
- arr_key))
- return False
+ if verbose:
+ print('%s does not match %s for key %s' %
+ (arr_1[arr_key], arr_2[arr_key],
+ arr_key))
+ return False
# Check picks
if len(event_1.picks) != len(event_2.picks):
if verbose:
@@ -2912,8 +2986,8 @@ def _test_event_similarity(event_1, event_2, verbose=False):
if verbose:
print('Channel codes do not match')
return False
- if pick_1[key].channel_code[-1] !=\
- pick_2[key].channel_code[-1]:
+ if pick_1[key].channel_code[-1] != \
+ pick_2[key].channel_code[-1]:
if verbose:
print('Channel codes do not match')
return False
@@ -2940,8 +3014,8 @@ def _test_event_similarity(event_1, event_2, verbose=False):
if verbose:
print('Channel codes do not match')
return False
- if pick_1[key].channel_code[-1] !=\
- pick_2[key].channel_code[-1]:
+ if pick_1[key].channel_code[-1] != \
+ pick_2[key].channel_code[-1]:
if verbose:
print('Channel codes do not match')
return False
@@ -2950,8 +3024,9 @@ def _test_event_similarity(event_1, event_2, verbose=False):
def _group_detect(templates, stream, threshold, threshold_type, trig_int,
plotvar, group_size=None, pre_processed=False, daylong=False,
- parallel_process=True, ignore_length=False,
- overlap="calculate", debug=0):
+ parallel_process=True, xcorr_func=None, concurrency=None,
+ cores=None, ignore_length=False, overlap="calculate",
+ debug=0):
"""
Pre-process and compute detections for a group of templates.
@@ -2997,6 +3072,18 @@ def _group_detect(templates, stream, threshold, threshold_type, trig_int,
over other methods.
:type parallel_process: bool
:param parallel_process:
+ :type xcorr_func: str or callable
+ :param xcorr_func:
+ A str of a registered xcorr function or a callable for implementing
+ a custom xcorr function. For more details see:
+ :func:`eqcorrscan.utils.correlate.register_array_xcorr`
+ :type concurrency: str
+ :param concurrency:
+ The type of concurrency to apply to the xcorr function. Options are
+ 'multithread', 'multiprocess', 'concurrent'. For more details see
+ :func:`eqcorrscan.utils.correlate.get_stream_xcorr`
+ :type cores: int
+ :param cores: Number of workers for processing and correlation.
:type ignore_length: bool
:param ignore_length:
If using daylong=True, then dayproc will try check that the data
@@ -3018,10 +3105,6 @@ def _group_detect(templates, stream, threshold, threshold_type, trig_int,
:return:
:class:`eqcorrscan.core.match_filter.Party` of families of detections.
"""
- if parallel_process:
- ncores = cpu_count()
- else:
- ncores = None
master = templates[0]
# Check that they are all processed the same.
lap = 0.0
@@ -3041,7 +3124,7 @@ def _group_detect(templates, stream, threshold, threshold_type, trig_int,
if not pre_processed:
st = _group_process(
template_group=templates, parallel=parallel_process, debug=debug,
- cores=False, stream=stream, daylong=daylong,
+ cores=cores, stream=stream, daylong=daylong,
ignore_length=ignore_length, overlap=overlap)
else:
warnings.warn('Not performing any processing on the '
@@ -3056,9 +3139,9 @@ def _group_detect(templates, stream, threshold, threshold_type, trig_int,
else:
n_groups = 1
for st_chunk in st:
- if debug > 0:
- print('Computing detections between %s and %s' %
- (st_chunk[0].stats.starttime, st_chunk[0].stats.endtime))
+ debug_print(
+ 'Computing detections between %s and %s' %
+ (st_chunk[0].stats.starttime, st_chunk[0].stats.endtime), 0, debug)
st_chunk.trim(starttime=st_chunk[0].stats.starttime,
endtime=st_chunk[0].stats.endtime)
for tr in st_chunk:
@@ -3077,8 +3160,9 @@ def _group_detect(templates, stream, threshold, threshold_type, trig_int,
detections += match_filter(
template_names=[t.name for t in template_group],
template_list=[t.st for t in template_group], st=st_chunk,
+ xcorr_func=xcorr_func, concurrency=concurrency,
threshold=threshold, threshold_type=threshold_type,
- trig_int=trig_int, plotvar=plotvar, debug=debug, cores=ncores)
+ trig_int=trig_int, plotvar=plotvar, debug=debug, cores=cores)
for template in template_group:
family = Family(template=template, detections=[])
for detection in detections:
@@ -3295,10 +3379,9 @@ def _read_family(fname, all_cat):
if key == 'event':
if len(all_cat) == 0:
continue
- det_dict.update(
- {'event': [e for e in all_cat
- if str(e.resource_id).
- split('/')[-1] == value][0]})
+ el = [e for e in all_cat
+ if str(e.resource_id).split('/')[-1] == value][0]
+ det_dict.update({'event': el})
elif key == 'detect_time':
det_dict.update(
{'detect_time': UTCDateTime(value)})
@@ -3368,7 +3451,7 @@ def read_detections(fname):
continue # Skip any repeated headers
detection = line.rstrip().split('; ')
detection[1] = UTCDateTime(detection[1])
- detection[2] = int(detection[2])
+ detection[2] = int(float(detection[2]))
detection[3] = ast.literal_eval(detection[3])
detection[4] = float(detection[4])
detection[5] = float(detection[5])
@@ -3490,18 +3573,18 @@ def normxcorr2(template, image):
correlation of the image with the template.
:rtype: numpy.ndarray
"""
- from eqcorrscan.utils.correlate import fftw_normxcorr
+ array_xcorr = get_array_xcorr()
# Check that we have been passed numpy arrays
if type(template) != np.ndarray or type(image) != np.ndarray:
print('You have not provided numpy arrays, I will not convert them')
return 'NaN'
if len(template) > len(image):
- ccc = fftw_normxcorr(
+ ccc = array_xcorr(
templates=np.array([image]).astype(np.float32),
stream=template.astype(np.float32), pads=[0],
threaded=False)[0][0]
else:
- ccc = fftw_normxcorr(
+ ccc = array_xcorr(
templates=np.array([template]).astype(np.float32),
stream=image.astype(np.float32), pads=[0], threaded=False)[0][0]
ccc = ccc.reshape((1, len(ccc)))
@@ -3509,7 +3592,8 @@ def normxcorr2(template, image):
def match_filter(template_names, template_list, st, threshold,
- threshold_type, trig_int, plotvar, plotdir='.', cores=None,
+ threshold_type, trig_int, plotvar, plotdir='.',
+ xcorr_func=None, concurrency=None, cores=None,
debug=0, plot_format='png', output_cat=False,
output_event=True, extract_detections=False,
arg_check=True):
@@ -3550,6 +3634,16 @@ def match_filter(template_names, template_list, st, threshold,
:param plotdir:
Path to plotting folder, plots will be output here, defaults to run
location.
+ :type xcorr_func: str or callable
+ :param xcorr_func:
+ A str of a registered xcorr function or a callable for implementing
+ a custom xcorr function. For more information see:
+ :func:`eqcorrscan.utils.correlate.register_array_xcorr`
+ :type concurrency: str
+ :param concurrency:
+ The type of concurrency to apply to the xcorr function. Options are
+ 'multithread', 'multiprocess', 'concurrent'. For more details see
+ :func:`eqcorrscan.utils.correlate.get_stream_xcorr`
:type cores: int
:param cores: Number of cores to use
:type debug: int
@@ -3654,7 +3748,7 @@ def match_filter(template_names, template_list, st, threshold,
length is the number of channels within this template.
.. note::
- The output_cat flag will create an :class:`obspy.core.eventCatalog`
+ The output_cat flag will create an :class:`obspy.core.event.Catalog`
containing one event for each
:class:`eqcorrscan.core.match_filter.Detection`'s generated by
match_filter. Each event will contain a number of comments dealing
@@ -3666,15 +3760,40 @@ def match_filter(template_names, template_list, st, threshold,
prepick times inherent in each template. For example, if a template
trace starts 0.1 seconds before the actual arrival of that phase,
then the pick time generated by match_filter for that phase will be
- 0.1 seconds early. We are working on a solution that will involve
- saving templates alongside associated metadata.
+ 0.1 seconds early.
+ .. Note:: xcorr_func can be used as follows:
+
+ .. rubric:: Example
+ >>> import obspy
+ >>> import numpy as np
+ >>> from eqcorrscan.core.match_filter import match_filter
+ >>> from eqcorrscan.utils.correlate import time_multi_normxcorr
+ >>> # define a custom xcorr function
+ >>> def custom_normxcorr(templates, stream, pads, *args, **kwargs):
+ ... # Just to keep example short call other xcorr function
+ ... # in practice you would define your own function here
+ ... print('calling custom xcorr function')
+ ... return time_multi_normxcorr(templates, stream, pads)
+ >>> # generate some toy templates and stream
+ >>> random = np.random.RandomState(42)
+ >>> template = obspy.read()
+ >>> stream = obspy.read()
+ >>> for num, tr in enumerate(stream): # iter stream and embed templates
+ ... data = tr.data
+ ... tr.data = random.randn(6000) * 5
+ ... tr.data[100: 100 + len(data)] = data
+ >>> # call match_filter and ensure the custom function is used
+ >>> detections = match_filter(
+ ... template_names=['1'], template_list=[template], st=stream,
+ ... threshold=.5, threshold_type='absolute', trig_int=1, plotvar=False,
+ ... xcorr_func=custom_normxcorr) # doctest:+ELLIPSIS
+ calling custom xcorr function...
"""
_spike_test(st)
import matplotlib
matplotlib.use('Agg')
from eqcorrscan.utils.plotting import _match_filter_plot
- from eqcorrscan.utils.correlate import multichannel_normxcorr
if arg_check:
# Check the arguments to be nice - if arguments wrong type the parallel
# output for the error won't be useful
@@ -3686,7 +3805,7 @@ def match_filter(template_names, template_list, st, threshold,
raise MatchFilterError('Not the same number of templates as names')
for template in template_list:
if not type(template) == Stream:
- msg = 'template in template_list must be of type: ' +\
+ msg = 'template in template_list must be of type: ' + \
'obspy.core.stream.Stream'
raise MatchFilterError(msg)
if not type(st) == Stream:
@@ -3725,11 +3844,10 @@ def match_filter(template_names, template_list, st, threshold,
data_stachan.append(tr.stats.station + '.' + tr.stats.channel)
template_stachan = list(set(template_stachan))
data_stachan = list(set(data_stachan))
- if debug >= 3:
- print('I have template info for these stations:')
- print(template_stachan)
- print('I have daylong data for these stations:')
- print(data_stachan)
+ debug_print('I have template info for these stations:\n' +
+ template_stachan.__str__() +
+ '\nI have daylong data for these stations:\n' +
+ data_stachan.__str__(), 3, debug)
# Perform a check that the continuous data are all the same length
min_start_time = min([tr.stats.starttime for tr in stream])
max_end_time = max([tr.stats.endtime for tr in stream])
@@ -3752,8 +3870,8 @@ def match_filter(template_names, template_list, st, threshold,
'not currently supported' % _template_names[i])
raise MatchFilterError(msg)
outtic = time.clock()
- if debug >= 2:
- print('Ensuring all template channels have matches in continuous data')
+ debug_print('Ensuring all template channels have matches in'
+ ' continuous data', 2, debug)
template_stachan = {}
# Work out what station-channel pairs are in the templates, including
# duplicate station-channel pairs. We will use this information to fill
@@ -3766,12 +3884,11 @@ def match_filter(template_names, template_list, st, threshold,
tr.stats.location, tr.stats.channel))
stachans_in_template = dict(Counter(stachans_in_template))
for stachan in stachans_in_template.keys():
+ stachans = stachans_in_template[stachan]
if stachan not in template_stachan.keys():
- template_stachan.update({stachan:
- stachans_in_template[stachan]})
+ template_stachan.update({stachan: stachans})
elif stachans_in_template[stachan] > template_stachan[stachan]:
- template_stachan.update({stachan:
- stachans_in_template[stachan]})
+ template_stachan.update({stachan: stachans})
# Remove un-matched channels from templates.
_template_stachan = copy.deepcopy(template_stachan)
for stachan in template_stachan.keys():
@@ -3825,7 +3942,7 @@ def match_filter(template_names, template_list, st, threshold,
network=stachan[0], station=stachan[1], location=stachan[2],
channel=stachan[3]))
if number_of_channels < template_stachan[stachan]:
- missed_channels = template_stachan[stachan] -\
+ missed_channels = template_stachan[stachan] - \
number_of_channels
nulltrace = Trace()
nulltrace.stats.update(
@@ -3846,25 +3963,22 @@ def match_filter(template_names, template_list, st, threshold,
'lengths, report this error.')
templates = _templates
_template_names = used_template_names
- if debug >= 2:
- print('Starting the correlation run for these data')
- if debug >= 3:
- for template in templates:
- print(template)
- print(stream)
+ debug_print('Starting the correlation run for these data', 2, debug)
+ for template in templates:
+ debug_print(template.__str__(), 3, debug)
+ debug_print(stream.__str__(), 3, debug)
+ multichannel_normxcorr = get_stream_xcorr(xcorr_func, concurrency)
[cccsums, no_chans, chans] = multichannel_normxcorr(
templates=templates, stream=stream, cores=cores)
if len(cccsums[0]) == 0:
raise MatchFilterError('Correlation has not run, zero length cccsum')
outtoc = time.clock()
- print(' '.join(['Looping over templates and streams took:',
- str(outtoc - outtic), 's']))
- if debug >= 2:
- print(' '.join(['The shape of the returned cccsums is:',
- str(np.shape(cccsums))]))
- print(' '.join(['This is from', str(len(templates)), 'templates']))
- print(' '.join(['Correlated with', str(len(stream)),
- 'channels of data']))
+ debug_print(' '.join(['Looping over templates and streams took:',
+ str(outtoc - outtic), 's']), 0, debug)
+ debug_print('The shape of the returned cccsums is: %s\n'
+ 'This is from %i templates\nCorrelated with %i channels of '
+ 'data' % (cccsums.shape, len(templates), len(stream)), 2,
+ debug)
detections = []
if output_cat:
det_cat = Catalog()
@@ -3877,9 +3991,9 @@ def match_filter(template_names, template_list, st, threshold,
elif str(threshold_type) == str('av_chan_corr'):
rawthresh = threshold * no_chans[i]
# Findpeaks returns a list of tuples in the form [(cccsum, sample)]
- print(' '.join(['Threshold is set at:', str(rawthresh)]))
- print(' '.join(['Max of data is:', str(max(cccsum))]))
- print(' '.join(['Mean of data is:', str(np.mean(cccsum))]))
+ debug_print("Threshold is set at: %f\nMax of data is %f\nMean of "
+ "data is %f" % (rawthresh, max(cccsum), np.mean(cccsum)),
+ 0, debug)
if np.abs(np.mean(cccsum)) > 0.05:
warnings.warn('Mean is not zero! Check this!')
# Set up a trace object for the cccsum as this is easier to plot and
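For reference, the three threshold types reduce to simple scalings of the raw cross-correlation sum. A hedged sketch - the ``av_chan_corr`` branch is visible in the hunk above, while the ``MAD`` scaling is assumed from the usual median-absolute-deviation definition:

.. code-block:: python

    import numpy as np

    def raw_threshold(threshold, threshold_type, cccsum, n_chans):
        """Sketch of the per-template threshold scaling."""
        if threshold_type == 'MAD':
            # assumed: scale by the median absolute deviation of the cccsum
            return threshold * np.median(np.abs(cccsum))
        elif threshold_type == 'absolute':
            return threshold
        elif threshold_type == 'av_chan_corr':
            # threshold is an average per-channel correlation (as above)
            return threshold * n_chans
        raise NotImplementedError(threshold_type)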
@@ -3890,38 +4004,39 @@ def match_filter(template_names, template_list, st, threshold,
rawthresh=rawthresh, plotdir=plotdir,
plot_format=plot_format, i=i)
if debug >= 4:
- print(' '.join(['Saved the cccsum to:', _template_names[i],
- stream[0].stats.starttime.datetime.
- strftime('%Y%j')]))
np.save(_template_names[i] +
stream[0].stats.starttime.datetime.strftime('%Y%j'),
cccsum)
+ debug_print(
+ ' '.join(['Saved the cccsum to:', _template_names[i],
+ stream[0].stats.starttime.datetime.strftime('%Y%j')]),
+ 4, debug)
tic = time.clock()
if max(cccsum) > rawthresh:
peaks = find_peaks2_short(
arr=cccsum, thresh=rawthresh,
- trig_int=trig_int * stream[0].stats.sampling_rate, debug=debug,
+ trig_int=trig_int * stream[0].stats.sampling_rate,
+ debug=debug,
starttime=stream[0].stats.starttime,
samp_rate=stream[0].stats.sampling_rate)
else:
- print('No peaks found above threshold')
+ debug_print('No peaks found above threshold', 0, debug)
peaks = False
toc = time.clock()
- if debug >= 1:
- print(' '.join(['Finding peaks took:', str(toc - tic), 's']))
+ debug_print('Finding peaks took: %f s' % (toc - tic), 0, debug)
if peaks:
for peak in peaks:
- detecttime = stream[0].stats.starttime +\
- peak[1] / stream[0].stats.sampling_rate
+ detecttime = stream[0].stats.starttime + \
+ peak[1] / stream[0].stats.sampling_rate
# Detect time must be valid QuakeML uri within resource_id.
# This will write a formatted string which is still
# readable by UTCDateTime
if not output_event and not output_cat:
det_ev = None
else:
+ det_time = str(detecttime.strftime('%Y%m%dT%H%M%S.%f'))
ev = Event(resource_id=ResourceIdentifier(
- id=_template_names[i] + '_' +
- str(detecttime.strftime('%Y%m%dT%H%M%S.%f')),
+ id=_template_names[i] + '_' + det_time,
prefix='smi:local'))
ev.creation_info = CreationInfo(
author='EQcorrscan', creation_time=UTCDateTime())
@@ -3936,7 +4051,7 @@ def match_filter(template_names, template_list, st, threshold,
[tr.stats.starttime for tr in template])
for tr in template:
if (tr.stats.station, tr.stats.channel) \
- not in chans[i]:
+ not in chans[i]:
continue
else:
pick_tm = detecttime + (tr.stats.starttime -
@@ -3974,6 +4089,7 @@ def match_filter(template_names, template_list, st, threshold,
if __name__ == "__main__":
import doctest
+
doctest.testmod()
# List files to be removed after doctest
cleanup = ['test_tar_write.tgz', 'test_csv_write.csv', 'test_quakeml.ml',
diff --git a/eqcorrscan/core/subspace.py b/eqcorrscan/core/subspace.py
index cf52a4d97..44d88fc4d 100644
--- a/eqcorrscan/core/subspace.py
+++ b/eqcorrscan/core/subspace.py
@@ -32,6 +32,7 @@
WaveformStreamID, Pick
from eqcorrscan.utils.clustering import svd
+from eqcorrscan.utils.debug_log import debug_print
from eqcorrscan.utils import findpeaks, pre_processing, stacking, plotting
from eqcorrscan.core.match_filter import Detection, extract_from_stream
from eqcorrscan.utils.plotting import subspace_detector_plot, subspace_fc_plot
@@ -477,8 +478,7 @@ def _detect(detector, st, threshold, trig_int, moveout=0, min_trig=0,
detections = []
# First process the stream
if process:
- if debug > 0:
- print('Processing Stream')
+ debug_print('Processing Stream', 0, debug)
stream, stachans = _subspace_process(
streams=[st.copy()], lowcut=detector.lowcut,
highcut=detector.highcut, filt_order=detector.filt_order,
@@ -500,10 +500,8 @@ def _detect(detector, st, threshold, trig_int, moveout=0, min_trig=0,
Nc = 1
# Here do all ffts
fft_vars = _do_ffts(detector, stream, Nc)
- if debug > 0:
- print('Computing detection statistics')
- if debug > 0:
- print('Preallocating stats matrix')
+ debug_print('Computing detection statistics', 0, debug)
+ debug_print('Preallocating stats matrix', 0, debug)
stats = np.zeros((len(stream[0]),
(len(stream[0][0]) // Nc) - (fft_vars[4] // Nc) + 1))
for det_freq, data_freq_sq, data_freq, i in zip(fft_vars[0], fft_vars[1],
@@ -512,8 +510,7 @@ def _detect(detector, st, threshold, trig_int, moveout=0, min_trig=0,
# Calculate det_statistic in frequency domain
stats[i] = _det_stat_freq(det_freq, data_freq_sq, data_freq,
fft_vars[3], Nc, fft_vars[4], fft_vars[5])
- if debug >= 1:
- print('Stats matrix is shape %s' % str(stats[i].shape))
+ debug_print('Stats matrix is shape %s' % str(stats[i].shape), 0, debug)
if debug >= 3:
fig, ax = plt.subplots()
t = np.arange(len(stats[i]))
@@ -526,8 +523,7 @@ def _detect(detector, st, threshold, trig_int, moveout=0, min_trig=0,
plt.title('%s' % str(stream[0][i].stats.station))
plt.show()
trig_int_samples = detector.sampling_rate * trig_int
- if debug > 0:
- print('Finding peaks')
+ debug_print('Finding peaks', 0, debug)
peaks = []
for i in range(len(stream[0])):
peaks.append(findpeaks.find_peaks2_short(
diff --git a/eqcorrscan/core/template_gen.py b/eqcorrscan/core/template_gen.py
index bc2bc5a0e..ce25225ad 100644
--- a/eqcorrscan/core/template_gen.py
+++ b/eqcorrscan/core/template_gen.py
@@ -49,6 +49,7 @@
from obspy import Stream, read, Trace, UTCDateTime, read_events
+from eqcorrscan.utils.debug_log import debug_print
from eqcorrscan.utils.sac_util import sactoevent
from eqcorrscan.utils import pre_processing, sfile_util
@@ -276,9 +277,8 @@ def from_sfile(sfile, lowcut, highcut, samp_rate, filt_order, length, swin,
# Read in waveform file
st = Stream()
for wavefile in wavefiles:
- if debug > 0:
- print(''.join(["I am going to read waveform data from: ", wavpath,
- wavefile]))
+ debug_print(''.join(["I am going to read waveform data from: ",
+ wavpath, wavefile]), 0, debug)
if os.path.isfile(wavpath + wavefile):
# Read from a given directory
st += read(wavpath + wavefile)
@@ -551,25 +551,22 @@ def from_meta_file(meta_file, st, lowcut, highcut, samp_rate, filt_order,
# Check that the event is within the data
for pick in event.picks:
if not data_start < pick.time < data_end:
- if debug > 0:
- print('Pick outside of data span:')
- print('Pick time: ' + str(pick.time))
- print('Start time: ' + str(data_start))
- print('End time: ' + str(data_end))
+ debug_print("Pick outside of data span:\nPick time %s\n"
+ "Start time %s\nEnd time: %s" %
+ (str(pick.time), str(data_start), str(data_end)),
+ 0, debug)
use_event = False
if not use_event:
warnings.warn('Event is not within data time-span')
continue
# Read in pick info
- if debug > 0:
- print("I have found the following picks")
+ debug_print("I have found the following picks", 0, debug)
for pick in event.picks:
if not pick.waveform_id:
print('Pick not associated with waveforms, will not use.')
print(pick)
continue
- if debug > 0:
- print(pick)
+ debug_print(pick, 0, debug)
stations.append(pick.waveform_id.station_code)
channels.append(pick.waveform_id.channel_code)
# Check to see if all picks have a corresponding waveform
@@ -698,11 +695,10 @@ def from_seishub(catalog, url, lowcut, highcut, samp_rate, filt_order,
endtime = starttime + process_len
if not endtime > sub_catalog[-1].origins[0].time + data_pad:
raise IOError('Events do not fit in processing window')
- if debug > 0:
- print('start-time: ' + str(starttime))
- print('end-time: ' + str(endtime))
- print('pick-time: ' + str(pick.time))
- print('.'.join([net, sta, loc, chan]))
+ debug_print('start-time: %s\nend-time: %s\npick-time: %s' %
+ (str(starttime), str(endtime), str(pick.time)),
+ 0, debug)
+ debug_print('.'.join([net, sta, loc, chan]), 0, debug)
if sta in client.waveform.get_station_ids(network=net):
if 'st' not in locals():
st = client.waveform.get_waveform(net, sta, loc, chan,
@@ -817,8 +813,6 @@ def from_client(catalog, client_id, lowcut, highcut, samp_rate, filt_order,
... filt_order=4, length=3.0, prepick=0.15,
... swin='all', process_len=300,
... all_horiz=True)
- BG.CLV..DPZ
- BK.BKS.00.HHZ
Pre-processing data
>>> templates[0].plot(equal_scale=False, size=(800,600)) # doctest: +SKIP
@@ -860,12 +854,11 @@ def from_client(catalog, client_id, lowcut, highcut, samp_rate, filt_order,
if not endtime > sub_catalog[-1].origins[0].time + data_pad:
raise TemplateGenError('Events do not fit in '
'processing window')
- if debug > 0:
- print('start-time: ' + str(starttime))
- print('end-time: ' + str(endtime))
- print('pick-time: ' + str(pick.time))
- print('pick phase: ' + pick.phase_hint)
- print('.'.join([net, sta, loc, chan]))
+ debug_print('start-time: %s\nend-time: %s\npick-time: %s\n'
+ 'pick phase: %s' %
+ (str(starttime), str(endtime), str(pick.time),
+ pick.phase_hint), 0, debug)
+ debug_print('.'.join([net, sta, loc, chan]), 0, debug)
try:
st += client.get_waveforms(net, sta, loc, chan,
starttime, endtime)
@@ -1054,6 +1047,7 @@ def template_gen(picks, st, length, swin='all', prepick=0.05,
.. warning:: If there is no phase_hint included in picks, and swin=all, \
all channels with picks will be used.
"""
+ from eqcorrscan.utils.debug_log import debug_print
from eqcorrscan.utils.plotting import pretty_template_plot as tplot
from eqcorrscan.core.bright_lights import _rms
stations = []
@@ -1147,9 +1141,8 @@ def template_gen(picks, st, length, swin='all', prepick=0.05,
swin in pick.phase_hint.upper():
starttime = pick.time - prepick
if starttime is not None:
- if debug > 0:
- print("Cutting " + tr.stats.station + '.' +
- tr.stats.channel)
+ debug_print("Cutting " + tr.stats.station + '.' +
+ tr.stats.channel, 0, debug)
if not delayed:
starttime = event_start_time
tr_cut = tr.copy().trim(starttime=starttime,
@@ -1163,9 +1156,9 @@ def template_gen(picks, st, length, swin='all', prepick=0.05,
if len(tr_cut.data) == (tr_cut.stats.sampling_rate *
length) + 1:
tr_cut.data = tr_cut.data[0:-1]
- if debug > 0:
- print('Cut starttime = ' + str(tr_cut.stats.starttime))
- print('Cut endtime = ' + str(tr_cut.stats.endtime))
+                    debug_print('Cut starttime = %s\nCut endtime = %s' %
+ (str(tr_cut.stats.starttime),
+ str(tr_cut.stats.endtime)), 0, debug)
if min_snr is not None and \
max(tr_cut.data) / noise_amp < min_snr:
print('Signal-to-noise ratio below threshold for %s.%s' %
@@ -1173,8 +1166,9 @@ def template_gen(picks, st, length, swin='all', prepick=0.05,
continue
st1 += tr_cut
used_tr = True
- if debug > 0 and not used_tr:
- print('No pick for ' + tr.stats.station + '.' + tr.stats.channel)
+ if not used_tr:
+ debug_print('No pick for ' + tr.stats.station + '.' +
+ tr.stats.channel, 0, debug)
if plot:
background = stplot.trim(
st1.sort(['starttime'])[0].stats.starttime - 10,
diff --git a/eqcorrscan/doc/intro.rst b/eqcorrscan/doc/intro.rst
index d62575568..117cbb746 100644
--- a/eqcorrscan/doc/intro.rst
+++ b/eqcorrscan/doc/intro.rst
@@ -30,7 +30,7 @@ get involved the best place to start, and the most valuable thing for your
understanding, and for the health of this package would be to contribute tests and
documentation.
-Installation - Updated for version 0.2.0
+Installation - Updated for version 0.2.x
----------------------------------------
In general we recommend users to install EQcorrscan in a virtual environment,
@@ -42,6 +42,9 @@ environment with the following:
conda create -n eqcorrscan colorama numpy scipy matplotlib obspy bottleneck pyproj
source activate eqcorrscan
+Non-Python dependencies: Ubuntu:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
Prior to installing the python routines you will need to install the fftw
library. On linux use apt (or your default package manager - note you may need
sudo access):
@@ -50,6 +53,9 @@ sudo access):
apt-get install libfftw3-dev
+Non-Python dependencies: OSX:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
For OS-X systems install fftw from conda, available via the menpo channel:
.. code-block:: bash
@@ -62,6 +68,13 @@ not clang. To do this you should also install gcc into your conda environment:
.. code-block:: bash
conda install gcc
+
+Note that you can install fftw and gcc from other sources; however, we know of an
+issue with homebrew gcc-4.9 (but not with macports gcc-4.9) - all other
+gcc versions tested do work.
+
+Non-Python dependencies: Windows:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
For Windows systems you should follow the instructions on the |fftw-windows|
page and use the pre-compiled dynamic libraries. These should be installed
@@ -72,6 +85,9 @@ used), as such, by default, the correlation routines are compiled as serial
workflows on windows. If you have a need for this threading in windows please
get in touch with the developers.
+Final EQcorrscan install:
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
Once you have installed fftw the EQcorrscan install should be as simple as:
.. code-block:: bash
diff --git a/eqcorrscan/doc/submodules/utils.correlate.rst b/eqcorrscan/doc/submodules/utils.correlate.rst
index bc8d3eb1c..28b3e9652 100644
--- a/eqcorrscan/doc/submodules/utils.correlate.rst
+++ b/eqcorrscan/doc/submodules/utils.correlate.rst
@@ -14,8 +14,128 @@ correlate
fftw_multi_normxcorr
fftw_normxcorr
- multichannel_normxcorr
numpy_normxcorr
time_multi_normxcorr
+ get_array_xcorr
+ get_stream_xcorr
+ register_array_xcorr
- .. comment to end block
\ No newline at end of file
+
+ .. comment to end block
+
+All functions within this module (and therefore all correlation functions used
+in EQcorrscan) compute normalised cross-correlations, following the definition
+that Matlab uses for |normxcorr2|.
+
+In the time-domain this is as follows:
+ .. math::
+ c(t) = \frac{\sum_{y}(a_{y} - \overline{a})(b_{t+y} - \overline{b_t})}{\sqrt{\sum_{y}(a_{y} - \overline{a})(a_{y} - \overline{a})\sum_{y}(b_{t+y} - \overline{b_t})(b_{t+y} - \overline{b_t})}}
+
+where :math:`c(t)` is the normalised cross-correlation at time :math:`t`,
+:math:`a` is the template window which has length :math:`y`, :math:`b` is the
+continuous data, and :math:`\overline{a}` is the mean of the template and
+:math:`\overline{b_t}` is the local mean of the continuous data.
+
+In practice the correlations remove the mean of the template only once, but the
+mean of the continuous data is removed at every time-step. Without doing this
+(i.e. removing only a global mean), correlations are affected by jumps in average
+amplitude, which are most noticeable during aftershock sequences, or when using
+short templates.
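
A direct (and deliberately slow) reference implementation of this definition, for illustration only - the library routines described below are heavily optimised equivalents:

.. code-block:: python

    import numpy as np

    def normxcorr_reference(template, image):
        """Normalised cross-correlation following the definition above."""
        y = len(template)
        a = template - template.mean()       # template mean removed once
        steps = len(image) - y + 1
        ncc = np.empty(steps)
        for t in range(steps):
            b = image[t:t + y]
            b = b - b.mean()                 # local mean removed each step
            ncc[t] = (a * b).sum() / np.sqrt((a * a).sum() * (b * b).sum())
        return ncc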
+
+In the frequency domain (functions :func:`eqcorrscan.utils.correlate.numpy_normxcorr`,
+:func:`eqcorrscan.utils.correlate.fftw_normxcorr`, :func:`eqcorrscan.utils.correlate.fftw_multi_normxcorr`),
+correlations are computed using an equivalent method. All methods are tested
+against one another to check for internal consistency, and checked against results
+computed using Matlab's |normxcorr2| function. These routines also give the same
+results (to within 0.00001) as OpenCV cross-correlations.
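
You can reproduce such a consistency check yourself; the sketch below assumes the array functions share the ``(templates, stream, pads)`` interface shown later on this page and return the correlation array as the first element of their result:

.. code-block:: python

    import numpy as np
    from eqcorrscan.utils.correlate import numpy_normxcorr, time_multi_normxcorr

    templates = np.random.randn(4, 200).astype(np.float32)
    stream = np.random.randn(10000).astype(np.float32)
    pads = [0] * len(templates)

    ncc_freq = numpy_normxcorr(templates, stream, pads)[0]
    ncc_time = time_multi_normxcorr(templates, stream, pads)[0]
    assert np.allclose(ncc_freq, ncc_time, atol=1e-5)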
+
+.. |normxcorr2| raw:: html
+
+ normxcorr2
+
+Selecting a correlation function
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+EQcorrscan strives to use sensible default algorithms for calculating
+correlation values; however, you may want to change how correlations are
+calculated to better suit your specific needs.
+
+
+There are currently three correlation functions included in EQcorrscan:
+
+ 1. :func:`eqcorrscan.utils.correlate.numpy_normxcorr` known as "numpy"
+
+ 2. :func:`eqcorrscan.utils.correlate.time_multi_normxcorr` known as "time_domain"
+
+ 3. :func:`eqcorrscan.utils.correlate.fftw_normxcorr` known as "fftw"
+
+Number 3 ("fftw") is the default.
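
The named functions can also be looked up at runtime with :func:`eqcorrscan.utils.correlate.get_array_xcorr`; a brief usage sketch:

.. code-block:: python

    >>> from eqcorrscan.utils.correlate import get_array_xcorr
    >>> xcorr_func = get_array_xcorr("numpy")  # also "time_domain" or "fftw"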
+
+Switching which correlation function is used
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can switch between correlation functions using the `xcorr_func` parameter
+included in:
+
+- :func:`eqcorrscan.core.match_filter.match_filter`,
+- :meth:`eqcorrscan.core.match_filter.Tribe.detect`,
+- :meth:`eqcorrscan.core.match_filter.Template.detect`
+
+by:
+
+1. passing a string (e.g. "numpy", "time_domain", or "fftw"); or
+2. passing a function.
+
+for example:
+
+.. code-block:: python
+
+ >>> import obspy
+ >>> import numpy as np
+ >>> from eqcorrscan.utils.correlate import numpy_normxcorr, set_xcorr
+ >>> from eqcorrscan.core.match_filter import match_filter
+
+
+ >>> # generate some toy templates and stream
+ >>> random = np.random.RandomState(42)
+ >>> template = obspy.read()
+ >>> stream = obspy.read()
+ >>> for num, tr in enumerate(stream): # iter stream and embed templates
+ ... data = tr.data
+ ... tr.data = random.randn(6000) * 5
+ ... tr.data[100: 100 + len(data)] = data
+
+ >>> # do correlation using numpy rather than fftw
+ >>> detections = match_filter(['1'], [template], stream, .5, 'absolute',
+ ... 1, False, xcorr_func='numpy')
+
+ >>> # do correlation using a custom function
+ >>> def custom_normxcorr(templates, stream, pads, *args, **kwargs):
+ ... # Just to keep example short call other xcorr function
+ ... print('calling custom xcorr function')
+ ... return numpy_normxcorr(templates, stream, pads, *args, **kwargs)
+
+ >>> detections = match_filter(
+ ... ['1'], [template], stream, .5, 'absolute', 1, False,
+ ... xcorr_func=custom_normxcorr) # doctest:+ELLIPSIS
+ calling custom xcorr function...
+
+
+You can also use the set_xcorr object (eqcorrscan.utils.correlate.set_xcorr)
+to change which correlation function is used. This can be done permanently
+or within the scope of a context manager:
+
+.. code-block:: python
+
+ >>> # change the default xcorr function for all code in the with block
+ >>> with set_xcorr(custom_normxcorr):
+ ... detections = match_filter(['1'], [template], stream, .5,
+ ... 'absolute', 1, False) # doctest:+ELLIPSIS
+ calling custom xcorr function...
+
+ >>> # permanently set the xcorr function (until the python kernel restarts)
+ >>> set_xcorr(custom_normxcorr)
+ default changed to custom_normxcorr
+ >>> detections = match_filter(['1'], [template], stream, .5, 'absolute',
+ ... 1, False) # doctest:+ELLIPSIS
+ calling custom xcorr function...
+ >>> set_xcorr.revert() # change it back to the previous state
diff --git a/eqcorrscan/doc/submodules/utils.findpeaks.rst b/eqcorrscan/doc/submodules/utils.findpeaks.rst
index 84a64b216..b00695c68 100644
--- a/eqcorrscan/doc/submodules/utils.findpeaks.rst
+++ b/eqcorrscan/doc/submodules/utils.findpeaks.rst
@@ -14,6 +14,5 @@ findpeaks
coin_trig
find_peaks2_short
- find_peaks_dep
.. comment to end block
diff --git a/eqcorrscan/doc/tutorials/clustering.rst b/eqcorrscan/doc/tutorials/clustering.rst
index b6e1422ca..7ef250a73 100644
--- a/eqcorrscan/doc/tutorials/clustering.rst
+++ b/eqcorrscan/doc/tutorials/clustering.rst
@@ -18,30 +18,26 @@ threshold to 1,000km
.. code-block:: python
- from eqcorrscan.utils.clustering import space_cluster
- from obspy.clients.fdsn import Client
- from obspy import UTCDateTime
- client = Client("IRIS")
- starttime = UTCDateTime("2002-01-01")
- endtime = UTCDateTime("2002-02-01")
- cat = client.get_events(starttime=starttime, endtime=endtime,
- minmagnitude=6, catalog="ISC")
- groups = space_cluster(catalog=cat, d_thresh=1000, show=False)
+ >>> from eqcorrscan.utils.clustering import space_cluster
+ >>> from obspy.clients.fdsn import Client
+ >>> from obspy import UTCDateTime
+
+ >>> client = Client("IRIS")
+ >>> starttime = UTCDateTime("2002-01-01")
+ >>> endtime = UTCDateTime("2002-02-01")
+ >>> cat = client.get_events(starttime=starttime, endtime=endtime,
+ ... minmagnitude=6, catalog="ISC")
+ >>> groups = space_cluster(catalog=cat, d_thresh=1000, show=False)
Download a local catalog of earthquakes and cluster much finer (distance
threshold of 2km).
.. code-block:: python
- from eqcorrscan.utils.clustering import space_cluster
- from obspy.clients.fdsn import Client
- from obspy import UTCDateTime
- client = Client("NCEDC")
- starttime = UTCDateTime("2002-01-01")
- endtime = UTCDateTime("2002-02-01")
- cat = client.get_events(starttime=starttime, endtime=endtime,
- minmagnitude=2)
- groups = space_cluster(catalog=cat, d_thresh=2, show=False)
+ >>> client = Client("NCEDC")
+ >>> cat = client.get_events(starttime=starttime, endtime=endtime,
+ ... minmagnitude=2)
+ >>> groups = space_cluster(catalog=cat, d_thresh=2, show=False)
Setting show to true will plot the dendrogram for grouping with individual
@@ -66,15 +62,11 @@ distance threshold and a one-day temporal limit.
.. code-block:: python
- from eqcorrscan.utils.clustering import space_time_cluster
- from obspy.clients.fdsn import Client
- from obspy import UTCDateTime
- client = Client("IRIS")
- starttime = UTCDateTime("2002-01-01")
- endtime = UTCDateTime("2002-02-01")
- cat = client.get_events(starttime=starttime, endtime=endtime,
- minmagnitude=6, catalog="ISC")
- groups = space_time_cluster(catalog=cat, t_thresh=86400, d_thresh=1000)
+ >>> from eqcorrscan.utils.clustering import space_time_cluster
+ >>> client = Client("IRIS")
+ >>> cat = client.get_events(starttime=starttime, endtime=endtime,
+ ... minmagnitude=6, catalog="ISC")
+ >>> groups = space_time_cluster(catalog=cat, t_thresh=86400, d_thresh=1000)
Cluster according to cross-correlation values
@@ -91,25 +83,32 @@ in the tests directory.
.. code-block:: python
- from obspy import read
- import glob
- import os
- from eqcorrscan.utils.clustering import cluster
- # You will need to edit this line to the location of your eqcorrscan repo.
- testing_path = 'eqcorrscan/tests/test_data/similar_events'
- stream_files = glob.glob(os.path.join(testing_path, '*'))
- stream_list = [(read(stream_file), i)
- for i, stream_file in enumerate(stream_files)]
- for stream in stream_list:
- for tr in stream[0]:
- if tr.stats.station not in ['WHAT2', 'WV04', 'GCSZ']:
- stream[0].remove(tr)
- continue
- tr.detrend('simple')
- tr.filter('bandpass', freqmin=5.0, freqmax=15.0)
- tr.trim(tr.stats.starttime + 40, tr.stats.endtime - 45)
- groups = cluster(template_list=stream_list, show=False,
- corr_thresh=0.3)
+ >>> from obspy import read
+ >>> import glob
+ >>> import os
+ >>> from eqcorrscan.utils.clustering import cluster
+ >>> # You will need to edit this line to the location of your eqcorrscan repo.
+ >>> testing_path = 'eqcorrscan/tests/test_data/similar_events'
+ >>> stream_files = glob.glob(os.path.join(testing_path, '*'))
+ >>> stream_list = [(read(stream_file), i)
+ ... for i, stream_file in enumerate(stream_files)]
+ >>> for stream in stream_list:
+ ... for tr in stream[0]:
+ ... if tr.stats.station not in ['WHAT2', 'WV04', 'GCSZ']:
+ ... stream[0].remove(tr) # doctest:+ELLIPSIS
+ ... continue
+ ... tr = tr.detrend('simple')
+ ... tr = tr.filter('bandpass', freqmin=5.0, freqmax=15.0)
+ ... tr = tr.trim(tr.stats.starttime + 40, tr.stats.endtime - 45)
+
+ >>> groups = cluster(template_list=stream_list, show=False,
+ ... corr_thresh=0.3, cores=2)
+ Computing the distance matrix using 2 cores
+ Computing linkage
+ Clustering
+ Found 9 groups
+ Extracting and grouping
+
Stack waveforms (linear)
------------------------
@@ -122,30 +121,12 @@ The following examples use the test data in the eqcorrscan github repository.
.. code-block:: python
- from obspy import read
- import glob
- import os
- from eqcorrscan.utils.clustering import cluster
- from eqcorrscan.utils.stacking import linstack
- # You will need to edit this line to the location of your eqcorrscan repo.
- testing_path = 'eqcorrscan/tests/test_data/similar_events'
- stream_files = glob.glob(os.path.join(testing_path, '*'))
- stream_list = [(read(stream_file), i)
- for i, stream_file in enumerate(stream_files)]
- for stream in stream_list:
- for tr in stream[0]:
- if tr.stats.station not in ['WHAT2', 'WV04', 'GCSZ']:
- stream[0].remove(tr)
- continue
- tr.detrend('simple')
- tr.filter('bandpass', freqmin=5.0, freqmax=15.0)
- tr.trim(tr.stats.starttime + 40, tr.stats.endtime - 45)
- groups = cluster(template_list=stream_list, show=False,
- corr_thresh=0.3)
- # groups[0] should contain 3 streams, which we can now stack
- # Groups are returned as lists of tuples, of the stream and event index
- group_streams = [st_tuple[0] for st_tuple in groups[0]]
- stack = linstack(streams=group_streams)
+ >>> from eqcorrscan.utils.stacking import linstack
+
+ >>> # groups[0] should contain 3 streams, which we can now stack
+ >>> # Groups are returned as lists of tuples, of the stream and event index
+ >>> group_streams = [st_tuple[0] for st_tuple in groups[0]]
+ >>> stack = linstack(streams=group_streams)
@@ -162,27 +143,10 @@ of the instantaneous phase. In this manner coherent signals are amplified.
.. code-block:: python
- from obspy import read
- import glob
- import os
- from eqcorrscan.utils.clustering import cluster
- from eqcorrscan.utils.stacking import PWS_stack
- # You will need to edit this line to the location of your eqcorrscan repo.
- testing_path = 'eqcorrscan/tests/test_data/similar_events'
- stream_files = glob.glob(os.path.join(testing_path, '*'))
- stream_list = [(read(stream_file), i)
- for i, stream_file in enumerate(stream_files)]
- for stream in stream_list:
- for tr in stream[0]:
- if tr.stats.station not in ['WHAT2', 'WV04', 'GCSZ']:
- stream[0].remove(tr)
- continue
- tr.detrend('simple')
- tr.filter('bandpass', freqmin=5.0, freqmax=15.0)
- tr.trim(tr.stats.starttime + 40, tr.stats.endtime - 45)
- groups = cluster(template_list=stream_list, show=False,
- corr_thresh=0.3)
- # groups[0] should contain 3 streams, which we can now stack
- # Groups are returned as lists of tuples, of the stream and event index
- group_streams = [st_tuple[0] for st_tuple in groups[0]]
- stack = PWS_stack(streams=group_streams)
\ No newline at end of file
+ >>> from eqcorrscan.utils.stacking import PWS_stack
+
+ >>> # groups[0] should contain 3 streams, which we can now stack
+ >>> # Groups are returned as lists of tuples, of the stream and event index
+ >>> stack = PWS_stack(streams=group_streams)
+ Computing instantaneous phase
+ Computing the phase stack
diff --git a/eqcorrscan/doc/tutorials/mag-calc.rst b/eqcorrscan/doc/tutorials/mag-calc.rst
index fe4c9caeb..35567ab27 100644
--- a/eqcorrscan/doc/tutorials/mag-calc.rst
+++ b/eqcorrscan/doc/tutorials/mag-calc.rst
@@ -15,18 +15,19 @@ This example requires data downloaded from the eqcorrscan github repository.
.. code-block:: python
- from eqcorrscan.utils.mag_calc import amp_pick_sfile
- from obspy.core.event import Event
- import os
- testing_path = 'eqcorrscan/tests/test_data'
- sfile = os.path.join(testing_path, 'REA', 'TEST_',
- '01-0411-15L.S201309')
- datapath = os.path.join(testing_path, 'WAV', 'TEST_')
- respdir = testing_path
- event = amp_pick_sfile(sfile=sfile, datapath=datapath,
- respdir=respdir, chans=['Z'], var_wintype=True,
- winlen=0.9, pre_pick=0.2, pre_filt=True,
- lowcut=1.0, highcut=20.0, corners=4)
+ >>> from eqcorrscan.utils.mag_calc import amp_pick_sfile
+ >>> from obspy.core.event import Event
+ >>> import os
+ >>> testing_path = 'eqcorrscan/tests/test_data'
+ >>> sfile = os.path.join(testing_path, 'REA', 'TEST_',
+ ... '01-0411-15L.S201309')
+ >>> datapath = os.path.join(testing_path, 'WAV', 'TEST_')
+ >>> respdir = testing_path
+ >>> event = amp_pick_sfile(sfile=sfile, datapath=datapath,
+ ... respdir=respdir, chans=['Z'], var_wintype=True,
+ ... winlen=0.9, pre_pick=0.2, pre_filt=True,
+ ... lowcut=1.0, highcut=20.0, corners=4) # doctest:+ELLIPSIS
+ Working on ...
Relative moment by singular-value decomposition
-----------------------------------------------
@@ -41,32 +42,33 @@ This example requires data downloaded from the eqcorrscan github repository.
.. code-block:: python
- from eqcorrscan.utils.mag_calc import SVD_moments
- from obspy import read
- import glob
- import os
- from eqcorrscan.utils.clustering import SVD
- import numpy as np
- # Do the set-up
- testing_path = 'eqcorrscan/tests/test_data/similar_events'
- stream_files = glob.glob(os.path.join(testing_path, '*'))
- stream_list = [read(stream_file) for stream_file in stream_files]
- event_list = []
- for i, stream in enumerate(stream_list):
- st_list = []
- for tr in stream:
- # Only use the vertical channels of sites with known high similarity.
- # You do not need to use this step for your data.
- if (tr.stats.station, tr.stats.channel) not in\
- [('WHAT2', 'SH1'), ('WV04', 'SHZ'), ('GCSZ', 'EHZ')]:
- stream.remove(tr)
- continue
- tr.detrend('simple')
- tr.filter('bandpass', freqmin=5.0, freqmax=15.0)
- tr.trim(tr.stats.starttime + 40, tr.stats.endtime - 45)
- st_list.append(i)
- event_list.append(st_list)
- event_list = np.asarray(event_list).T.tolist()
- SVectors, SValues, Uvectors, stachans = SVD(stream_list=stream_list)
- M, events_out = SVD_moments(U=Uvectors, s=SValues, V=SVectors,
- stachans=stachans, event_list=event_list)
\ No newline at end of file
+ >>> from eqcorrscan.utils.mag_calc import svd_moments
+ >>> from obspy import read
+ >>> import glob
+ >>> from eqcorrscan.utils.clustering import svd
+ >>> import numpy as np
+ >>> # Do the set-up
+ >>> testing_path = 'eqcorrscan/tests/test_data/similar_events'
+ >>> stream_files = glob.glob(os.path.join(testing_path, '*'))
+ >>> stream_list = [read(stream_file) for stream_file in stream_files]
+ >>> event_list = []
+ >>> for i, stream in enumerate(stream_list): # doctest:+ELLIPSIS
+ ... st_list = []
+ ... for tr in stream:
+ ... # Only use the vertical channels of sites with known high similarity.
+ ... # You do not need to use this step for your data.
+ ... if (tr.stats.station, tr.stats.channel) not in\
+ ... [('WHAT2', 'SH1'), ('WV04', 'SHZ'), ('GCSZ', 'EHZ')]:
+ ... stream.remove(tr)
+ ... continue
+ ... tr.detrend('simple')
+ ... tr.filter('bandpass', freqmin=5.0, freqmax=15.0)
+ ... tr.trim(tr.stats.starttime + 40, tr.stats.endtime - 45)
+ ... st_list.append(i)
+ ... event_list.append(st_list)
+
+ >>> event_list = np.asarray(event_list).T.tolist()
+ >>> SVectors, SValues, Uvectors, stachans = svd(stream_list=stream_list)
+ >>> M, events_out = svd_moments(u=Uvectors, s=SValues, v=SVectors,
+ ... stachans=stachans, event_list=event_list) # doctest:+ELLIPSIS
+ Created Kernel matrix: ...
\ No newline at end of file
diff --git a/eqcorrscan/doc/tutorials/matched-filter.rst b/eqcorrscan/doc/tutorials/matched-filter.rst
index d4fd1a6ea..26e32d609 100644
--- a/eqcorrscan/doc/tutorials/matched-filter.rst
+++ b/eqcorrscan/doc/tutorials/matched-filter.rst
@@ -64,14 +64,14 @@ event:
.. code-block:: python
- import glob
- from eqcorrscan.core.match_filter import Template
- sac_files = glob.glob('eqcorrscan/tests/test_data/SAC/2014p611252')
- # sac_files is now a list of all the SAC files for event id:2014p611252
- template = Template().construct(
- method='from_sac', name='test', lowcut=2.0, highcut=8.0,
- samp_rate=20.0, filt_order=4, prepick=0.1, swin='all',
- length=2.0, sac_files=sac_files)
+ >>> import glob
+ >>> from eqcorrscan.core.match_filter import Template
+ >>> sac_files = glob.glob('eqcorrscan/tests/test_data/SAC/2014p611252/*')
+ >>> # sac_files is now a list of all the SAC files for event id:2014p611252
+ >>> template = Template().construct(
+ ... method='from_sac', name='test', lowcut=2.0, highcut=8.0,
+ ... samp_rate=20.0, filt_order=4, prepick=0.1, swin='all',
+ ... length=2.0, sac_files=sac_files)
Tribe creation
@@ -84,19 +84,20 @@ to their start-time, but you can rename them later if you wish:
.. code-block:: python
- from eqcorrscan.core.match_filter import Tribe
- from obspy.clients.fdsn import Client
-
- client = Client('NCEDC')
- catalog = client.get_events(eventid='72572665', includearrivals=True)
- # To speed the example we have a catalog of one event, but you can have
- # more, we are also only using the first five picks, again to speed the
- # example.
- catalog[0].picks = catalog[0].picks[0:5]
- tribe = Tribe().construct(
- method='from_client', catalog=catalog, client_id='NCEDC', lowcut=2.0,
- highcut=8.0, samp_rate=20.0, filt_order=4, length=6.0, prepick=0.1,
- swin='all', process_len=3600, all_horiz=True)
+ >>> from eqcorrscan.core.match_filter import Tribe
+ >>> from obspy.clients.fdsn import Client
+
+ >>> client = Client('NCEDC')
+ >>> catalog = client.get_events(eventid='72572665', includearrivals=True)
+ >>> # To speed the example we have a catalog of one event, but you can have
+ >>> # more, we are also only using the first five picks, again to speed the
+ >>> # example.
+ >>> catalog[0].picks = catalog[0].picks[0:5]
+ >>> tribe = Tribe().construct(
+ ... method='from_client', catalog=catalog, client_id='NCEDC', lowcut=2.0,
+ ... highcut=8.0, samp_rate=20.0, filt_order=4, length=6.0, prepick=0.1,
+ ... swin='all', process_len=3600, all_horiz=True)
+ Pre-processing data
Matched-filter detection using a Tribe
--------------------------------------
@@ -115,12 +116,12 @@ data by running the following:
.. code-block:: python
- from obspy import UTCDateTime
+ >>> from obspy import UTCDateTime
- party, stream = tribe.client_detect(
- client=client, starttime=UTCDateTime(2016, 1, 2),
- endtime=UTCDateTime(2016, 1, 3), threshold=8, threshold_type='MAD',
- trig_int=6, plotvar=False, return_stream=True)
+ >>> party, stream = tribe.client_detect(
+ ... client=client, starttime=UTCDateTime(2016, 1, 2),
+ ... endtime=UTCDateTime(2016, 1, 3), threshold=8, threshold_type='MAD',
+ ... trig_int=6, plotvar=False, return_stream=True)
Lag-calc using a Party
----------------------
@@ -130,8 +131,10 @@ generate re-picked catalogues using lag-calc:
.. code-block:: python
- repicked_catalog = party.lag_calc(
- stream, pre_processed=False, shift_len=0.2, min_cc=0.4)
+ >>> stream = stream.merge().sort(['station'])
+ >>> repicked_catalog = party.lag_calc(stream, pre_processed=False,
+ ... shift_len=0.2, min_cc=0.4) # doctest:+ELLIPSIS
+ 5 Trace(s) in Stream:...
By using the above examples you can go from a standard catalog available from
data centers, to a matched-filter detected and cross-correlation repicked
diff --git a/eqcorrscan/doc/tutorials/subspace.rst b/eqcorrscan/doc/tutorials/subspace.rst
index 01d41ba9a..c0130643b 100644
--- a/eqcorrscan/doc/tutorials/subspace.rst
+++ b/eqcorrscan/doc/tutorials/subspace.rst
@@ -44,8 +44,8 @@ To begin with you will need to create a **Detector**:
.. code-block:: python
- from eqcorrscan.core import subspace
- detector = subspace.Detector()
+ >>> from eqcorrscan.core import subspace
+ >>> detector = subspace.Detector()
This will create an empty *detector* object. These objects have various attributes,
including the data to be used as a detector (*detector.data*), alongside the full
@@ -62,9 +62,19 @@ aligned (see clustering submodule for alignment methods).
.. code-block:: python
- detector.construct(streams=streams, lowcut=2, highcut=9, filt_order=4,
- sampling_rate=20, multiplex=True, name='Test_1',
- align=True, shift_len=0.5)
+ >>> from obspy import read
+ >>> import glob
+ >>> wavefiles = glob.glob('eqcorrscan/tests/test_data/similar_events/*')
+ >>> streams = [read(w) for w in wavefiles[0:3]]
+ >>> # Channels must all be the same length
+ >>> for st in streams:
+ ... for tr in st:
+ ... tr.data = tr.data[int(41.5 * tr.stats.sampling_rate):
+ ... int(44 * tr.stats.sampling_rate)]
+ >>> detector.construct(streams=streams, lowcut=2, highcut=9, filt_order=4,
+ ... sampling_rate=20, multiplex=True, name='Test_1',
+ ... align=True, shift_len=0.5, reject=0.2)
+ Detector: Test_1
This will populate all the attributes of your *detector* object, and fill the
*detector.data* with the full input basis vector matrix.
@@ -76,7 +86,8 @@ EQcorrscan simply use the *partition* method:
.. code-block:: python
- detector.partition(4)
+ >>> detector.partition(2)
+ Detector: Test_1
-This will populate *detector.data* with the first four, left-most input basis
+This will populate *detector.data* with the first two, left-most input basis
vectors. You can test to see how much of your original design set is
@@ -84,7 +95,7 @@ described by this detector by using the *energy_capture* method:
.. code-block:: python
- percent_capture = detector.energy_capture()
+ >>> percent_capture = detector.energy_capture()
This will return a percentage capture, you can run this for multiple dimensions
to test what dimension best suits your application. Again, details for this
@@ -99,7 +110,12 @@ True.
.. code-block:: python
- detections = detector.detect(st=stream, threshold=0.5, trig_int=3)
+ >>> stream = read(wavefiles[0])
+ >>> for tr in stream:
+ ... tr.data = tr.data[int(41.5 * tr.stats.sampling_rate):
+ ... int(44 * tr.stats.sampling_rate)]
+ >>> detections = detector.detect(st=stream, threshold=0.5, trig_int=3) # doctest:+ELLIPSIS
+ Detection took ...
Advanced Example
diff --git a/eqcorrscan/doc/tutorials/template-creation.rst b/eqcorrscan/doc/tutorials/template-creation.rst
index 4f6c4a554..efc709291 100644
--- a/eqcorrscan/doc/tutorials/template-creation.rst
+++ b/eqcorrscan/doc/tutorials/template-creation.rst
@@ -1,6 +1,10 @@
Template creation
=================
+Note: These tutorials are for the functional workflow - for details on creating
+Template and Tribe objects see the :doc:`matched filter <matched-filter>` docs
+page.
+
Simple example
--------------
@@ -18,15 +22,16 @@ FDSN (see |obspy_fdsn| for a list of possible clients):
.. code-block:: python
- from obspy.clients.fdsn import Client
- from obspy.core.event import Catalog
- from eqcorrscan.core.template_gen import from_client
- client = Client('NCEDC')
- catalog = client.get_events(eventid='72572665', includearrivals=True)
- templates = from_client(catalog=catalog, client_id='NCEDC',
- lowcut=2.0, highcut=9.0, samp_rate=20.0,
- filt_order=4, length=3.0, prepick=0.15,
- swin='all', process_len=200)
+ >>> from obspy.clients.fdsn import Client
+ >>> from obspy.core.event import Catalog
+ >>> from eqcorrscan.core.template_gen import from_client
+ >>> client = Client('NCEDC')
+ >>> catalog = client.get_events(eventid='72572665', includearrivals=True)
+ >>> templates = from_client(catalog=catalog, client_id='NCEDC',
+ ... lowcut=2.0, highcut=9.0, samp_rate=20.0,
+ ... filt_order=4, length=3.0, prepick=0.15,
+ ... swin='all', process_len=200)
+ Pre-processing data
This will download data for a single event (given by eventid) from the NCEDC
database, then use that information to download relevant waveform data. These
@@ -77,7 +82,7 @@ which is useful for template storage. However we do not constrain you to this.
.. code-block:: python
- template.write('template.ms', format="MSEED")
+ >>> templates[0].write('template.ms', format="MSEED")
Advanced example
diff --git a/eqcorrscan/lib/libutils.def b/eqcorrscan/lib/libutils.def
index d7c03aa58..3efe031e2 100644
--- a/eqcorrscan/lib/libutils.def
+++ b/eqcorrscan/lib/libutils.def
@@ -3,5 +3,7 @@ EXPORTS
normxcorr_fftw
normxcorr_fftw_threaded
normxcorr_time
+ normxcorr_time_threaded
multi_normxcorr_fftw
multi_normxcorr_time
+ multi_normxcorr_time_threaded
diff --git a/eqcorrscan/lib/multi_corr.c b/eqcorrscan/lib/multi_corr.c
index 1d8a2faa9..d36d3f1b9 100644
--- a/eqcorrscan/lib/multi_corr.c
+++ b/eqcorrscan/lib/multi_corr.c
@@ -20,7 +20,18 @@
#include
#include
+#include <string.h>
#include
+#if (defined(_MSC_VER))
+ #if (_MSC_VER < 1800)
+        #include <float.h>
+ #define isnanf(x) _isnanf(x)
+ #endif
+ #define inline __inline
+#endif
+#if (defined(__APPLE__) && !isnanf)
+ #define isnanf isnan
+#endif
#include <fftw3.h>
#if defined(__linux__) || defined(__linux) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
    #include <omp.h>
@@ -30,19 +41,23 @@
#endif
// Prototypes
-int normxcorr_fftw(float*, int, int, float*, int, float*, int);
+int normxcorr_fftw(float*, int, int, float*, int, float*, int, int*, int*);
+
+static inline int set_ncc(int t, int i, int template_len, int image_len, float value, int *used_chans, int *pad_array, float *ncc);
-int normxcorr_fftw_threaded(float*, int, int, float*, int, float*, int);
+int normxcorr_fftw_main(float*, int, int, float*, int, float*, int, double*, double*, double*,
+ fftw_complex*, fftw_complex*, fftw_complex*, fftw_plan, fftw_plan, fftw_plan, int*, int*);
-int normxcorr_time(float*, int, float*, int, float*);
+int normxcorr_fftw_threaded(float*, int, int, float*, int, float*, int, int*, int*);
-int multi_normxcorr_fftw(float*, int, int, int, float*, int, float*, int);
+void free_fftw_arrays(int, double**, double**, double**, fftw_complex**, fftw_complex**, fftw_complex**);
-int multi_normxcorr_time(float*, int, int, float*, int, float*);
+int multi_normxcorr_fftw(float*, int, int, int, float*, int, float*, int, int*, int*);
// Functions
int normxcorr_fftw_threaded(float *templates, int template_len, int n_templates,
- float *image, int image_len, float *ncc, int fft_len){
+ float *image, int image_len, float *ncc, int fft_len,
+ int *used_chans, int *pad_array) {
/*
Purpose: compute frequency domain normalised cross-correlation of real data using fftw
Author: Calum J. Chamberlain
@@ -58,8 +73,8 @@ int normxcorr_fftw_threaded(float *templates, int template_len, int n_templates,
fft_len: Size for fft (n1)
*/
int N2 = fft_len / 2 + 1;
- int i, t, startind;
- double mean, stdev, old_mean, new_samp, old_samp, c, var=0.0, sum=0.0, acceptedDiff = 0.0000001;
+ int i, t, startind, status = 0;
+ double mean, stdev, old_mean, new_samp, old_samp, var=0.0, sum=0.0, acceptedDiff = 0.0000001;
double * norm_sums = (double *) calloc(n_templates, sizeof(double));
double * template_ext = (double *) calloc(fft_len * n_templates, sizeof(double));
double * image_ext = (double *) calloc(fft_len, sizeof(double));
@@ -121,15 +136,12 @@ int normxcorr_fftw_threaded(float *templates, int template_len, int n_templates,
stdev = sqrt(var);
// Used for centering - taking only the valid part of the cross-correlation
startind = template_len - 1;
- for (t = 0; t < n_templates; ++t){
- if (var < acceptedDiff){
- ncc[t * (image_len - template_len + 1)] = 0;
- }
- else {
- c = ((ccc[(t * fft_len) + startind] / (fft_len * n_templates)) - norm_sums[t] * mean) / stdev;
- ncc[t * (image_len - template_len + 1)] = (float) c;
- }
- }
+ if (var >= acceptedDiff) {
+ for (t = 0; t < n_templates; ++t){
+ double c = ((ccc[(t * fft_len) + startind] / (fft_len * n_templates)) - norm_sums[t] * mean) / stdev;
+ status += set_ncc(t, 0, template_len, image_len, (float) c, used_chans, pad_array, ncc);
+ }
+ }
// Center and divide by length to generate scaled convolution
for(i = 1; i < (image_len - template_len + 1); ++i){
// Need to cast to double otherwise we end up with annoying floating
@@ -140,13 +152,10 @@ int normxcorr_fftw_threaded(float *templates, int template_len, int n_templates,
mean = mean + (new_samp - old_samp) / template_len;
var += (new_samp - old_samp) * (new_samp - mean + old_samp - old_mean) / (template_len);
stdev = sqrt(var);
- for (t=0; t < n_templates; ++t){
- if (var > acceptedDiff){
- c = ((ccc[(t * fft_len) + i + startind] / (fft_len * n_templates)) - norm_sums[t] * mean ) / stdev;
- ncc[(t * (image_len - template_len + 1)) + i] = (float) c;
- }
- else{
- ncc[(t * (image_len - template_len + 1)) + i] = 0.0;
+ if (var > acceptedDiff) { // TODO: above is '>=', should they be the same?
+ for (t = 0; t < n_templates; ++t){
+ double c = ((ccc[(t * fft_len) + i + startind] / (fft_len * n_templates)) - norm_sums[t] * mean ) / stdev;
+ status += set_ncc(t, i, template_len, image_len, (float) c, used_chans, pad_array, ncc);
}
}
}
@@ -166,12 +175,13 @@ int normxcorr_fftw_threaded(float *templates, int template_len, int n_templates,
free(template_ext);
free(image_ext);
- return 0;
+ return status;
}
int normxcorr_fftw(float *templates, int template_len, int n_templates,
- float *image, int image_len, float *ncc, int fft_len){
+ float *image, int image_len, float *ncc, int fft_len,
+ int *used_chans, int *pad_array){
/*
Purpose: compute frequency domain normalised cross-correlation of real data using fftw
Author: Calum J. Chamberlain
@@ -185,23 +195,89 @@ int normxcorr_fftw(float *templates, int template_len, int n_templates,
ncc: Output for cross-correlation - should be pointer to memory -
must be n_templates x image_len - template_len + 1
fft_len: Size for fft (n1)
+ Notes:
+ This is a wrapper around `normxcorr_fftw_main`, allocating required memory and plans
+ for that function. We have taken this outside the main function because creating plans
+ is not thread-safe and we want to call the main function from within an OpenMP loop.
*/
+ int status = 0;
int N2 = fft_len / 2 + 1;
- int i, t, startind;
- double mean, stdev, old_mean, new_samp, old_samp, c, var=0.0, sum=0.0, acceptedDiff = 0.0000001;
- double * norm_sums = (double *) calloc(n_templates, sizeof(double));
- double * template_ext = (double *) calloc(fft_len * n_templates, sizeof(double));
- double * image_ext = (double *) calloc(fft_len, sizeof(double));
- double * ccc = (double *) fftw_malloc(sizeof(double) * fft_len * n_templates);
- fftw_complex * outa = (fftw_complex *) fftw_malloc(sizeof(fftw_complex) * N2 * n_templates);
- fftw_complex * outb = (fftw_complex *) fftw_malloc(sizeof(fftw_complex) * N2);
- fftw_complex * out = (fftw_complex *) fftw_malloc(sizeof(fftw_complex) * N2 * n_templates);
+ // All memory allocated with `fftw_malloc` to ensure 16-byte aligned
+ double * template_ext = fftw_alloc_real(fft_len * n_templates);
+ double * image_ext = fftw_alloc_real(fft_len);
+ double * ccc = fftw_alloc_real(fft_len * n_templates);
+ fftw_complex * outa = fftw_alloc_complex(N2 * n_templates);
+ fftw_complex * outb = fftw_alloc_complex(N2);
+ fftw_complex * out = fftw_alloc_complex(N2 * n_templates);
// Plan
-
fftw_plan pa = fftw_plan_dft_r2c_2d(n_templates, fft_len, template_ext, outa, FFTW_ESTIMATE);
fftw_plan pb = fftw_plan_dft_r2c_1d(fft_len, image_ext, outb, FFTW_ESTIMATE);
fftw_plan px = fftw_plan_dft_c2r_2d(n_templates, fft_len, out, ccc, FFTW_ESTIMATE);
+ // Initialise to zero
+ memset(template_ext, 0, fft_len * n_templates * sizeof(double));
+ memset(image_ext, 0, fft_len * sizeof(double));
+
+ // Call the function to do the work
+ status = normxcorr_fftw_main(templates, template_len, n_templates, image, image_len,
+ ncc, fft_len, template_ext, image_ext, ccc, outa, outb, out, pa, pb, px,
+ used_chans, pad_array);
+
+ // free memory and plans
+ fftw_destroy_plan(pa);
+ fftw_destroy_plan(pb);
+ fftw_destroy_plan(px);
+
+ fftw_free(out);
+ fftw_free(outa);
+ fftw_free(outb);
+ fftw_free(ccc);
+ fftw_free(template_ext);
+ fftw_free(image_ext);
+
+ fftw_cleanup();
+ fftw_cleanup_threads();
+
+ return status;
+}
+
+
+int normxcorr_fftw_main(float *templates, int template_len, int n_templates,
+ float *image, int image_len, float *ncc, int fft_len,
+ double *template_ext, double *image_ext, double *ccc,
+ fftw_complex *outa, fftw_complex *outb, fftw_complex *out,
+ fftw_plan pa, fftw_plan pb, fftw_plan px, int *used_chans,
+ int *pad_array) {
+ /*
+ Purpose: compute frequency domain normalised cross-correlation of real data using fftw
+ Author: Calum J. Chamberlain
+ Date: 12/06/2017
+ Args:
+ templates: Template signals
+ template_len: Length of template
+ n_templates: Number of templates (n0)
+ image: Image signal (to scan through)
+ image_len: Length of image
+ ncc: Output for cross-correlation - should be pointer to memory -
+ must be n_templates x image_len - template_len + 1
+ It is assumed that ncc will be initialised to zero before
+ passing into this function
+ fft_len: Size for fft (n1)
+ template_ext: Input FFTW array for template transform (must be allocated)
+ image_ext: Input FFTW array for image transform (must be allocated)
+ ccc: Output FFTW array for reverse transform (must be allocated)
+        outa: Output FFTW array for template transform (must be allocated)
+ outb: Output FFTW array for image transform (must be allocated)
+ out: Input array for reverse transform (must be allocated)
+ pa: Forward plan for templates
+ pb: Forward plan for image
+ px: Reverse plan
+ */
+ int N2 = fft_len / 2 + 1;
+ int i, t, startind, status = 0;
+ double mean, stdev, old_mean, new_samp, old_samp, var=0.0, sum=0.0, acceptedDiff = 0.0000001;
+ double * norm_sums = (double *) calloc(n_templates, sizeof(double));
+
// zero padding - and flip template
for (t = 0; t < n_templates; ++t){
for (i = 0; i < template_len; ++i)
@@ -215,8 +291,8 @@ int normxcorr_fftw(float *templates, int template_len, int n_templates,
image_ext[i] = (double) image[i];
}
// Compute ffts of template and image
- fftw_execute(pa);
- fftw_execute(pb);
+ fftw_execute_dft_r2c(pa, template_ext, outa);
+ fftw_execute_dft_r2c(pb, image_ext, outb);
// Compute dot product
for (t = 0; t < n_templates; ++t){
@@ -227,7 +303,8 @@ int normxcorr_fftw(float *templates, int template_len, int n_templates,
}
}
// Compute inverse fft
- fftw_execute(px);
+ fftw_execute_dft_c2r(px, out, ccc);
+
// Procedures for normalisation
// Compute starting mean, will update this
for (i=0; i < template_len; ++i){
@@ -240,17 +317,17 @@ int normxcorr_fftw(float *templates, int template_len, int n_templates,
var += pow(image[i] - mean, 2) / (template_len);
}
stdev = sqrt(var);
+
+
// Used for centering - taking only the valid part of the cross-correlation
startind = template_len - 1;
- for (t = 0; t < n_templates; ++t){
- if (var < acceptedDiff){
- ncc[t * (image_len - template_len + 1)] = 0;
- }
- else {
- c = ((ccc[(t * fft_len) + startind] / (fft_len * n_templates)) - norm_sums[t] * mean) / stdev;
- ncc[t * (image_len - template_len + 1)] = (float) c;
- }
- }
+ if (var >= acceptedDiff) {
+ for (t = 0; t < n_templates; ++t){
+ double c = ((ccc[(t * fft_len) + startind] / (fft_len * n_templates)) - norm_sums[t] * mean) / stdev;
+ status += set_ncc(t, 0, template_len, image_len, (float) c, used_chans, pad_array, ncc);
+ }
+ }
+
// Center and divide by length to generate scaled convolution
for(i = 1; i < (image_len - template_len + 1); ++i){
// Need to cast to double otherwise we end up with annoying floating
@@ -261,81 +338,223 @@ int normxcorr_fftw(float *templates, int template_len, int n_templates,
mean = mean + (new_samp - old_samp) / template_len;
var += (new_samp - old_samp) * (new_samp - mean + old_samp - old_mean) / (template_len);
stdev = sqrt(var);
- for (t=0; t < n_templates; ++t){
- if (var > acceptedDiff){
- c = ((ccc[(t * fft_len) + i + startind] / (fft_len * n_templates)) - norm_sums[t] * mean ) / stdev;
- ncc[(t * (image_len - template_len + 1)) + i] = (float) c;
- }
- else{
- ncc[(t * (image_len - template_len + 1)) + i] = 0.0;
+ if (var > acceptedDiff) { // TODO: above is '>=', should they be the same?
+ for (t = 0; t < n_templates; ++t){
+ double c = ((ccc[(t * fft_len) + i + startind] / (fft_len * n_templates)) - norm_sums[t] * mean ) / stdev;
+ status += set_ncc(t, i, template_len, image_len, (float) c, used_chans, pad_array, ncc);
}
}
}
// Clean up
- fftw_destroy_plan(pa);
- fftw_destroy_plan(pb);
- fftw_destroy_plan(px);
+ free(norm_sums);
- fftw_free(out);
- fftw_free(outa);
- fftw_free(outb);
- fftw_free(ccc);
+ return status;
+}
- fftw_cleanup();
- fftw_cleanup_threads();
- free(template_ext); free(image_ext); free(norm_sums);
- return 0;
-}
+static inline int set_ncc(int t, int i, int template_len, int image_len, float value, int *used_chans, int *pad_array, float *ncc) {
+ int status = 0;
+ if (used_chans[t] && (i >= pad_array[t])) {
+ size_t ncc_index = t * (image_len - template_len + 1) + i - pad_array[t];
-int normxcorr_time(float *template, int template_len, float *image, int image_len, float *ccc){
- // Time domain cross-correlation - requires zero-mean template and image
- int p, k;
- int steps = image_len - template_len + 1;
- double numerator = 0.0, denom, mean = 0.0;
- double auto_a = 0.0, auto_b = 0.0;
+ if (isnanf(value)) {
+ // set NaNs to zero
+ value = 0.0;
+ }
+ else if (fabsf(value) > 1.01) {
+ // this will raise an exception when we return to Python
+ status = 1;
+ }
+ else if (value > 1.0) {
+ value = 1.0;
+ }
+ else if (value < -1.0) {
+ value = -1.0;
+ }
- for (k=0; k < template_len; ++k){
- mean += image[k];
- }
- mean = mean / template_len;
+ #pragma omp atomic
+ ncc[ncc_index] += value;
+ }
- for(p = 0; p < template_len; ++p){
- auto_a += (double) template[p] * (double) template[p];
- numerator += (double) template[p] * ((double) image[p] - mean);
- auto_b += ((double) image[p] - mean) * ((double) image[p] - mean);
- }
- denom = sqrt(auto_a * auto_b);
- ccc[0] = (float) (numerator / denom);
- for(k = 1; k < steps; ++k){
- mean = mean + (image[k + template_len - 1] - image[k - 1]) / template_len;
- numerator = 0.0;
- auto_b = 0.0;
- for(p = 0; p < template_len; ++p){
- numerator += (double) template[p] * ((double) image[p + k] - mean);
- auto_b += ((double) image[p + k] - mean) * ((double) image[p + k] - mean);
- }
- denom = sqrt(auto_a * auto_b);
- ccc[k] = (float) (numerator / denom);
- }
- return 0;
+ return status;
}
-int multi_normxcorr_fftw(float *templates, int n_templates, int template_len, int n_channels, float *image, int image_len, float *ccc, int fft_len){
- int i, r=0;
+
+void free_fftw_arrays(int size, double **template_ext, double **image_ext, double **ccc,
+ fftw_complex **outa, fftw_complex **outb, fftw_complex **out) {
+ int i;
+
+ /* free memory */
+ for (i = 0; i < size; i++) {
+ fftw_free(template_ext[i]);
+ fftw_free(image_ext[i]);
+ fftw_free(ccc[i]);
+ fftw_free(outa[i]);
+ fftw_free(outb[i]);
+ fftw_free(out[i]);
+ }
+ free(template_ext);
+ free(image_ext);
+ free(ccc);
+ free(outa);
+ free(outb);
+ free(out);
+}
+
+
+int multi_normxcorr_fftw(float *templates, int n_templates, int template_len, int n_channels,
+ float *image, int image_len, float *ncc, int fft_len, int *used_chans, int *pad_array){
+ int i;
+ int r = 0;
+ size_t N2 = (size_t) fft_len / 2 + 1;
+ double **template_ext = NULL;
+ double **image_ext = NULL;
+ double **ccc = NULL;
+ fftw_complex **outa = NULL;
+ fftw_complex **outb = NULL;
+ fftw_complex **out = NULL;
+ fftw_plan pa, pb, px;
+ int num_threads = 1;
+
+
+ #ifdef N_THREADS
+ /* set the number of threads - the minimum of the numbers of channels and threads */
+ num_threads = (N_THREADS > n_channels) ? n_channels : N_THREADS;
+ #endif
+
+ /* allocate memory for all threads here */
+ template_ext = (double**) malloc(num_threads * sizeof(double*));
+ if (template_ext == NULL) {
+ printf("Error allocating template_ext\n");
+ free_fftw_arrays(0, template_ext, image_ext, ccc, outa, outb, out);
+ return -1;
+ }
+ image_ext = (double**) malloc(num_threads * sizeof(double*));
+ if (image_ext == NULL) {
+ printf("Error allocating image_ext\n");
+ free_fftw_arrays(0, template_ext, image_ext, ccc, outa, outb, out);
+ return -1;
+ }
+ ccc = (double**) malloc(num_threads * sizeof(double*));
+ if (ccc == NULL) {
+ printf("Error allocating ccc\n");
+ free_fftw_arrays(0, template_ext, image_ext, ccc, outa, outb, out);
+ return -1;
+ }
+ outa = (fftw_complex**) malloc(num_threads * sizeof(fftw_complex*));
+ if (outa == NULL) {
+ printf("Error allocating outa\n");
+ free_fftw_arrays(0, template_ext, image_ext, ccc, outa, outb, out);
+ return -1;
+ }
+ outb = (fftw_complex**) malloc(num_threads * sizeof(fftw_complex*));
+ if (outb == NULL) {
+ printf("Error allocating outb\n");
+ free_fftw_arrays(0, template_ext, image_ext, ccc, outa, outb, out);
+ return -1;
+ }
+ out = (fftw_complex**) malloc(num_threads * sizeof(fftw_complex*));
+ if (out == NULL) {
+ printf("Error allocating out\n");
+ free_fftw_arrays(0, template_ext, image_ext, ccc, outa, outb, out);
+ return -1;
+ }
+
+ // All memory allocated with `fftw_malloc` to ensure 16-byte aligned.
+ for (i = 0; i < num_threads; i++) {
+ /* initialise all to NULL so that freeing on error works */
+ template_ext[i] = NULL;
+ image_ext[i] = NULL;
+ ccc[i] = NULL;
+ outa[i] = NULL;
+ outb[i] = NULL;
+ out[i] = NULL;
+
+ /* allocate template_ext arrays */
+ template_ext[i] = fftw_alloc_real((size_t) fft_len * n_templates);
+ if (template_ext[i] == NULL) {
+ printf("Error allocating template_ext[%d]\n", i);
+ free_fftw_arrays(i + 1, template_ext, image_ext, ccc, outa, outb, out);
+ return -1;
+ }
+
+ /* allocate image_ext arrays */
+ image_ext[i] = fftw_alloc_real(fft_len);
+ if (image_ext[i] == NULL) {
+ printf("Error allocating image_ext[%d]\n", i);
+ free_fftw_arrays(i + 1, template_ext, image_ext, ccc, outa, outb, out);
+ return -1;
+ }
+
+ /* allocate ccc arrays */
+ ccc[i] = fftw_alloc_real((size_t) fft_len * n_templates);
+ if (ccc[i] == NULL) {
+ printf("Error allocating ccc[%d]\n", i);
+ free_fftw_arrays(i + 1, template_ext, image_ext, ccc, outa, outb, out);
+ return -1;
+ }
+
+ /* allocate outa arrays */
+ outa[i] = fftw_alloc_complex((size_t) N2 * n_templates);
+ if (outa[i] == NULL) {
+ printf("Error allocating outa[%d]\n", i);
+ free_fftw_arrays(i + 1, template_ext, image_ext, ccc, outa, outb, out);
+ return -1;
+ }
+
+ /* allocate outb arrays */
+ outb[i] = fftw_alloc_complex((size_t) N2);
+ if (outb[i] == NULL) {
+ printf("Error allocating outb[%d]\n", i);
+ free_fftw_arrays(i + 1, template_ext, image_ext, ccc, outa, outb, out);
+ return -1;
+ }
+
+ /* allocate out arrays */
+ out[i] = fftw_alloc_complex((size_t) N2 * n_templates);
+ if (out[i] == NULL) {
+ printf("Error allocating out[%d]\n", i);
+ free_fftw_arrays(i + 1, template_ext, image_ext, ccc, outa, outb, out);
+ return -1;
+ }
+ }
+
+ //TODO: touch all arrays - NUMA first touch???
+
+ // Create the plans up front: FFTW plan creation is not thread safe (plan execution is).
+ pa = fftw_plan_dft_r2c_2d(n_templates, fft_len, template_ext[0], outa[0], FFTW_ESTIMATE);
+ pb = fftw_plan_dft_r2c_1d(fft_len, image_ext[0], outb[0], FFTW_ESTIMATE);
+ px = fftw_plan_dft_c2r_2d(n_templates, fft_len, out[0], ccc[0], FFTW_ESTIMATE);
+
+ /* loop over the channels */
+ #pragma omp parallel for reduction(+:r) num_threads(num_threads)
for (i = 0; i < n_channels; ++i){
- r = normxcorr_fftw_threaded(&templates[n_templates * template_len * i], template_len,
- n_templates, &image[image_len * i], image_len,
- &ccc[(image_len - template_len + 1) * n_templates * i], fft_len);
+ int tid = 0; /* each thread has its own workspace */
+
+ #ifdef N_THREADS
+ /* get the id of this thread */
+ tid = omp_get_thread_num();
+ #endif
+
+ /* initialise memory to zero */
+ memset(template_ext[tid], 0, (size_t) fft_len * n_templates * sizeof(double));
+ memset(image_ext[tid], 0, (size_t) fft_len * sizeof(double));
+
+ /* call the routine */
+ r += normxcorr_fftw_main(&templates[(size_t) n_templates * template_len * i], template_len,
+ n_templates, &image[(size_t) image_len * i], image_len, ncc, fft_len,
+ template_ext[tid], image_ext[tid], ccc[tid], outa[tid], outb[tid], out[tid],
+ pa, pb, px, &used_chans[(size_t) i * n_templates],
+ &pad_array[(size_t) i * n_templates]);
}
+
+ /* free fftw memory */
+ free_fftw_arrays(num_threads, template_ext, image_ext, ccc, outa, outb, out);
+ fftw_destroy_plan(pa);
+ fftw_destroy_plan(pb);
+ fftw_destroy_plan(px);
+ fftw_cleanup();
+
return r;
}
-
-int multi_normxcorr_time(float *templates, int template_len, int n_templates, float *image, int image_len, float *ccc){
- int i;
- for (i = 0; i < n_templates; ++i){
- normxcorr_time(&templates[template_len * i], template_len, image, image_len, &ccc[(image_len - template_len + 1) * i]);
- }
- return 0;
-}
\ No newline at end of file
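For orientation, the following is a minimal numpy sketch of the single-channel, frequency-domain normalised cross-correlation that normxcorr_fftw_main computes for each channel above; the C routine additionally shares FFTW plans across OpenMP threads, each thread getting a private workspace. All names below are illustrative, not part of the package:

    import numpy as np

    def freq_normxcorr(templates, image):
        """Frequency-domain normalised cross-correlation for one channel.

        templates: (n_templates, template_len) array; image: 1-D array.
        Returns (n_templates, image_len - template_len + 1) coefficients.
        """
        n_templates, template_len = templates.shape
        steps = image.shape[0] - template_len + 1
        fft_len = template_len + image.shape[0] - 1  # no fast-length padding
        # Zero-mean the templates: the numerator then reduces to a plain
        # correlation of template against image.
        t = templates - templates.mean(axis=1, keepdims=True)
        # Correlation theorem: irfft(conj(rfft(t)) * rfft(image)).
        num = np.fft.irfft(
            np.conj(np.fft.rfft(t, fft_len, axis=1)) *
            np.fft.rfft(image, fft_len), fft_len, axis=1)[:, :steps]
        # Rolling sums give the variance of each template-length window.
        csum = np.cumsum(np.concatenate(([0.0], image)))
        csum2 = np.cumsum(np.concatenate(([0.0], image ** 2)))
        win_sum = csum[template_len:] - csum[:-template_len]
        win_var = np.maximum(
            csum2[template_len:] - csum2[:-template_len] -
            win_sum ** 2 / template_len, 0.0)
        denom = np.sqrt(win_var) * np.sqrt((t ** 2).sum(axis=1, keepdims=True))
        denom[denom == 0] = np.inf  # flat windows correlate to zero
        return num / denom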
diff --git a/eqcorrscan/lib/time_corr.c b/eqcorrscan/lib/time_corr.c
new file mode 100644
index 000000000..ad15adb17
--- /dev/null
+++ b/eqcorrscan/lib/time_corr.c
@@ -0,0 +1,120 @@
+/*
+ * =====================================================================================
+ *
+ * Filename: time_corr.c
+ *
+ * Purpose: Routines for computing cross-correlations in the time-domain
+ *
+ * Created: 03/07/17 02:25:07
+ * Revision: none
+ * Compiler: gcc
+ *
+ * Author: Calum Chamberlain,
+ * Organization: EQcorrscan
+ * Copyright: EQcorrscan developers.
+ * License: GNU Lesser General Public License, Version 3
+ * (https://www.gnu.org/copyleft/lesser.html)
+ *
+ * =====================================================================================
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#if defined(__linux__) || defined(__linux) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+ #include <omp.h>
+ #ifndef N_THREADS
+ #define N_THREADS omp_get_max_threads()
+ #endif
+#endif
+
+int normxcorr_time_threaded(float*, int, float*, int, float*, int);
+
+int normxcorr_time(float*, int, float*, int, float*);
+
+int multi_normxcorr_time(float*, int, int, float*, int, float*);
+
+int multi_normxcorr_time_threaded(float*, int, int, float*, int, float*, int);
+
+int normxcorr_time_threaded(float *template, int template_len, float *image, int image_len, float *ccc, int num_threads){
+ // Time domain cross-correlation - requires zero-mean template
+ int p, k;
+ int steps = image_len - template_len + 1;
+ double * mean = (double *) calloc(steps, sizeof(double));
+ double numerator = 0.0, denom;
+ double auto_a = 0.0, auto_b = 0.0;
+
+ mean[0] = 0;
+ for (k=0; k < template_len; ++k){
+ mean[0] += image[k];
+ }
+ mean[0] = mean[0] / template_len;
+
+ for(k = 1; k < steps; ++k){
+ mean[k] = mean[k - 1] + (image[k + template_len - 1] - image[k - 1]) / template_len;
+ }
+ for(p = 0; p < template_len; ++p){
+ auto_a += (double) template[p] * (double) template[p];
+ }
+ #pragma omp parallel for private(numerator, denom, auto_b, p) num_threads(num_threads)
+ for(k = 0; k < steps; ++k){
+ numerator = 0.0;
+ auto_b = 0.0;
+ for(p = 0; p < template_len; ++p){
+ numerator += (double) template[p] * ((double) image[p + k] - mean[k]);
+ auto_b += ((double) image[p + k] - mean[k]) * ((double) image[p + k] - mean[k]);
+ }
+ denom = sqrt(auto_a * auto_b);
+ ccc[k] = (float) (numerator / denom);
+ }
+ free(mean);
+ return 0;
+}
+
+int normxcorr_time(float *template, int template_len, float *image, int image_len, float *ccc){
+ // Time domain cross-correlation - requires zero-mean template
+ int p, k;
+ int steps = image_len - template_len + 1;
+ double * mean = (double *) calloc(steps, sizeof(double));
+ double numerator = 0.0, denom;
+ double auto_a = 0.0, auto_b = 0.0;
+
+ mean[0] = 0;
+ for (k=0; k < template_len; ++k){
+ mean[0] += image[k];
+ }
+ mean[0] = mean[0] / template_len;
+
+ for(k = 1; k < steps; ++k){
+ mean[k] = mean[k - 1] + (image[k + template_len - 1] - image[k - 1]) / template_len;
+ }
+ for(p = 0; p < template_len; ++p){
+ auto_a += (double) template[p] * (double) template[p];
+ }
+ for(k = 0; k < steps; ++k){
+ numerator = 0.0;
+ auto_b = 0.0;
+ for(p = 0; p < template_len; ++p){
+ numerator += (double) template[p] * ((double) image[p + k] - mean[k]);
+ auto_b += ((double) image[p + k] - mean[k]) * ((double) image[p + k] - mean[k]);
+ }
+ denom = sqrt(auto_a * auto_b);
+ ccc[k] = (float) (numerator / denom);
+ }
+ free(mean);
+ return 0;
+}
+
+int multi_normxcorr_time(float *templates, int template_len, int n_templates, float *image, int image_len, float *ccc){
+ int i;
+ for (i = 0; i < n_templates; ++i){
+ normxcorr_time(&templates[template_len * i], template_len, image, image_len, &ccc[(image_len - template_len + 1) * i]);
+ }
+ return 0;
+}
+
+int multi_normxcorr_time_threaded(float *templates, int template_len, int n_templates, float *image, int image_len, float *ccc, int num_threads){
+ int i;
+ for (i = 0; i < n_templates; ++i){
+ normxcorr_time_threaded(&templates[template_len * i], template_len, image, image_len, &ccc[(image_len - template_len + 1) * i], num_threads);
+ }
+ return 0;
+}
\ No newline at end of file
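Read alongside the C above, this is a near line-for-line Python transliteration of normxcorr_time (illustrative only, and assuming a zero-mean template as the C comment requires). The window mean of the image is updated recursively, so each step only needs two inner products:

    import numpy as np

    def time_normxcorr(template, image):
        """Time-domain normalised cross-correlation; template is zero-mean."""
        template_len = len(template)
        steps = len(image) - template_len + 1
        ccc = np.empty(steps, dtype=np.float32)
        # Rolling mean of the image, updated recursively as in the C code.
        mean = np.empty(steps)
        mean[0] = image[:template_len].mean()
        for k in range(1, steps):
            mean[k] = mean[k - 1] + (
                image[k + template_len - 1] - image[k - 1]) / template_len
        auto_a = np.dot(template, template)  # template energy, fixed
        for k in range(steps):
            window = image[k:k + template_len] - mean[k]
            ccc[k] = np.dot(template, window) / np.sqrt(
                auto_a * np.dot(window, window))
        return ccc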
diff --git a/eqcorrscan/tests/catalog_to_dd_test.py b/eqcorrscan/tests/catalog_to_dd_test.py
index 1c2a79b58..4d9670a32 100644
--- a/eqcorrscan/tests/catalog_to_dd_test.py
+++ b/eqcorrscan/tests/catalog_to_dd_test.py
@@ -268,16 +268,6 @@ def setUpClass(cls):
max_sep=cls.maximum_separation,
min_link=cls.minimum_links)
- # @classmethod
- # def tearDownClass(cls):
- # os.remove('phase.dat')
- # os.remove('dt.ct')
- # if os.path.isfile('dt.ct2'):
- # os.remove('dt.ct2')
- # os.remove('dt.cc')
- # if os.path.isfile('dt.cc2'):
- # os.remove('dt.cc2')
-
def test_write_catalog(self):
"""
Simple testing function for the write_catalogue function in \
diff --git a/eqcorrscan/tests/catalog_utils_test.py b/eqcorrscan/tests/catalog_utils_test.py
index e6824a1f6..491b7788a 100644
--- a/eqcorrscan/tests/catalog_utils_test.py
+++ b/eqcorrscan/tests/catalog_utils_test.py
@@ -8,13 +8,16 @@
from __future__ import unicode_literals
import unittest
+import pytest
from obspy.clients.fdsn import Client
from obspy import UTCDateTime
from eqcorrscan.utils.catalog_utils import filter_picks
+@pytest.mark.network
class CatalogUtilsTests(unittest.TestCase):
+ @pytest.mark.flaky(reruns=2) # Rerun the test in case of network timeout
def test_filter_picks(self):
""" Test various methods of filtering picks in a catalog."""
client = Client(str("NCEDC"))
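The network and flaky markers applied here follow a pattern used throughout this patch; a minimal sketch of how the pieces fit together (the conftest.py registration below is an assumption for illustration, not part of this diff):

    # conftest.py -- registering the marker silences unknown-marker warnings
    def pytest_configure(config):
        config.addinivalue_line(
            "markers", "network: test requires internet access")

    # a test module
    import pytest

    @pytest.mark.network          # deselect with: py.test -m "not network"
    @pytest.mark.flaky(reruns=2)  # pytest-rerunfailures retries on failure
    def test_needs_server():
        pass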
diff --git a/eqcorrscan/tests/clustering_test.py b/eqcorrscan/tests/clustering_test.py
index 6d8304c8a..59b334077 100644
--- a/eqcorrscan/tests/clustering_test.py
+++ b/eqcorrscan/tests/clustering_test.py
@@ -7,6 +7,7 @@
from __future__ import unicode_literals
import unittest
+import pytest
import os
import glob
import warnings
@@ -25,9 +26,11 @@
from eqcorrscan.utils.clustering import space_cluster, space_time_cluster
+@pytest.mark.network
class ClusteringTests(unittest.TestCase):
"""Main testing routines"""
@classmethod
+ @pytest.mark.flaky(reruns=2)
def setUpClass(cls):
cls.st1 = read()
cls.st2 = cls.st1.copy()
@@ -157,9 +160,11 @@ def test_space_time_cluster(self):
len(self.cat))
+@pytest.mark.network
class ClusteringTestWarnings(unittest.TestCase):
"""Main testing routines"""
@classmethod
+ @pytest.mark.flaky(reruns=2)
def setUpClass(cls):
cls.st1 = read()
cls.st2 = cls.st1.copy()
diff --git a/eqcorrscan/tests/correlate_test.py b/eqcorrscan/tests/correlate_test.py
index 6a4607f15..267517545 100644
--- a/eqcorrscan/tests/correlate_test.py
+++ b/eqcorrscan/tests/correlate_test.py
@@ -2,112 +2,415 @@
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
-import unittest
-import numpy as np
+
+import copy
+import itertools
import time
+from collections import defaultdict
+from functools import wraps
+import numpy as np
+import pytest
from obspy import Trace, Stream
-from eqcorrscan.utils.correlate import numpy_normxcorr, fftw_normxcorr, \
- time_multi_normxcorr, multichannel_normxcorr
-
-
-class CorrelateTests(unittest.TestCase):
- def test_same_various_methods(self):
- templates = np.random.randn(200, 200)
- stream = np.random.randn(10000)
- stream *= stream ** 5
- pads = np.zeros(templates.shape[0], dtype=int)
- tic = time.time()
- numpy_ccc, no_chans = numpy_normxcorr(templates, stream, pads)
- toc = time.time()
- print('Scipy took: %f seconds' % (toc-tic))
- tic = time.time()
- fftw_ccc, no_chans = fftw_normxcorr(
- templates, stream, pads, threaded=False)
- toc = time.time()
- print('FFTW took: %f seconds' % (toc-tic))
- tic = time.time()
- fftw_ccc_2d, no_chans = fftw_normxcorr(
- templates, stream, pads, threaded=True)
- toc = time.time()
- print('FFTW-threaded took: %f seconds' % (toc-tic))
- tic = time.time()
- time_ccc, no_chans = time_multi_normxcorr(templates, stream, pads)
- toc = time.time()
- print('Time-domain took: %f seconds' % (toc-tic))
- self.assertTrue(np.allclose(numpy_ccc, fftw_ccc, atol=0.001))
- self.assertTrue(np.allclose(numpy_ccc, fftw_ccc_2d, atol=0.001))
- self.assertTrue(np.allclose(numpy_ccc, time_ccc, atol=0.001))
- self.assertTrue(np.allclose(fftw_ccc, time_ccc, atol=0.001))
-
- def test_multi_channel_xcorr(self):
- chans = ['EHZ', 'EHN', 'EHE']
- stas = ['COVA', 'FOZ', 'LARB', 'GOVA', 'MTFO', 'MTBA']
- n_templates = 20
- stream_len = 100000
- template_len = 200
- templates = []
- stream = Stream()
+import eqcorrscan.utils.correlate as corr
+from eqcorrscan.utils.correlate import register_array_xcorr
+
+# set seed state for consistent arrays
+random = np.random.RandomState(7)
+
+
+# -------------------------- helper functions
+
+
+def gen_xcorr_func(name):
+ """ return an xcorr function with desired name """
+
+ def func(templates, stream, pads, *args, **kwargs):
+ pass
+
+ func.__name__ = str(name)
+ return func
+
+
+def time_func(func, name, *args, **kwargs):
+ """ call a func with args and kwargs, print name of func and how
+ long it took. """
+ tic = time.time()
+ out = func(*args, **kwargs)
+ toc = time.time()
+ print('%s took %0.2f seconds' % (name, toc - tic))
+ return out
+
+
+def measure_counts(self, func):
+ """ decorator for counter how often func get called """
+
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ self.counter[func.__name__] += 1
+ return func(*args, **kwargs)
+
+ return wrapper
+
+
+# generate streams for templates and search space
+
+chans = ['EHZ', 'EHN', 'EHE']
+stas = ['COVA', 'FOZ', 'LARB', 'GOVA', 'MTFO', 'MTBA']
+n_templates = 20
+stream_len = 100000
+template_len = 200
+
+
+def generate_multichannel_stream():
+ stream = Stream()
+ for station in stas:
+ for channel in chans:
+ stream += Trace(data=random.randn(stream_len))
+ stream[-1].stats.channel = channel
+ stream[-1].stats.station = station
+ return stream
+
+
+def generate_multichannel_templates():
+ templates = []
+ for i in range(n_templates):
+ template = Stream()
for station in stas:
for channel in chans:
- stream += Trace(data=np.random.randn(stream_len))
- stream[-1].stats.channel = channel
- stream[-1].stats.station = station
- for i in range(n_templates):
- template = Stream()
- for station in stas:
- for channel in chans:
- template += Trace(data=np.random.randn(template_len))
- template[-1].stats.channel = channel
- template[-1].stats.station = station
- templates.append(template)
- # print("Running time serial")
- # tic = time.time()
- # cccsums_t_s, no_chans, chans = multichannel_normxcorr(
- # templates=templates, stream=stream, cores=None, time_domain=True)
- # toc = time.time()
- # print('Time-domain in serial took: %f seconds' % (toc-tic))
- print("Running time parallel")
- tic = time.time()
- cccsums_t_p, no_chans, chans = multichannel_normxcorr(
- templates=templates, stream=stream, cores=4, time_domain=True)
- toc = time.time()
- print('Time-domain in parallel took: %f seconds' % (toc-tic))
- print("Running frequency serial")
- tic = time.time()
- cccsums_f_s, no_chans, chans = multichannel_normxcorr(
- templates=templates, stream=stream, cores=None, time_domain=False)
- toc = time.time()
- print('Frequency-domain in serial took: %f seconds' % (toc-tic))
- print("Running frequency parallel")
- tic = time.time()
- cccsums_f_p, no_chans, chans = multichannel_normxcorr(
- templates=templates, stream=stream, cores=4, time_domain=False)
- toc = time.time()
- print('Frequency-domain in parallel took: %f seconds' % (toc-tic))
- print("Running frequency openmp parallel")
- tic = time.time()
- cccsums_f_op, no_chans, chans = multichannel_normxcorr(
- templates=templates, stream=stream, cores=2, time_domain=False,
- openmp=True)
- toc = time.time()
- print('Frequency-domain in parallel took: %f seconds' % (toc-tic))
- print("Finished")
- # self.assertTrue(np.allclose(cccsums_t_s, cccsums_t_p, atol=0.00001))
- self.assertTrue(np.allclose(cccsums_f_s, cccsums_f_p, atol=0.00001))
- self.assertTrue(np.allclose(cccsums_f_s, cccsums_f_op, atol=0.00001))
- if not np.allclose(cccsums_t_p, cccsums_f_s, atol=0.00001):
- import matplotlib.pyplot as plt
- plt.plot(cccsums_t_p[0], 'k', label='Time')
- plt.plot(cccsums_f_s[0], 'r', label='Frequency')
- plt.legend()
- plt.show()
- self.assertTrue(np.allclose(cccsums_t_p, cccsums_f_s, atol=0.00001))
-
-
-if __name__ == '__main__':
+ template += Trace(data=random.randn(template_len))
+ template[-1].stats.channel = channel
+ template[-1].stats.station = station
+ templates.append(template)
+ return templates
+
+
+# ----------------------------- module fixtures
+
+
+# auto run fixtures
+
+@pytest.fixture(scope='class', autouse=True)
+def print_class_name(request):
+ """ prints the class name before a class test suite starts """
+ try:
+ cls_name = request.cls.__name__
+ except AttributeError: # test does not belong to a class
+ return
+ else:
+ dash = '-' * 70
+ print('\nstarting tests contained in class %s\n%s' % (cls_name, dash))
+
+
+# array fixtures
+
+starting_index = 500
+
+
+@pytest.fixture(scope='module')
+def array_template():
+ """
+ Return a set of templates, generated with random.randn, for correlation tests.
+ """
+ return random.randn(200, 200)
+
+
+@pytest.fixture(scope='module')
+def array_stream(array_template):
"""
- Run core tests
+ Return a stream generated with random.randn for testing normxcorr functions.
"""
- unittest.main()
+ stream = random.randn(10000) * 5
+ # insert a template into the stream so cc == 1 at some place
+ ar = array_template[0]
+ stream[starting_index: starting_index + len(ar)] = ar
+ return stream
+
+
+@pytest.fixture(scope='module')
+def pads(array_template):
+ """
+ Return an array of zeros for padding, one entry per template.
+ """
+ return np.zeros(array_template.shape[0], dtype=int)
+
+
+@pytest.fixture(scope='module')
+def array_ccs(array_template, array_stream, pads):
+ """ use each function stored in the normcorr cache to correlate the
+ templates and arrays, return a dict with keys as func names and values
+ as the cc calculated by said function"""
+ out = {}
+
+ for name in list(corr.XCORR_FUNCS_ORIGINAL.keys()):
+ func = corr.get_array_xcorr(name)
+ cc, _ = time_func(func, name, array_template, array_stream, pads)
+ out[name] = cc
+ return out
+
+
+# stream fixtures
+
+
+@pytest.fixture(scope='module')
+def multichannel_templates():
+ """ create multichannel templates """
+ return generate_multichannel_templates()
+
+
+@pytest.fixture(scope='module')
+def multichannel_stream():
+ """ create a multichannel stream for tests """
+ return generate_multichannel_stream()
+
+
+# a dict of all registered stream functions (this is a bit long)
+stream_funcs = {fname + '_' + mname: corr.get_stream_xcorr(fname, mname)
+ for fname in corr.XCORR_FUNCS_ORIGINAL.keys()
+ for mname in corr.XCORR_STREAM_METHODS
+ if fname != 'default'}
+
+
+@pytest.fixture(scope='module')
+def stream_cc_output_dict(multichannel_templates, multichannel_stream):
+ """ return a dict of outputs from all stream_xcorr functions """
+ out = {}
+ for name, func in stream_funcs.items():
+ cc_out = time_func(func, name, multichannel_templates,
+ multichannel_stream, cores=1)
+ out[name] = cc_out
+ return out
+
+
+@pytest.fixture(scope='module')
+def stream_cc_dict(stream_cc_output_dict):
+ """ return just the cc arrays from the stream_cc functions """
+ return {name: result[0] for name, result in stream_cc_output_dict.items()}
+
+
+# ----------------------------------- tests
+
+
+class TestArrayCorrelateFunctions:
+ """ these tests ensure the various implementations of normxcorr return
+ approximately the same answers """
+ atol = .00001 # how close correlations have to be
+
+ # tests
+ def test_single_channel_similar(self, array_ccs):
+ """ ensure each of the correlation methods return similar answers
+ given the same input data """
+ cc_list = list(array_ccs.values())
+ for cc1, cc2 in itertools.combinations(cc_list, 2):
+ assert np.allclose(cc1, cc2, atol=self.atol)
+
+ def test_autocorrelation(self, array_ccs):
+ """ ensure an auto correlation occurred in each of the ccs where it is
+ expected, defined by the starting_index variable """
+ for name, cc in array_ccs.items():
+ assert np.isclose(cc[0, starting_index], 1., atol=self.atol)
+
+
+@pytest.mark.serial
+class TestStreamCorrelateFunctions:
+ """ same thing as TestArrayCorrelateFunction but for stream interface """
+ atol = TestArrayCorrelateFunctions.atol
+
+ def test_multi_channel_xcorr(self, stream_cc_dict):
+ """ test various correlation methods with multiple channels """
+ # get correlation results into a list
+ cc_names = list(stream_cc_dict.keys())
+ cc_list = [stream_cc_dict[cc_name] for cc_name in cc_names]
+ cc_1 = cc_list[0]
+ # loop over correlations and compare each with the first in the list
+ # this will ensure all cc are "close enough"
+ for cc_name, cc in zip(cc_names[1:], cc_list[1:]):
+ assert np.allclose(cc_1, cc, atol=self.atol)
+
+
+class TestXcorrContextManager:
+
+ @pytest.fixture
+ def cache(self):
+ """ this fixtures resets the class level cache after every test """
+ yield copy.deepcopy(corr.XCOR_FUNCS)
+
+ @pytest.fixture
+ def set_xcorr(self, cache):
+ return corr._Context(cache, 'default')
+
+ @pytest.fixture
+ def set_value(self, set_xcorr):
+ """ set a value in a 'permanent' fashion, return the set function """
+
+ def func(templates, stream, pads):
+ pass
+
+ set_xcorr(func)
+ return func
+
+ # tests
+ def test_value_was_set(self, set_value, cache):
+ assert cache['default'] == set_value
+
+ def test_context_manager(self, cache):
+ """ ensure the context manager reverts changes """
+ context = corr._Context(cache, 'default')
+ old_default = cache['default']
+ new_val = corr.numpy_normxcorr
+ with context(new_val):
+ assert cache['default'] == new_val
+ assert cache['default'] == old_default
+
+ def test_str_accepted(self):
+ """ ensure a str of the xcorr function can be passed as well """
+ with corr.set_xcorr('numpy'):
+ func = corr.get_array_xcorr()
+ assert func is corr.numpy_normxcorr
+
+
+class TestGenericStreamXcorr:
+ """ tests for stream_xocrr function """
+
+ # tests
+ def test_noargs_returns_default(self):
+ """ ensure passing no args to get_stream_xcorr returns default """
+ func = corr.get_stream_xcorr()
+ default = corr.XCOR_FUNCS['default'].stream_xcorr
+ assert func is default
+
+ def test_callable_registered(self, multichannel_templates,
+ multichannel_stream):
+ """ ensure a callable can be registered """
+ small_count = {}
+
+ def some_callable(template_array, stream_array, pad_array):
+ small_count['name'] = 1
+ return corr.numpy_normxcorr(template_array, stream_array,
+ pad_array)
+
+ func = corr.get_stream_xcorr(some_callable)
+ func(multichannel_templates, multichannel_stream)
+ assert 'name' in small_count
+
+ def test_bad_concurrency_raises(self):
+ """ ensure passing an invalid concurrency argument raises a
+ ValueError"""
+ with pytest.raises(ValueError):
+ corr.get_stream_xcorr(concurrency='node.js')
+
+ def test_loading_unregistered_function_registers(self):
+ """ ensure if a function in cache hasn't been decoratored it gets
+ decorated when returned """
+
+ def func(templates, streams, pads):
+ pass
+
+ corr.XCOR_FUNCS['func_test'] = func
+ corr.get_stream_xcorr('func_test')
+ assert hasattr(corr.XCOR_FUNCS['func_test'], 'registered')
+
+ def test_using_custom_function_doesnt_change_default(self):
+ """ ensure a custom function will not change the default """
+
+ def func(templates, streams, pads):
+ pass
+
+ default = corr.get_array_xcorr(None)
+
+ corr.get_array_xcorr(func)
+
+ assert corr.get_array_xcorr(None) is default
+
+
+class TestRegisterNormXcorrs:
+ """ Tests for register_normxcorr function, which holds global context
+ for which xcorr to use """
+
+ # helper functions
+ def name_func_is_registered(self, func_name):
+ """ return True if func is registered as a normxcorr func """
+ # Note: do not remove this helper or bad things will happen
+ name = func_name.__name__ if callable(func_name) else func_name
+ return name in corr.XCOR_FUNCS
+
+ # fixtures
+ @pytest.fixture(scope='class', autouse=True)
+ def swap_registery(self):
+ """ copy the current registry, restore it when tests finish"""
+ current = copy.deepcopy(corr.XCOR_FUNCS)
+ yield
+ corr.XCOR_FUNCS = current
+
+ # tests
+ def test_register_as_decorator_no_args(self):
+ """ ensure register_normxcorr works as a decorator with no args """
+
+ @register_array_xcorr
+ def func1(templates, stream, pads, *args, **kwargs):
+ pass
+
+ assert self.name_func_is_registered(func1)
+
+ def test_register_as_decorator_with_args(self):
+ """ ensure register can be used as a decorator with args """
+
+ @register_array_xcorr(name='func2')
+ def func(templates, stream, pads, *args, **kwargs):
+ pass
+
+ assert self.name_func_is_registered('func2')
+
+ def test_register_as_callable(self):
+ """ ensure register can be used as a callable to take a name
+ and a normxcorr func """
+ func = gen_xcorr_func('funky')
+ register_array_xcorr(name='func3', func=func)
+ assert self.name_func_is_registered('func3')
+
+ def test_set_default(self):
+ """ ensure the default can be overwritten """
+ func = gen_xcorr_func('funky')
+ corr.register_array_xcorr(func, is_default=True)
+ assert corr.XCOR_FUNCS['default'] is func
+
+ def test_register_bad_func_raises(self):
+ """ test trying to register a non-supported function raises """
+ func = corr.XCOR_FUNCS['default']
+
+ with pytest.raises(ValueError):
+ @func.register('not_supported_value')
+ def func():
+ pass
+
+
+class TestRegisterAlternativeConcurrency:
+ """ Tests for registering alternative concurrency functions """
+ counter = defaultdict(lambda: 0)
+
+ # helper functions
+ def new_multi(self, template_dict, stream_dict, pad_dict, seed_ids):
+ pass
+
+ # fixtures
+ @pytest.fixture
+ def r_normxcorr(self):
+ """ return a registered normxcorr function """
+ return register_array_xcorr(gen_xcorr_func('normxcorr'))
+
+ @pytest.fixture
+ def normxcorr_new_multithread(self, r_normxcorr):
+ """ register the new multithread method """
+ func = measure_counts(self, self.new_multi)
+ r_normxcorr.register('multithread')(func)
+ r_normxcorr.multithread(None, None, None, None)
+ yield func.__name__
+ self.counter.pop(func.__name__)
+
+ # tests
+ def test_new_method_was_called(self, normxcorr_new_multithread):
+ """ ensure the new method was called """
+ assert self.counter[normxcorr_new_multithread]
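Condensed from the tests above, the registration workflow they exercise looks roughly like this (a sketch against the API added in eqcorrscan/utils/correlate.py later in this patch; my_xcorr is a made-up name):

    import numpy as np
    from eqcorrscan.utils import correlate as corr
    from eqcorrscan.utils.correlate import register_array_xcorr

    @register_array_xcorr(name='my_xcorr')
    def my_xcorr(templates, stream, pads, *args, **kwargs):
        # delegate to the numpy implementation for this sketch
        return corr.numpy_normxcorr(templates, stream, pads)

    # fetch it back by name with the array interface ...
    func = corr.get_array_xcorr('my_xcorr')
    cc, used_chans = func(np.random.randn(3, 50), np.random.randn(1000),
                          np.zeros(3, dtype=int))
    # ... or wrapped in any of the registered stream interfaces
    stream_func = corr.get_stream_xcorr('my_xcorr', 'multithread')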
diff --git a/eqcorrscan/tests/mag_calc_test.py b/eqcorrscan/tests/mag_calc_test.py
index 5980fbd73..90acda73e 100644
--- a/eqcorrscan/tests/mag_calc_test.py
+++ b/eqcorrscan/tests/mag_calc_test.py
@@ -7,6 +7,7 @@
from __future__ import unicode_literals
import unittest
+import pytest
import numpy as np
import os
import datetime as dt
@@ -44,16 +45,16 @@ def test_dist_calc(self):
self.assertEqual(round(dist_calc((90, 90, 0), (90, 89, 0))), 0)
self.assertEqual(round(dist_calc((90, 90, 0), (89, 90, 0))), 111)
+ @pytest.mark.network
+ @pytest.mark.flaky(reruns=2)
def test_sim_WA(self):
"""Test feeding both PAZ and seedresp."""
t1 = UTCDateTime("2010-09-3T16:30:00.000")
t2 = UTCDateTime("2010-09-3T17:00:00.000")
fdsn_client = Client('IRIS')
- st = fdsn_client.get_waveforms(network='NZ', station='BFZ',
- location='10',
- channel='HHZ',
- starttime=t1, endtime=t2,
- attach_response=True)
+ st = fdsn_client.get_waveforms(
+ network='NZ', station='BFZ', location='10', channel='HHZ',
+ starttime=t1, endtime=t2, attach_response=True)
tr = st[0]
PAZ = {'poles': [-4.440 + 4.440j, -4.440 - 4.440j, -1.083 + 0.0j],
'zeros': [0.0 + 0.0j, 0.0 + 0.0j, 0.0 + 0.0],
@@ -75,14 +76,10 @@ def test_sim_WA(self):
filename=respf)
date = t1
- seedresp = {'filename': respf, # RESP filename
- 'date': date,
- 'network': tr.stats.network,
- 'station': tr.stats.station,
- 'channel': tr.stats.channel,
- 'location': tr.stats.location,
- # Units to return response in ('DIS', 'VEL' or ACC)
- 'units': 'DIS'
+ seedresp = {
+ 'filename': respf, 'date': date, 'network': tr.stats.network,
+ 'station': tr.stats.station, 'channel': tr.stats.channel,
+ 'location': tr.stats.location, 'units': 'DIS'
}
_sim_WA(trace=tr, PAZ=None, seedresp=seedresp, water_level=10)
@@ -189,10 +186,10 @@ def test_amp_pick_sfile(self):
'01-0411-15L.S201309')
datapath = os.path.join(testing_path, 'WAV', 'TEST_')
respdir = testing_path
- event = amp_pick_sfile(sfile=sfile, datapath=datapath,
- respdir=respdir, chans=['Z'], var_wintype=True,
- winlen=0.9, pre_pick=0.2, pre_filt=True,
- lowcut=1.0, highcut=20.0, corners=4)
+ event = amp_pick_sfile(
+ sfile=sfile, datapath=datapath, respdir=respdir, chans=['Z'],
+ var_wintype=True, winlen=0.9, pre_pick=0.2, pre_filt=True,
+ lowcut=1.0, highcut=20.0, corners=4)
self.assertTrue(isinstance(event, Event))
self.assertTrue(os.path.isfile('mag_calc.out'))
os.remove('mag_calc.out')
@@ -339,5 +336,6 @@ def test_amp_pick_no_prefilt(self):
var_wintype=False, pre_filt=False)
self.assertEqual(len(picked_event.amplitudes), 1)
+
if __name__ == '__main__':
unittest.main()
diff --git a/eqcorrscan/tests/match_filter_test.py b/eqcorrscan/tests/match_filter_test.py
index a2e513366..d7d58a32c 100644
--- a/eqcorrscan/tests/match_filter_test.py
+++ b/eqcorrscan/tests/match_filter_test.py
@@ -9,6 +9,7 @@
import copy
import os
import unittest
+import pytest
import numpy as np
from obspy import read, UTCDateTime, read_events, Catalog, Stream, Trace
@@ -96,30 +97,8 @@ def test_spike_test(self):
_spike_test(stream)
+@pytest.mark.serial
class TestSynthData(unittest.TestCase):
- def test_debug_range(self):
- """Test range of debug outputs"""
- # debug == 3 fails on travis due to plotting restrictions.
- for debug in range(0, 3):
- print('Testing for debug level=%s' % debug)
- try:
- kfalse, ktrue = test_match_filter(debug=debug)
- except RuntimeError:
- print('Error plotting, missing test')
- continue
- if ktrue > 0:
- self.assertTrue(kfalse / ktrue < 0.25)
- else:
- # Randomised data occasionally yields 0 detections
- kfalse, ktrue = test_match_filter(debug=debug)
- self.assertTrue(kfalse / ktrue < 0.25)
- if os.path.isfile('cccsum_0.npy'):
- os.remove('cccsum_0.npy')
- if os.path.isfile('cccsum_1.npy'):
- os.remove('cccsum_1.npy')
- if os.path.isfile('peaks_1970-01-01.pdf'):
- os.remove('peaks_1970-01-01.pdf')
-
def test_threshold_methods(self):
# Test other threshold methods
for threshold_type, threshold in [('absolute', 2),
@@ -156,8 +135,10 @@ def test_onesamp_diff(self):
plotvar=False)
+@pytest.mark.network
class TestGeoNetCase(unittest.TestCase):
@classmethod
+ @pytest.mark.flaky(reruns=2)
def setUpClass(cls):
client = Client('GEONET')
cls.t1 = UTCDateTime(2016, 9, 4)
@@ -250,6 +231,7 @@ def test_no_matching_data(self):
threshold_type='MAD', trig_int=6.0, plotvar=False,
plotdir='.', cores=1)
+ @pytest.mark.flaky(reruns=2)
def test_geonet_tribe_detect(self):
client = Client('GEONET')
# Try to force issues with starting samples on wrong day for geonet
@@ -266,8 +248,10 @@ def test_geonet_tribe_detect(self):
self.assertEqual(len(party), 16)
+@pytest.mark.network
class TestNCEDCCases(unittest.TestCase):
@classmethod
+ @pytest.mark.flaky(reruns=2)
def setUpClass(cls):
print('\t\t\t Downloading data')
client = Client('NCEDC')
@@ -521,8 +505,10 @@ def test_tribe_copy(self):
self.assertEqual(tribe, copied)
+@pytest.mark.network
class TestMatchObjects(unittest.TestCase):
@classmethod
+ @pytest.mark.flaky(reruns=2)
def setUpClass(cls):
print('\t\t\t Downloading data')
client = Client('NCEDC')
@@ -629,6 +615,7 @@ def test_tribe_detect(self):
det.__dict__[key], check_det.__dict__[key])
# self.assertEqual(fam.template, check_fam.template)
+ @pytest.mark.flaky(reruns=2)
def test_client_detect(self):
"""Test the client_detect method."""
client = Client('NCEDC')
diff --git a/eqcorrscan/tests/pre_processing_test.py b/eqcorrscan/tests/pre_processing_test.py
index d3b980833..b9caf1d5e 100644
--- a/eqcorrscan/tests/pre_processing_test.py
+++ b/eqcorrscan/tests/pre_processing_test.py
@@ -19,9 +19,9 @@
class TestPreProcessing(unittest.TestCase):
@classmethod
def setUpClass(cls):
- testing_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
- 'test_data', 'day_vols', 'Y2012',
- 'R086.01', '*')
+ testing_path = os.path.join(
+ os.path.abspath(os.path.dirname(__file__)), 'test_data',
+ 'day_vols', 'Y2012', 'R086.01', '*')
cls.st = read(testing_path)
cls.day_start = cls.st[0].stats.starttime.date
cls.short_stream = cls.st.copy().trim(cls.st[0].stats.starttime,
diff --git a/eqcorrscan/tests/sfile_util_test.py b/eqcorrscan/tests/sfile_util_test.py
index cd34e57f2..6d958e1d7 100644
--- a/eqcorrscan/tests/sfile_util_test.py
+++ b/eqcorrscan/tests/sfile_util_test.py
@@ -7,6 +7,7 @@
from __future__ import unicode_literals
import unittest
+import pytest
from eqcorrscan.utils.sfile_util import eventtosfile, readwavename, readpicks
from eqcorrscan.utils.sfile_util import _nortoevmag, _evmagtonor, nordpick
@@ -15,6 +16,8 @@
class TestSfileMethods(unittest.TestCase):
+ @pytest.mark.network
+ @pytest.mark.flaky(reruns=2)
def test_download_write(self):
"""
Function to download quakeML files from a range of datacenters and \
@@ -328,7 +331,6 @@ def test_read_empty_header(self):
by Dominic Evanzia.
"""
import os
- import numpy as np
testing_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
'test_data')
test_event = readpicks(os.path.join(testing_path, 'Sfile_no_header'))
@@ -391,6 +393,8 @@ def test_read_wavename(self):
wavefiles = readwavename(testing_path)
self.assertEqual(len(wavefiles), 1)
+ @pytest.mark.network
+ @pytest.mark.flaky(reruns=2)
def test_station_to_seisan(self):
from obspy.clients.fdsn import Client
from obspy import UTCDateTime
diff --git a/eqcorrscan/tests/subspace_test.py b/eqcorrscan/tests/subspace_test.py
index c74b2015a..250e28228 100644
--- a/eqcorrscan/tests/subspace_test.py
+++ b/eqcorrscan/tests/subspace_test.py
@@ -9,6 +9,7 @@
import numpy as np
import unittest
+import pytest
import os
import copy
@@ -96,11 +97,13 @@ def test_stat(self):
self.assertEqual((stat.max().round(6) - 0.229755).round(6), 0)
+@pytest.mark.network
class SubspaceTestingMethods(unittest.TestCase):
"""
Main tests for the subspace module.
"""
@classmethod
+ @pytest.mark.flaky(reruns=2)
def setUpClass(cls):
"""Set up the test templates."""
cls.templates, cls.st = get_test_data()
diff --git a/eqcorrscan/tests/template_gen_test.py b/eqcorrscan/tests/template_gen_test.py
index 7cabe9655..30c451007 100644
--- a/eqcorrscan/tests/template_gen_test.py
+++ b/eqcorrscan/tests/template_gen_test.py
@@ -7,6 +7,7 @@
from __future__ import unicode_literals
import unittest
+import pytest
import glob
import os
import numpy as np
@@ -16,7 +17,6 @@
import copy
from obspy import read, UTCDateTime, read_events, Stream
-from obspy.clients.fdsn.header import FDSNException
from obspy.clients.fdsn import Client
from obspy.core.event import Catalog, Event, Origin, Pick, WaveformStreamID
@@ -54,6 +54,8 @@ def test_sac_template_gen(self):
for tr in template:
self.assertEqual(len(tr.data), length * samp_rate)
+ @pytest.mark.network
+ @pytest.mark.flaky(reruns=2)
def test_tutorial_template_gen(self):
"""Test template generation from tutorial, uses from_client method.
@@ -61,11 +63,7 @@ def test_tutorial_template_gen(self):
"""
testing_path = os.path.join(
os.path.abspath(os.path.dirname(__file__)), 'test_data')
- try:
- mktemplates(plot=False)
- except FDSNException:
- warnings.warn('FDSN error, is server down?')
- return
+ mktemplates(plot=False)
for template_no in range(4):
template = read('tutorial_template_' + str(template_no) + '.ms')
expected_template = read(os.path.join(
@@ -78,19 +76,20 @@ def test_tutorial_template_gen(self):
del(template)
os.remove('tutorial_template_' + str(template_no) + '.ms')
+ @pytest.mark.network
+ @pytest.mark.flaky(reruns=2)
def test_not_delayed(self):
"""Test the method of template_gen without applying delays to
channels."""
- cat = get_geonet_events(minlat=-40.98, maxlat=-40.85, minlon=175.4,
- maxlon=175.5,
- startdate=UTCDateTime(2016, 5, 1),
- enddate=UTCDateTime(2016, 5, 2))
+ cat = get_geonet_events(
+ minlat=-40.98, maxlat=-40.85, minlon=175.4, maxlon=175.5,
+ startdate=UTCDateTime(2016, 5, 1), enddate=UTCDateTime(2016, 5, 2))
cat = filter_picks(catalog=cat, top_n_picks=5)
- template = from_client(catalog=cat, client_id='GEONET',
- lowcut=None, highcut=None, samp_rate=100.0,
- filt_order=4, length=10.0, prepick=0.5,
- swin='all', process_len=3600,
- debug=0, plot=False, delayed=False)[0]
+ template = from_client(
+ catalog=cat, client_id='GEONET', lowcut=None, highcut=None,
+ samp_rate=100.0, filt_order=4, length=10.0, prepick=0.5,
+ swin='all', process_len=3600, debug=0, plot=False,
+ delayed=False)[0]
for tr in template:
tr.stats.starttime.precision = 6
starttime = template[0].stats.starttime
@@ -101,6 +100,8 @@ def test_not_delayed(self):
tr.stats.delta)
self.assertEqual(tr.stats.npts, length)
+ @pytest.mark.network
+ @pytest.mark.flaky(reruns=2)
def test_download_various_methods(self):
"""
Will download data from server and store in various databases,
@@ -109,8 +110,8 @@ def test_download_various_methods(self):
client = Client('GEONET')
# get the events
catalog = Catalog()
- data_stream = client._download('http://quakeml.geonet.org.nz/' +
- 'quakeml/1.2/2016p008194')
+ data_stream = client._download(
+ 'http://quakeml.geonet.org.nz/quakeml/1.2/2016p008194')
data_stream.seek(0, 0)
catalog += read_events(data_stream, format="quakeml")
data_stream.close()
@@ -253,16 +254,16 @@ def test_upsample_error(self):
class TestEdgeGen(unittest.TestCase):
@classmethod
def setUpClass(cls):
- cls.testing_path = os.path.dirname(os.path.abspath(inspect.getfile(
- inspect.currentframe())))
- cls.st = read(os.path.join(cls.testing_path, 'test_data',
- 'WAV', 'TEST_',
- '2013-09-15-0930-28.DFDPC_027_00'))
+ cls.testing_path = os.path.dirname(
+ os.path.abspath(inspect.getfile(inspect.currentframe())))
+ cls.st = read(os.path.join(
+ cls.testing_path, 'test_data', 'WAV', 'TEST_',
+ '2013-09-15-0930-28.DFDPC_027_00'))
for tr in cls.st:
tr.stats.channel = tr.stats.channel[0] + tr.stats.channel[-1]
- event = read_event(os.path.join(cls.testing_path, 'test_data',
- 'REA', 'TEST_',
- '15-0931-08L.S201309'))
+ event = read_event(os.path.join(
+ cls.testing_path, 'test_data', 'REA', 'TEST_',
+ '15-0931-08L.S201309'))
cls.picks = event.picks
def test_undefined_phase_type(self):
@@ -310,10 +311,9 @@ def test_extract_from_stack_and_process(self):
length = 3
stack = self.st.copy()
template = template_gen(self.picks, self.st.copy(), 2)
- extracted = extract_from_stack(stack, template, length=length,
- pre_pick=0.3, pre_pad=45,
- pre_processed=False, samp_rate=20,
- lowcut=2, highcut=8)
+ extracted = extract_from_stack(
+ stack, template, length=length, pre_pick=0.3, pre_pad=45,
+ pre_processed=False, samp_rate=20, lowcut=2, highcut=8)
self.assertEqual(len(template), len(extracted))
for tr in extracted:
self.assertEqual(tr.stats.endtime - tr.stats.starttime,
@@ -323,9 +323,9 @@ def test_extract_from_stack_including_z(self):
length = 3
stack = self.st.copy()
template = template_gen(self.picks, self.st.copy(), 2)
- extracted = extract_from_stack(stack, template, length=length,
- pre_pick=0.3, pre_pad=45,
- Z_include=True)
+ extracted = extract_from_stack(
+ stack, template, length=length, pre_pick=0.3, pre_pad=45,
+ Z_include=True)
self.assertLess(len(template), len(extracted))
for tr in extracted:
self.assertEqual(tr.stats.endtime - tr.stats.starttime,
diff --git a/eqcorrscan/tests/tutorials_test.py b/eqcorrscan/tests/tutorials_test.py
index 230cd637c..629181b89 100644
--- a/eqcorrscan/tests/tutorials_test.py
+++ b/eqcorrscan/tests/tutorials_test.py
@@ -31,6 +31,7 @@ def setUpClass(cls):
os.path.abspath(os.path.dirname(__file__)), 'test_data')
@slow
+ @pytest.mark.flaky(reruns=2)
def test_templates_and_match(self):
"""Call the template creation then the matched-filter tests."""
mktemplates(plot=False)
@@ -74,6 +75,7 @@ def test_templates_and_match(self):
os.remove('tutorial_template_' + str(template_no) + '.ms')
@slow
+ @pytest.mark.flaky(reruns=2)
def test_lag_calc(self):
"""Test the lag calculation tutorial."""
shift_len = 0.2
@@ -102,6 +104,7 @@ def test_lag_calc(self):
self.assertTrue(abs(re_picked_delay) < shift_len)
@slow
+ @pytest.mark.flaky(reruns=2)
def test_subspace(self):
"""Test the subspace tutorial."""
detections = subspace.run_tutorial(plot=False)
diff --git a/eqcorrscan/utils/__init__.py b/eqcorrscan/utils/__init__.py
index 3f35af8bf..a3b0715c4 100644
--- a/eqcorrscan/utils/__init__.py
+++ b/eqcorrscan/utils/__init__.py
@@ -12,8 +12,8 @@
__all__ = ['archive_read', 'catalog_to_dd', 'catalog_utils', 'clustering',
- 'correlate', 'despike', 'findpeaks', 'mag_calc', 'normalise',
- 'parameters', 'picker', 'plotting', 'pre_processing',
+ 'correlate', 'debug_log', 'despike', 'findpeaks', 'mag_calc',
+ 'normalise', 'parameters', 'picker', 'plotting', 'pre_processing',
'sac_util', 'seismo_logs', 'sfile_util', 'stacking',
'synth_seis', 'timer', 'trigger']
diff --git a/eqcorrscan/utils/archive_read.py b/eqcorrscan/utils/archive_read.py
index e22df1890..36919387c 100644
--- a/eqcorrscan/utils/archive_read.py
+++ b/eqcorrscan/utils/archive_read.py
@@ -30,7 +30,7 @@ def read_data(archive, arc_type, day, stachans, length=86400):
client. If arc_type is day_vols, then this is the path to the top
directory.
:type arc_type: str
- :param arc_type: The type of archive, can be: seishub, FDSN, day_volves
+ :param arc_type: The type of archive, can be: seishub, FDSN, day_volumes
:type day: datetime.date
:param day: Date to retrieve data for
:type stachans: list
@@ -55,27 +55,24 @@ def read_data(archive, arc_type, day, stachans, length=86400):
>>> from eqcorrscan.utils.archive_read import read_data
>>> from obspy import UTCDateTime
>>> t1 = UTCDateTime(2012, 3, 26)
- >>> stachans = [('FOZ', 'HHZ'), ('JCZ', 'HHZ')]
- >>> st = read_data('GEONET', 'FDSN', t1, stachans)
+ >>> stachans = [('JCNB', 'SP1')]
+ >>> st = read_data('NCEDC', 'FDSN', t1, stachans)
>>> print(st)
- 2 Trace(s) in Stream:
- NZ.FOZ.10.HHZ | 2012-03-25T23:59:57.018393Z - 2012-03-27T00:00:00.688393Z \
-| 100.0 Hz, 8640368 samples
- NZ.JCZ.10.HHZ | 2012-03-25T23:59:57.348391Z - 2012-03-27T00:00:02.958391Z \
-| 100.0 Hz, 8640562 samples
-
+ 1 Trace(s) in Stream:
+ BP.JCNB.40.SP1 | 2012-03-26T00:00:00.000000Z - 2012-03-26T23:59:59.\
+950000Z | 20.0 Hz, 1728000 samples
.. rubric:: Example, missing data
>>> from eqcorrscan.utils.archive_read import read_data
>>> from obspy import UTCDateTime
>>> t1 = UTCDateTime(2012, 3, 26)
- >>> stachans = [('FOZ', 'HHZ'), ('GCSZ', 'HHZ')]
- >>> st = read_data('GEONET', 'FDSN', t1, stachans)
+ >>> stachans = [('JCNB', 'SP1'), ('GCSZ', 'HHZ')]
+ >>> st = read_data('NCEDC', 'FDSN', t1, stachans)
>>> print(st)
1 Trace(s) in Stream:
- NZ.FOZ.10.HHZ | 2012-03-25T23:59:57.018393Z - 2012-03-27T00:00:00.688393Z \
-| 100.0 Hz, 8640368 samples
+ BP.JCNB.40.SP1 | 2012-03-26T00:00:00.000000Z - 2012-03-26T23:59:59.\
+950000Z | 20.0 Hz, 1728000 samples
.. rubric:: Example, local day-volumes
diff --git a/eqcorrscan/utils/clustering.py b/eqcorrscan/utils/clustering.py
index 5475f73dc..8c3ef24bd 100644
--- a/eqcorrscan/utils/clustering.py
+++ b/eqcorrscan/utils/clustering.py
@@ -13,24 +13,25 @@
from __future__ import print_function
from __future__ import unicode_literals
-import numpy as np
-import warnings
-import matplotlib.pyplot as plt
import os
-
+import warnings
from multiprocessing import Pool, cpu_count
-from scipy.spatial.distance import squareform
-from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
-from obspy.signal.cross_correlation import xcorr
+
+import matplotlib.pyplot as plt
+import numpy as np
from obspy import Stream, Catalog, UTCDateTime, Trace
+from obspy.signal.cross_correlation import xcorr
+from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
+from scipy.spatial.distance import squareform
-from eqcorrscan.utils.mag_calc import dist_calc
-from eqcorrscan.utils.correlate import time_multi_normxcorr
from eqcorrscan.utils import stacking
from eqcorrscan.utils.archive_read import read_data
+from eqcorrscan.utils.correlate import get_array_xcorr
+from eqcorrscan.utils.mag_calc import dist_calc
-def cross_chan_coherence(st1, st2, allow_shift=False, shift_len=0.2, i=0):
+def cross_chan_coherence(st1, st2, allow_shift=False, shift_len=0.2, i=0,
+ xcorr_func='time_domain'):
"""
Calculate cross-channel coherency.
@@ -49,6 +50,11 @@ def cross_chan_coherence(st1, st2, allow_shift=False, shift_len=0.2, i=0):
:param shift_len: Samples to shift, only used if `allow_shift=True`
:type i: int
:param i: index used for parallel async processing, returned unaltered
+ :type xcorr_func: str, callable
+ :param xcorr_func:
+ The method for performing correlations. Accepts either a string or
+ callable. See :func:`eqcorrscan.utils.correlate.register_array_xcorr`
+ for more details
:returns:
cross channel coherence, float - normalized by number of channels,
@@ -57,6 +63,7 @@ def cross_chan_coherence(st1, st2, allow_shift=False, shift_len=0.2, i=0):
"""
cccoh = 0.0
kchan = 0
+ array_xcorr = get_array_xcorr(xcorr_func)
for tr in st1:
tr2 = st2.select(station=tr.stats.station,
channel=tr.stats.channel)
@@ -71,7 +78,7 @@ def cross_chan_coherence(st1, st2, allow_shift=False, shift_len=0.2, i=0):
kchan += 1
elif len(tr2) > 0:
min_len = min(len(tr.data), len(tr2[0].data))
- cccoh += time_multi_normxcorr(
+ cccoh += array_xcorr(
np.array([tr.data[0:min_len]]), tr2[0].data[0:min_len],
[0])[0][0][0]
kchan += 1
@@ -297,7 +304,7 @@ def group_delays(stream_list):
if chan in group_chans[j]:
shared_chans.append(chan)
shared_delays_slave.append(delays[k])
- shared_delays_master.\
+ shared_delays_master. \
append(group_delays[j][group_chans[j].index(chan)])
# Normalize master and slave delay times
shared_delays_slave = [delay - min(shared_delays_slave)
@@ -382,7 +389,7 @@ def svd(stream_list, full=False):
channel=stachan[1])
if chan:
if len(chan[0].data) > min_length:
- if abs(len(chan[0].data) - min_length) > 0.1 *\
+ if abs(len(chan[0].data) - min_length) > 0.1 * \
chan[0].stats.sampling_rate:
raise IndexError('More than 0.1 s length '
'difference, align and fix')
@@ -402,7 +409,7 @@ def svd(stream_list, full=False):
svalues.append(s)
svectors.append(v)
uvectors.append(u)
- del(chan_mat)
+ del chan_mat
return uvectors, svalues, svectors, stachans
@@ -448,12 +455,12 @@ def empirical_svd(stream_list, linear=True):
tr = st.select(station=stachan[0],
channel=stachan[1])[0]
if len(tr.data) > min_length:
- if abs(len(tr.data) - min_length) > (0.1 *
- tr.stats.sampling_rate):
- raise IndexError('More than 0.1 s length '
- 'difference, align and fix')
- warnings.warn(str(tr) + ' is not the same length as others, ' +
- 'trimming the end')
+ sr = tr.stats.sampling_rate
+ if abs(len(tr.data) - min_length) > (0.1 * sr):
+ msg = 'More than 0.1 s length difference, align and fix'
+ raise IndexError(msg)
+ msg = ' is not the same length as others, trimming the end'
+ warnings.warn(str(tr) + msg)
tr.data = tr.data[0:min_length]
if linear:
first_subspace = stacking.linstack(stream_list)
@@ -462,8 +469,9 @@ def empirical_svd(stream_list, linear=True):
second_subspace = first_subspace.copy()
for i in range(len(second_subspace)):
second_subspace[i].data = np.diff(second_subspace[i].data)
- second_subspace[i].stats.starttime += 0.5 * \
- second_subspace[i].stats.delta
+ delta = second_subspace[i].stats.delta
+ second_subspace[i].stats.starttime += 0.5 * delta
+
return [first_subspace, second_subspace]
@@ -542,8 +550,9 @@ def corr_cluster(trace_list, thresh=0.9):
stack = stacking.linstack([Stream(tr) for tr in trace_list])[0]
output = np.array([False] * len(trace_list))
group1 = []
+ array_xcorr = get_array_xcorr()
for i, tr in enumerate(trace_list):
- if time_multi_normxcorr(
+ if array_xcorr(
np.array([tr.data]), stack.data, [0])[0][0][0] > 0.6:
output[i] = True
group1.append(tr)
@@ -553,7 +562,7 @@ def corr_cluster(trace_list, thresh=0.9):
stack = stacking.linstack([Stream(tr) for tr in group1])[0]
group2 = []
for i, tr in enumerate(trace_list):
- if time_multi_normxcorr(
+ if array_xcorr(
np.array([tr.data]), stack.data, [0])[0][0][0] > thresh:
group2.append(tr)
output[i] = True
@@ -750,10 +759,9 @@ def extract_detections(detections, templates, archive, arc_type,
detection.detect_time.strftime('%Y/%m/%d %H:%M:%S'))
detect_wav = st.copy()
for tr in detect_wav:
- tr.trim(starttime=UTCDateTime(detection.detect_time) -
- extract_len / 2,
- endtime=UTCDateTime(detection.detect_time) +
- extract_len / 2)
+ t1 = UTCDateTime(detection.detect_time) - extract_len / 2
+ t2 = UTCDateTime(detection.detect_time) + extract_len / 2
+ tr.trim(starttime=t1, endtime=t2)
if outdir:
if not os.path.isdir(os.path.join(outdir,
detection.template_name)):
@@ -766,7 +774,7 @@ def extract_detections(detections, templates, archive, arc_type,
print('Written file: %s' %
'/'.join([outdir, detection.template_name,
detection.detect_time.
- strftime('%Y-%m-%d_%H-%M-%S') + '.ms']))
+ strftime('%Y-%m-%d_%H-%M-%S') + '.ms']))
if not outdir:
detection_wavefiles.append(detect_wav)
del detect_wav
@@ -987,9 +995,11 @@ def re_thresh_csv(path, old_thresh, new_thresh, chan_thresh):
detections_out = 0
for detection in old_detections:
detections_in += 1
- if abs(detection.detect_val) >=\
- (new_thresh / old_thresh) * detection.threshold and\
- detection.no_chans >= chan_thresh:
+ required_thresh = (new_thresh / old_thresh) * detection.threshold
+ con1 = abs(detection.detect_val) >= required_thresh
+ con2 = detection.no_chans >= chan_thresh
+ if con1 and con2:
detections_out += 1
detections.append(detection)
print('Read in %i detections' % detections_in)
@@ -999,4 +1009,5 @@ def re_thresh_csv(path, old_thresh, new_thresh, chan_thresh):
if __name__ == "__main__":
import doctest
+
doctest.testmod()
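The recurring pattern in this module after the change: fetch an array-interface correlation function once, then feed it one (template, channel) pair at a time. Reduced to its essentials (illustrative data):

    import numpy as np
    from eqcorrscan.utils.correlate import get_array_xcorr

    array_xcorr = get_array_xcorr('time_domain')  # or 'numpy', 'fftw', a callable
    tr1, tr2 = np.random.randn(200), np.random.randn(200)
    min_len = min(len(tr1), len(tr2))
    # one template, one channel, zero pad: the single returned value is this
    # channel pair's contribution to the coherence
    cc = array_xcorr(np.array([tr1[:min_len]]), tr2[:min_len], [0])[0][0][0]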
diff --git a/eqcorrscan/utils/correlate.py b/eqcorrscan/utils/correlate.py
index 07c9c1301..fcc3f908e 100644
--- a/eqcorrscan/utils/correlate.py
+++ b/eqcorrscan/utils/correlate.py
@@ -5,7 +5,7 @@
routine using FFTW, a Numpy fft routine which uses bottleneck for normalisation
and a compiled time-domain routine. These have varying levels of efficiency,
both in terms of overall speed, and in memory usage. The time-domain is the
-most memory efficient but slowest routine (although fastest for small cases of
+most memory efficient but slowest routine (although fast for small cases of
less than a few hundred correlations), the Numpy routine is fast, but memory
inefficient due to a need to store large double-precision arrays for
normalisation. The fftw compiled routine is fastest and more memory efficient
@@ -24,18 +24,292 @@
from __future__ import print_function
from __future__ import unicode_literals
-import numpy as np
+import contextlib
+import copy
import ctypes
+import os
+from multiprocessing import Pool as ProcessPool, cpu_count
+from multiprocessing.pool import ThreadPool
+
+import numpy as np
from future.utils import native_str
-from multiprocessing import Pool
-from scipy.fftpack.helper import next_fast_len
from eqcorrscan.utils.libnames import _load_cdll
+# This is for building docs on readthedocs, which has an old version of
+# scipy - without this guard the module cannot be imported, breaking the docs.
+# Instead we define a dummy function that returns what it is given.
+READ_THE_DOCS = os.environ.get('READTHEDOCS', None) == 'True'
+if not READ_THE_DOCS:
+ from scipy.fftpack.helper import next_fast_len
+else:
+ def next_fast_len(a):
+ return a
+
+XCOR_FUNCS = {} # cache of functions for doing cross correlations
+
+# methods added to each xcorr func registered
+# these implement the stream interface
+XCORR_STREAM_METHODS = ('multithread', 'multiprocess', 'concurrent',
+ 'stream_xcorr')
+
+# these implement the array interface
+XCOR_ARRAY_METHODS = ('array_xcorr',)
+
+
+# ------------------ Context manager for switching out default
+class _Context:
+ """ class for permanently or temporarily changing items in a dict """
+
+ def __init__(self, cache, value_to_switch):
+ """
+ :type cache: dict
+ :param cache: A dict to store values in
+ :type value_to_switch: str
+ :param value_to_switch:
+ The key in cache to switch based on different contexts
+ """
+ self.cache = cache
+ self.value_to_switch = value_to_switch
+ self.previous_value = None
+
+ def __call__(self, new_value, *args, **kwargs):
+ """ # TODO change docs if this ever becomes general use
+ Set a new value for the default xcorr function.
+
+ This function can be called directly to permanently change the
+ default normxcorr function or it may be used as a context manager
+ to only modify it temporarily.
+
+ :param new_value: The new xcorr function, or the name of a registered one.
+ :return: This _Context instance, so it can also serve as a context manager.
+ """
+ self.previous_value = self.cache.get(self.value_to_switch)
+ self.cache[self.value_to_switch] = get_array_xcorr(new_value)
+ return self
+
+ def __enter__(self):
+ pass
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self.revert()
+
+ def __repr__(self):
+ """ this hides the fact _Context instance are returned after calls """
+ name = self.cache[self.value_to_switch].__str__()
+ if hasattr(self.cache[self.value_to_switch], '__name__'):
+ name = self.cache[self.value_to_switch].__name__
+ out_str = ("%s changed to %s" % (self.value_to_switch, name))
+ return out_str
+
+ def revert(self):
+ """ revert the default xcorr function to previous value """
+ new_value = self.previous_value
+ self(new_value)
+
+
+set_xcorr = _Context(XCOR_FUNCS, 'default')
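As the tests earlier in this patch exercise, set_xcorr can be called directly (a lasting switch) or used as a context manager (reverted on exit); a short sketch:

    from eqcorrscan.utils import correlate as corr

    # temporary: the previous default is restored when the block exits
    with corr.set_xcorr('numpy'):
        assert corr.get_array_xcorr() is corr.numpy_normxcorr

    # lasting: stays in effect until changed again
    corr.set_xcorr('fftw')  # assumes the 'fftw' backend name is registered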
+
+
+# ---------------------- generic concurrency functions
+
+@contextlib.contextmanager
+def _pool_boy(Pool, traces, **kwargs):
+ """
+ A context manager for handling the setup and cleanup of a pool object.
+
+ :param Pool: any Class (not instance) that implements the multiprocessing
+ Pool interface
+ :param traces: The number of traces to process
+ :type traces: int
+ """
+ # All parallel processing happens on a per-trace basis, so we shouldn't
+ # create more workers than there are traces
+ n_cores = kwargs.get('cores', cpu_count())
+ if n_cores > traces:
+ n_cores = traces
+ pool = Pool(n_cores)
+ yield pool
+ pool.close()
+ pool.join()
+
+
+def _pool_normxcorr(templates, stream, pool, func, *args, **kwargs):
+ chans = [[] for _i in range(len(templates))]
+ array_dict_tuple = _get_array_dicts(templates, stream)
+ stream_dict, template_dict, pad_dict, seed_ids = array_dict_tuple
+ # get parameter iterator
+ params = ((template_dict[sid], stream_dict[sid], pad_dict[sid])
+ for sid in seed_ids)
+ # get cc results and used chans into their own lists
+ results = [pool.apply_async(func, param) for param in params]
+ xcorrs, tr_chans = zip(*(res.get() for res in results))
+ cccsums = np.sum(xcorrs, axis=0)
+ no_chans = np.sum(np.array(tr_chans).astype(np.int), axis=0)
+ for seed_id, tr_chan in zip(seed_ids, tr_chans):
+ for chan, state in zip(chans, tr_chan):
+ if state:
+ chan.append((seed_id.split('.')[1],
+ seed_id.split('.')[-1].split('_')[0]))
+ return cccsums, no_chans, chans
+
+
+def _general_multithread(func):
+ """ return the general multithreading function using func """
+
+ def multithread(templates, stream, *args, **kwargs):
+ with _pool_boy(ThreadPool, len(stream), **kwargs) as pool:
+ return _pool_normxcorr(templates, stream, pool=pool, func=func)
+
+ return multithread
+
+
+def _general_multiprocess(func):
+ def multiproc(templates, stream, *args, **kwargs):
+ with _pool_boy(ProcessPool, len(stream), **kwargs) as pool:
+ return _pool_normxcorr(templates, stream, pool=pool, func=func)
-def numpy_normxcorr(templates, stream, pads):
+ return multiproc
+
+
+def _general_serial(func):
+ def stream_xcorr(templates, stream, *args, **kwargs):
+ no_chans = np.zeros(len(templates))
+ chans = [[] for _ in range(len(templates))]
+ array_dict_tuple = _get_array_dicts(templates, stream)
+ stream_dict, template_dict, pad_dict, seed_ids = array_dict_tuple
+ cccsums = np.zeros([len(templates),
+ len(stream[0]) - len(templates[0][0]) + 1])
+ for seed_id in seed_ids:
+ tr_cc, tr_chans = func(template_dict[seed_id],
+ stream_dict[seed_id],
+ pad_dict[seed_id])
+ cccsums = np.sum([cccsums, tr_cc], axis=0)
+ no_chans += tr_chans.astype(np.int)
+ for chan, state in zip(chans, tr_chans):
+ if state:
+ chan.append((seed_id.split('.')[1],
+ seed_id.split('.')[-1].split('_')[0]))
+ return cccsums, no_chans, chans
+
+ return stream_xcorr
+
+
+def register_array_xcorr(name, func=None, is_default=False):
"""
- Compute the normalized cross-correlation of multiple templates with data.
+ Decorator for registering correlation functions.
+
+ Each function must have the same interface as numpy_normxcorr, which is
+ *f(templates, stream, pads, *args, **kwargs)*; any number of
+ function-specific kwargs can be used.
+
+ register_array_xcorr can be used as a decorator (with or without
+ arguments) or as a callable.
+
+ :param name: The name of the function for quick access, or the callable
+ that will be wrapped when used as a decorator.
+ :type name: str, callable
+ :param func: The function to register
+ :type func: callable, optional
+ :param is_default: True if this function should be marked as default
+ normxcorr
+ :type is_default: bool
+
+ :return: callable
+ """
+ valid_methods = set(list(XCOR_ARRAY_METHODS) + list(XCORR_STREAM_METHODS))
+ cache = {}
+
+ def register(register_str):
+ """
+ Register a function as an implementation.
+
+ :param register_str: The registration designation
+ :type register_str: str
+ """
+ if register_str not in valid_methods:
+ msg = 'register_str must be in %s' % valid_methods
+ raise ValueError(msg)
+
+ def _register(func):
+ cache[register_str] = func
+ setattr(cache['func'], register_str, func)
+ return func
+
+ return _register
+
+ def wrapper(func, func_name=None):
+ # register the functions in the XCOR
+ fname = func_name or (name.__name__ if callable(name) else str(name))
+ XCOR_FUNCS[fname] = func
+ if is_default: # set function as default
+ XCOR_FUNCS['default'] = func
+ # attach some attrs, this is a bit of a hack to avoid pickle problems
+ func.register = register
+ cache['func'] = func
+ func.multithread = _general_multithread(func)
+ func.multiprocess = _general_multiprocess(func)
+ func.concurrent = _general_multithread(func)
+ func.stream_xcorr = _general_serial(func)
+ func.array_xcorr = func
+ func.registered = True
+ return func
+
+ # used as a decorator
+ if callable(name):
+ return wrapper(name)
+
+ # used as a normal function (called and passed a function)
+ if callable(func):
+ return wrapper(func, func_name=name)
+
+ # called, then used as a decorator
+ return wrapper
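The three registration styles the docstring above describes, side by side (sketch; the function names are made up):

    from eqcorrscan.utils.correlate import register_array_xcorr

    # 1. bare decorator -- registered under the function's own name
    @register_array_xcorr
    def corr_a(templates, stream, pads, *args, **kwargs):
        pass

    # 2. decorator with a name argument
    @register_array_xcorr(name='corr_b')
    def corr_b(templates, stream, pads, *args, **kwargs):
        pass

    # 3. plain call, passing a name and a function
    register_array_xcorr(name='corr_c', func=corr_b)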
+
+
+# ------------------ array_xcorr fetching functions
+
+
+def _get_registered_func(name_or_func):
+ """ get a xcorr function from a str or callable. """
+ # get the function or register callable
+ if callable(name_or_func):
+ func = register_array_xcorr(name_or_func)
+ else:
+ func = XCOR_FUNCS[name_or_func or 'default']
+ assert callable(func), 'func is not callable'
+ # ensure func has the added methods
+ if not hasattr(func, 'registered'):
+ func = register_array_xcorr(func)
+ return func
+
+
+def get_array_xcorr(name_or_func=None):
+ """
+ Get a normalized cross-correlation function that takes arrays as inputs.
+
+ See :func:`eqcorrscan.utils.correlate.array_normxcorr` for expected
+ function signature.
+
+ :param name_or_func: Either a name of a registered xcorr function or a
+ callable that implements the standard array_normxcorr signature.
+ :type name_or_func: str or callable
+
+ :return: callable with the array_normxcorr interface
+
+ See also :func:`eqcorrscan.utils.correlate.get_stream_xcorr`
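+
+ .. rubric:: Example
+
+ A minimal sketch using the built-in 'numpy' backend on random arrays:
+
+ >>> import numpy as np
+ >>> xcorr_func = get_array_xcorr('numpy')
+ >>> templates = np.random.randn(2, 20).astype(np.float32)
+ >>> stream = np.random.randn(100).astype(np.float32)
+ >>> ccc, used_chans = xcorr_func(templates, stream, pads=[0, 0])
+ >>> ccc.shape
+ (2, 81)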
+ """
+ func = _get_registerd_func(name_or_func)
+ return func.array_xcorr
+
+
+# ----------------------- registered array_xcorr functions
+
+
+@register_array_xcorr('numpy')
+def numpy_normxcorr(templates, stream, pads, *args, **kwargs):
+ """
+ Compute the normalized cross-correlation using numpy and bottleneck.
:param templates: 2D Array of templates
:type templates: np.ndarray
@@ -64,6 +338,8 @@ def numpy_normxcorr(templates, stream, pads):
stream, template_length)[template_length - 1:]
stream_std_array = bottleneck.move_std(
stream, template_length)[template_length - 1:]
+ # because stream_std_array is in the denominator of res, NaN all zeros
+ stream_std_array[stream_std_array == 0] = np.nan
# Normalize and flip the templates
norm = ((templates - templates.mean(axis=-1, keepdims=True)) / (
templates.std(axis=-1, keepdims=True) * template_length))
@@ -75,134 +351,15 @@ def numpy_normxcorr(templates, stream, pads):
res = ((_centered(res, stream_length - template_length + 1)) -
norm_sum * stream_mean_array) / stream_std_array
res[np.isnan(res)] = 0.0
- for i in range(len(pads)):
- res[i] = np.append(res[i], np.zeros(pads[i]))[pads[i]:]
+ for i, pad in enumerate(pads):
+ res[i] = np.append(res[i], np.zeros(pad))[pad:]
return res.astype(np.float32), used_chans
-def multichannel_normxcorr(templates, stream, cores=1, time_domain=False,
- openmp=False):
- """
- Cross-correlate multiple channels either in parallel or not
-
- :type templates: list
- :param templates:
- A list of templates, where each one should be an obspy.Stream object
- containing multiple traces of seismic data and the relevant header
- information.
- :type stream: obspy.core.stream.Stream
- :param stream:
- A single Stream object to be correlated with the templates.
- :type cores: int
- :param cores:
- Number of processes to use, if set to None, no Python multiprocessing
- will be done.
- :type cores: int
- :param cores: Number of cores to loop over
- :type time_domain: bool
- :param time_domain:
- Whether to compute in the time-domain using the compiled openMP
- parallel cross-correlation routine.
- :type openmp: bool
- :param openmp: Whether to use the openmp, compiled threaded loop or not.
-
- :returns:
- New list of :class:`numpy.ndarray` objects. These will contain
- the correlation sums for each template for this day of data.
- :rtype: list
- :returns:
- list of ints as number of channels used for each cross-correlation.
- :rtype: list
- :returns:
- list of list of tuples of station, channel for all cross-correlations.
- :rtype: list
-
- .. Note::
- Each template must contain the same channels as every other template,
- the stream must also contain the same channels (note that if there
- are duplicate channels in the template you do not need duplicate
- channels in the stream).
- """
- no_chans = np.zeros(len(templates))
- chans = [[] for _i in range(len(templates))]
- # Do some reshaping
- stream.sort(['network', 'station', 'location', 'channel'])
- t_starts = []
- for template in templates:
- template.sort(['network', 'station', 'location', 'channel'])
- t_starts.append(min([tr.stats.starttime for tr in template]))
- seed_ids = [tr.id + '_' + str(i) for i, tr in enumerate(templates[0])]
- template_array = {}
- stream_array = {}
- pad_array = {}
- for i, seed_id in enumerate(seed_ids):
- t_ar = np.array([template[i].data
- for template in templates]).astype(np.float32)
- template_array.update({seed_id: t_ar})
- stream_array.update(
- {seed_id: stream.select(
- id=seed_id.split('_')[0])[0].data.astype(np.float32)})
- pad_list = [
- int(round(template[i].stats.sampling_rate *
- (template[i].stats.starttime - t_starts[j])))
- for j, template in zip(range(len(templates)), templates)]
- pad_array.update({seed_id: pad_list})
- if cores is None and not openmp:
- cccsums = np.zeros([len(templates),
- len(stream[0]) - len(templates[0][0]) + 1])
- for seed_id in seed_ids:
- if time_domain:
- tr_xcorrs, tr_chans = time_multi_normxcorr(
- templates=template_array[seed_id],
- stream=stream_array[seed_id], pads=pad_array[seed_id])
- else:
- tr_xcorrs, tr_chans = fftw_normxcorr(
- templates=template_array[seed_id],
- stream=stream_array[seed_id], pads=pad_array[seed_id],
- threaded=True)
- cccsums = np.sum([cccsums, tr_xcorrs], axis=0)
- no_chans += tr_chans.astype(np.int)
- for chan, state in zip(chans, tr_chans):
- if state:
- chan.append((seed_id.split('.')[1],
- seed_id.split('.')[-1].split('_')[0]))
- elif not openmp:
- pool = Pool(processes=cores)
- if time_domain:
- results = [pool.apply_async(time_multi_normxcorr, (
- template_array[seed_id], stream_array[seed_id],
- pad_array[seed_id])) for seed_id in seed_ids]
- else:
- results = [pool.apply_async(fftw_normxcorr, (
- template_array[seed_id], stream_array[seed_id],
- pad_array[seed_id], False)) for seed_id in seed_ids]
- pool.close()
- results = [p.get() for p in results]
- xcorrs = [p[0] for p in results]
- tr_chans = np.array([p[1] for p in results])
- pool.join()
- cccsums = np.sum(xcorrs, axis=0)
- no_chans = np.sum(tr_chans.astype(np.int), axis=0)
- for seed_id, tr_chan in zip(seed_ids, tr_chans):
- for chan, state in zip(chans, tr_chan):
- if state:
- chan.append((seed_id.split('.')[1],
- seed_id.split('.')[-1].split('_')[0]))
- else:
- xcorrs, tr_chans = fftw_multi_normxcorr(
- template_array=template_array, stream_array=stream_array,
- pad_array=pad_array, seed_ids=seed_ids)
- cccsums = np.sum(xcorrs, axis=0)
- no_chans = np.sum(np.array(tr_chans).astype(np.int), axis=0)
- for seed_id, tr_chan in zip(seed_ids, tr_chans):
- for chan, state in zip(chans, tr_chan):
- if state:
- chan.append((seed_id.split('.')[1],
- seed_id.split('.')[-1].split('_')[0]))
- return cccsums, no_chans, chans
-
-
-def time_multi_normxcorr(templates, stream, pads):
+@register_array_xcorr('time_domain')
+def time_multi_normxcorr(templates, stream, pads, threaded=False, *args,
+ **kwargs):
"""
Compute cross-correlations in the time-domain using C routine.
@@ -212,6 +369,8 @@ def time_multi_normxcorr(templates, stream, pads):
:type stream: np.ndarray
:param pads: List of ints of pad lengths in the same order as templates
:type pads: list
+ :param threaded: Whether to use the threaded routine or not
+ :type threaded: bool
:return: np.ndarray of cross-correlations
:return: np.ndarray channels used
@@ -220,7 +379,7 @@ def time_multi_normxcorr(templates, stream, pads):
utilslib = _load_cdll('libutils')
- utilslib.multi_normxcorr_time.argtypes = [
+ argtypes = [
np.ctypeslib.ndpointer(dtype=np.float32, ndim=1,
flags=native_str('C_CONTIGUOUS')),
ctypes.c_int, ctypes.c_int,
@@ -229,8 +388,14 @@ def time_multi_normxcorr(templates, stream, pads):
ctypes.c_int,
np.ctypeslib.ndpointer(dtype=np.float32, ndim=1,
flags=native_str('C_CONTIGUOUS'))]
- utilslib.multi_normxcorr_time.restype = ctypes.c_int
-
+ restype = ctypes.c_int
+ if threaded:
+ func = utilslib.multi_normxcorr_time_threaded
+ argtypes.append(ctypes.c_int)
+ else:
+ func = utilslib.multi_normxcorr_time
+ func.argtypes = argtypes
+ func.restype = restype
# Need to de-mean everything
templates_means = templates.mean(axis=1).astype(np.float32)[:, np.newaxis]
stream_mean = stream.mean().astype(np.float32)
@@ -242,9 +407,11 @@ def time_multi_normxcorr(templates, stream, pads):
ccc = np.ascontiguousarray(
np.empty((image_len - template_len + 1) * n_templates), np.float32)
t_array = np.ascontiguousarray(templates.flatten(), np.float32)
- utilslib.multi_normxcorr_time(
- t_array, template_len, n_templates,
- np.ascontiguousarray(stream, np.float32), image_len, ccc)
+ time_args = [t_array, template_len, n_templates,
+ np.ascontiguousarray(stream, np.float32), image_len, ccc]
+ if threaded:
+ time_args.append(kwargs.get('cores', cpu_count()))
+ func(*time_args)
ccc[np.isnan(ccc)] = 0.0
ccc = ccc.reshape((n_templates, image_len - template_len + 1))
for i in range(len(pads)):
@@ -254,7 +421,8 @@ def time_multi_normxcorr(templates, stream, pads):
return ccc, used_chans
-def fftw_normxcorr(templates, stream, pads, threaded=True):
+@register_array_xcorr('fftw', is_default=True)
+def fftw_normxcorr(templates, stream, pads, threaded=False, *args, **kwargs):
"""
Normalised cross-correlation using the fftw library.
@@ -291,54 +459,144 @@ def fftw_normxcorr(templates, stream, pads, threaded=True):
np.ctypeslib.ndpointer(dtype=np.float32, ndim=1,
flags=native_str('C_CONTIGUOUS')),
ctypes.c_int,
- np.ctypeslib.ndpointer(dtype=np.float32, ndim=1,
+ np.ctypeslib.ndpointer(dtype=np.float32,
flags=native_str('C_CONTIGUOUS')),
- ctypes.c_int]
+ ctypes.c_int,
+ np.ctypeslib.ndpointer(dtype=np.intc,
+ flags=native_str('C_CONTIGUOUS')),
+ np.ctypeslib.ndpointer(dtype=np.intc,
+ flags=native_str('C_CONTIGUOUS'))]
restype = ctypes.c_int
+
if threaded:
func = utilslib.normxcorr_fftw_threaded
else:
func = utilslib.normxcorr_fftw
+
func.argtypes = argtypes
func.restype = restype
+
# Generate a template mask
used_chans = ~np.isnan(templates).any(axis=1)
template_length = templates.shape[1]
stream_length = len(stream)
n_templates = templates.shape[0]
fftshape = next_fast_len(template_length + stream_length - 1)
+
# # Normalize and flip the templates
norm = ((templates - templates.mean(axis=-1, keepdims=True)) / (
templates.std(axis=-1, keepdims=True) * template_length))
norm = np.nan_to_num(norm)
- ccc = np.empty((n_templates, stream_length - template_length + 1),
- np.float32).flatten(order='C')
+ ccc = np.zeros((n_templates, stream_length - template_length + 1),
+ np.float32)
+ used_chans_np = np.ascontiguousarray(used_chans, dtype=np.intc)
+ pads_np = np.ascontiguousarray(pads, dtype=np.intc)
+
ret = func(
np.ascontiguousarray(norm.flatten(order='C'), np.float32),
template_length, n_templates,
np.ascontiguousarray(stream, np.float32), stream_length,
- np.ascontiguousarray(ccc, np.float32), fftshape)
+ np.ascontiguousarray(ccc, np.float32), fftshape,
+ used_chans_np, pads_np)
if ret != 0:
print(ret)
raise MemoryError()
- ccc = ccc.reshape((n_templates, stream_length - template_length + 1))
- for i in range(n_templates):
- if not used_chans[i]:
- ccc[i] = np.zeros(stream_length - template_length + 1)
- ccc[np.isnan(ccc)] = 0.0
- if np.any(np.abs(ccc) > 1.01):
- print('Normalisation error in C code')
- print(ccc.max())
- print(ccc.min())
- raise MemoryError()
- ccc[ccc > 1.0] = 1.0
- ccc[ccc < -1.0] = -1.0
- for i in range(len(pads)):
- ccc[i] = np.append(ccc[i], np.zeros(pads[i]))[pads[i]:]
+
return ccc, used_chans
+# The time-domain routine can be sped up considerably on machines with many
+# cores using the openMP threaded functions.
+
+@time_multi_normxcorr.register('concurrent')
+def _time_threaded_normxcorr(templates, stream, *args, **kwargs):
+ """
+ Use the threaded time-domain routine for concurrency
+
+ :type templates: list
+ :param templates:
+ A list of templates, where each one should be an obspy.Stream object
+ containing multiple traces of seismic data and the relevant header
+ information.
+ :type stream: obspy.core.stream.Stream
+ :param stream:
+ A single Stream object to be correlated with the templates.
+
+ :returns:
+ New list of :class:`numpy.ndarray` objects. These will contain
+ the correlation sums for each template for this day of data.
+ :rtype: list
+ :returns:
+ list of ints as number of channels used for each cross-correlation.
+ :rtype: list
+ :returns:
+ list of list of tuples of station, channel for all cross-correlations.
+ :rtype: list
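+
+ This strategy can be selected with, for example,
+ get_stream_xcorr('time_domain', concurrency='concurrent').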
+ """
+ no_chans = np.zeros(len(templates))
+ chans = [[] for _ in range(len(templates))]
+ array_dict_tuple = _get_array_dicts(templates, stream)
+ stream_dict, template_dict, pad_dict, seed_ids = array_dict_tuple
+ cccsums = np.zeros([len(templates),
+ len(stream[0]) - len(templates[0][0]) + 1])
+ for seed_id in seed_ids:
+ tr_cc, tr_chans = time_multi_normxcorr(
+ template_dict[seed_id], stream_dict[seed_id], pad_dict[seed_id],
+ True)
+ cccsums = np.sum([cccsums, tr_cc], axis=0)
+ no_chans += tr_chans.astype(np.int)
+ for chan, state in zip(chans, tr_chans):
+ if state:
+ chan.append((seed_id.split('.')[1],
+ seed_id.split('.')[-1].split('_')[0]))
+ return cccsums, no_chans, chans
+
+
+# TODO: This should be turned back on when openMP loop is merged
+# @fftw_normxcorr.register('stream_xcorr')
+@fftw_normxcorr.register('concurrent')
+def _fftw_stream_xcorr(templates, stream, *args, **kwargs):
+ """
+ Apply fftw normxcorr routine concurrently.
+
+ :type templates: list
+ :param templates:
+ A list of templates, where each one should be an obspy.Stream object
+ containing multiple traces of seismic data and the relevant header
+ information.
+ :type stream: obspy.core.stream.Stream
+ :param stream:
+ A single Stream object to be correlated with the templates.
+
+ :returns:
+ New list of :class:`numpy.ndarray` objects. These will contain
+ the correlation sums for each template for this day of data.
+ :rtype: list
+ :returns:
+ list of ints as number of channels used for each cross-correlation.
+ :rtype: list
+ :returns:
+ list of list of tuples of station, channel for all cross-correlations.
+ :rtype: list
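+
+ Fetched by, for example,
+ get_stream_xcorr('fftw', concurrency='concurrent').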
+ """
+
+ chans = [[] for _i in range(len(templates))]
+ array_dict_tuple = _get_array_dicts(templates, stream)
+ stream_dict, template_dict, pad_dict, seed_ids = array_dict_tuple
+ assert seed_ids, 'No common channels between templates and stream'
+ cccsums, tr_chans = fftw_multi_normxcorr(
+ template_array=template_dict, stream_array=stream_dict,
+ pad_array=pad_dict, seed_ids=seed_ids)
+ no_chans = np.sum(np.array(tr_chans).astype(np.int), axis=0)
+ for seed_id, tr_chan in zip(seed_ids, tr_chans):
+ for chan, state in zip(chans, tr_chan):
+ if state:
+ chan.append((seed_id.split('.')[1],
+ seed_id.split('.')[-1].split('_')[0]))
+ return cccsums, no_chans, chans
+
+
def fftw_multi_normxcorr(template_array, stream_array, pad_array, seed_ids):
"""
Use a C loop rather than a Python loop - in some cases this will be fast.
@@ -358,15 +616,19 @@ def fftw_multi_normxcorr(template_array, stream_array, pad_array, seed_ids):
utilslib = _load_cdll('libutils')
utilslib.multi_normxcorr_fftw.argtypes = [
- np.ctypeslib.ndpointer(dtype=np.float32, ndim=1,
+ np.ctypeslib.ndpointer(dtype=np.float32,
flags=native_str('C_CONTIGUOUS')),
ctypes.c_int, ctypes.c_int, ctypes.c_int,
- np.ctypeslib.ndpointer(dtype=np.float32, ndim=1,
+ np.ctypeslib.ndpointer(dtype=np.float32,
flags=native_str('C_CONTIGUOUS')),
ctypes.c_int,
- np.ctypeslib.ndpointer(dtype=np.float32, ndim=1,
+ np.ctypeslib.ndpointer(dtype=np.float32,
+ flags=native_str('C_CONTIGUOUS')),
+ ctypes.c_int,
+ np.ctypeslib.ndpointer(dtype=np.intc,
flags=native_str('C_CONTIGUOUS')),
- ctypes.c_int]
+ np.ctypeslib.ndpointer(dtype=np.intc,
+ flags=native_str('C_CONTIGUOUS'))]
utilslib.multi_normxcorr_fftw.restype = ctypes.c_int
'''
Arguments are:
@@ -378,7 +640,10 @@ def fftw_multi_normxcorr(template_array, stream_array, pad_array, seed_ids):
image length
cross-correlations (stacked as per image)
fft-length
+ used channels (stacked as per templates)
+ pad array (stacked as per templates)
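+
+ All float arrays must be C-contiguous float32; the used-channel and
+ pad arrays are C-contiguous intc.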
'''
+ # pre-processing
used_chans = []
template_len = template_array[seed_ids[0]].shape[1]
for seed_id in seed_ids:
@@ -393,38 +658,104 @@ def fftw_multi_normxcorr(template_array, stream_array, pad_array, seed_ids):
n_templates = template_array[seed_ids[0]].shape[0]
image_len = stream_array[seed_ids[0]].shape[0]
fft_len = next_fast_len(template_len + image_len - 1)
- template_array = np.array(list(template_array.values())).flatten(order='C')
- stream_array = np.array(list(stream_array.values())).flatten(order='C')
- cccs = np.empty((n_channels, n_templates, image_len - template_len + 1),
- np.float32).flatten(order='C')
+ template_array = np.ascontiguousarray([template_array[x]
+ for x in seed_ids],
+ dtype=np.float32)
+ stream_array = np.ascontiguousarray([stream_array[x] for x in seed_ids],
+ dtype=np.float32)
+ cccs = np.zeros((n_templates, image_len - template_len + 1),
+ np.float32)
+ used_chans_np = np.ascontiguousarray(used_chans, dtype=np.intc)
+ pad_array_np = np.ascontiguousarray([pad_array[seed_id]
+ for seed_id in seed_ids],
+ dtype=np.intc)
+
+ # call C function
ret = utilslib.multi_normxcorr_fftw(
template_array, n_templates, template_len, n_channels, stream_array,
- image_len, cccs, fft_len)
- if ret != 0:
+ image_len, cccs, fft_len, used_chans_np, pad_array_np)
+ if ret < 0:
raise MemoryError()
- cccs = cccs.reshape((n_channels, n_templates,
- image_len - template_len + 1))
- for j in range(n_channels):
- for i in range(n_templates):
- if not used_chans[j][i]:
- cccs[j][i] = np.zeros(image_len - template_len + 1)
- cccs[np.isnan(cccs)] = 0.0
- if np.any(np.abs(cccs) > 1.01):
- print('Normalisation error in C code')
+ elif ret > 0:
+ print('Error in C code (possible normalisation error)')
print(cccs.max())
print(cccs.min())
raise MemoryError()
- cccs[cccs > 1.0] = 1.0
- cccs[cccs < -1.0] = -1.0
- for j, seed_id in enumerate(seed_ids):
- for i in range(len(pad_array[seed_id])):
- cccs[j][i] = np.append(
- cccs[j][i],
- np.zeros(pad_array[seed_id][i]))[pad_array[seed_id][i]:]
- cccs = cccs.reshape(n_channels, n_templates, image_len - template_len + 1)
+
return cccs, used_chans
+# ------------------------------- stream_xcorr functions
+
+
+def get_stream_xcorr(name_or_func=None, concurrency=None):
+ """
+ Return a function for performing normalized cross-correlation of lists
+ of template streams with a data stream.
+
+ :param name_or_func:
+ Either a name of a registered function or a callable that implements
+ the standard array_normxcorr signature.
+ :param concurrency:
+ Optional concurrency strategy, options are below.
+
+ :return: A callable with the interface of stream_normxcorr
+
+ :Concurrency options:
+ - multithread - use a threadpool for concurrency;
+ - multiprocess - use a process pool for concurrency;
+ - concurrent - use a customized concurrency strategy for the
+ function; if none is defined, threading will be used.
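+
+ .. rubric:: Example
+
+ A minimal sketch; fetches the default backend with a thread-pool strategy:
+
+ >>> func = get_stream_xcorr(concurrency='multithread')
+ >>> callable(func)
+ True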
+ """
+ func = _get_registerd_func(name_or_func)
+
+ concur = concurrency or 'stream_xcorr'
+ if not hasattr(func, concur):
+ msg = '%s does not support concurrency %s' % (func.__name__, concur)
+ raise ValueError(msg)
+ return getattr(func, concur)
+
+
+# --------------------------- stream prep functions
+
+
+def _get_array_dicts(templates, stream, copy_streams=True):
+ """ prepare templates and stream, return dicts """
+ # Do some reshaping
+ # init empty structures for data storage
+ template_dict = {}
+ stream_dict = {}
+ pad_dict = {}
+ t_starts = []
+
+ stream.sort(['network', 'station', 'location', 'channel'])
+ for template in templates:
+ template.sort(['network', 'station', 'location', 'channel'])
+ t_starts.append(min([tr.stats.starttime for tr in template]))
+ # get seed ids, make sure these are collected on sorted streams
+ seed_ids = [tr.id + '_' + str(i) for i, tr in enumerate(templates[0])]
+ # pull common channels out of streams and templates and put in dicts
+ for i, seed_id in enumerate(seed_ids):
+ temps_with_seed = [template[i].data for template in templates]
+ t_ar = np.array(temps_with_seed).astype(np.float32)
+ template_dict.update({seed_id: t_ar})
+ stream_dict.update(
+ {seed_id: stream.select(
+ id=seed_id.split('_')[0])[0].data.astype(np.float32)})
+ pad_list = [
+ int(round(template[i].stats.sampling_rate *
+ (template[i].stats.starttime - t_starts[j])))
+ for j, template in enumerate(templates)]
+ pad_dict.update({seed_id: pad_list})
+
+ return stream_dict, template_dict, pad_dict, seed_ids
+
+
+# a dict of built in xcorr functions, used to distinguish from user-defined
+XCORR_FUNCS_ORIGINAL = copy.copy(XCOR_FUNCS)
+
+
if __name__ == '__main__':
import doctest
+
doctest.testmod()
diff --git a/eqcorrscan/utils/debug_log.py b/eqcorrscan/utils/debug_log.py
new file mode 100644
index 000000000..cd2b9fb05
--- /dev/null
+++ b/eqcorrscan/utils/debug_log.py
@@ -0,0 +1,40 @@
+"""
+EQcorrscan's simple logging.
+
+:copyright:
+ EQcorrscan developers.
+
+:license:
+ GNU Lesser General Public License, Version 3
+ (https://www.gnu.org/copyleft/lesser.html)
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+
+def debug_print(string, debug_level, print_level):
+ """
+ Print the string if the print_level exceeds the debug_level.
+
+ :type string: str
+ :param string: String to print
+ :type debug_level: int
+ :param debug_level: Level assigned to this message
+ :type print_level: int
+ :param print_level: Chosen verbosity; the string is printed when
+     print_level exceeds debug_level
+
+ .. rubric:: Example
+
+ >>> debug_print("Albert", 2, 0)
+ >>> debug_print("Norman", 0, 2)
+ Norman
+ """
+ if print_level > debug_level:
+ print(string)
+
+
+if __name__ == '__main__':
+ import doctest
+ doctest.testmod()
diff --git a/eqcorrscan/utils/findpeaks.py b/eqcorrscan/utils/findpeaks.py
index 6189a4feb..b69df601e 100644
--- a/eqcorrscan/utils/findpeaks.py
+++ b/eqcorrscan/utils/findpeaks.py
@@ -204,89 +204,6 @@ def decluster(peaks, trig_int, return_ind=False, debug=0):
return peaks_out
-def find_peaks_dep(arr, thresh, trig_int, debug=0, starttime=False,
- samp_rate=1.0):
- """
- Determine peaks in an array of data above a certain threshold: depreciated.
-
- Depreciated peak-finding routine, very slow, but accurate. If all else \
- fails this one should work.
-
- :type arr: numpy.ndarray
- :param arr: 1-D numpy array is required
- :type thresh: float
- :param thresh: The threshold below which will be considered noise and \
- peaks will not be found in.
- :type trig_int: int
- :param trig_int: The minimum difference in samples between triggers,\
- if multiple peaks within this window this code will find the highest.
- :type starttime: obspy.core.utcdatetime.UTCDateTime
- :param starttime: Starttime for plotting, only used if debug > 2.
- :type samp_rate: float
- :param samp_rate: Sampling rate in Hz, only used for plotting if debug > 2.
-
- :return: peaks: Lists of tuples of peak values and locations.
- :rtype: list
-
- >>> import numpy as np
- >>> arr = np.random.randn(100)
- >>> threshold = 10
- >>> arr[40] = 20
- >>> arr[60] = 100
- >>> find_peaks_dep(arr, threshold, 3)
- [(20.0, 40), (100.0, 60)]
- """
- if not starttime:
- starttime = UTCDateTime(0)
- # Perform some checks
- if trig_int < 3:
- msg = 'Trigger interval must be greater than 2 samples to find maxima'
- raise IOError(msg)
- # from joblib import Parallel, delayed
- # Will find peaks in the absolute then transfer these to the true values
- sig = np.abs(arr) - thresh
- true_peaks = []
- for i in range(int(trig_int), int(len(sig) - trig_int), int(trig_int)):
- window = sig[i - trig_int: i + trig_int]
- # Define a moving window containing data from +/- the trigger iterval
- peaks = []
- locs = []
- for j in range(1, len(window) - 1):
- # Find all turning points within the window
- if window[j] > 0.0 and window[j] > window[j + 1] and\
- window[j] > window[j - 1]:
- peaks.append(window[j])
- locs.append(i - trig_int + j)
- # Find maximum peak in window
- if peaks:
- true_peaks.append((np.max(np.array(peaks)),
- locs[np.argmax(np.array(peaks))]))
- # Get unique values
- peaks = sorted(list(set(true_peaks)), key=lambda loc: loc[1])
- # Find highest peak in peaks within trig_int of each other
- for i in range(1, len(peaks) - 1):
- if peaks[i + 1][1] - peaks[i][1] < trig_int:
- if peaks[i][0] < peaks[i + 1][0]:
- peaks[i] = peaks[i + 1]
- else:
- peaks[i + 1] = peaks[i]
- elif peaks[i][1] - peaks[i - 1][1] < trig_int:
- if peaks[i][0] < peaks[i - 1][0]:
- peaks[i] = peaks[i - 1]
- else:
- peaks[i - 1] = peaks[i]
- peaks = sorted(list(set(peaks)), key=lambda loc: loc[1])
- if debug >= 3:
- from eqcorrscan.utils import plotting
- _fname = ''.join(['peaks_',
- starttime.datetime.strftime('%Y-%m-%d'),
- '.pdf'])
- plotting.peaks_plot(arr, starttime, samp_rate, True, peaks,
- _fname)
- peaks = [(peak[0] + thresh, peak[1]) for peak in peaks]
- return peaks
-
-
def coin_trig(peaks, stachans, samp_rate, moveout, min_trig, trig_int):
"""
Find network coincidence triggers within peaks of detection statistics.
diff --git a/eqcorrscan/utils/mag_calc.py b/eqcorrscan/utils/mag_calc.py
index f4357d11a..25d289799 100644
--- a/eqcorrscan/utils/mag_calc.py
+++ b/eqcorrscan/utils/mag_calc.py
@@ -990,11 +990,11 @@ def svd_moments(u, s, v, stachans, event_list, n_svs=2):
>>> stream_files = glob.glob(os.path.join(testing_path, '*'))
>>> stream_list = [read(stream_file) for stream_file in stream_files]
>>> event_list = []
+ >>> remove_list = [('WHAT2', 'SH1'), ('WV04', 'SHZ'), ('GCSZ', 'EHZ')]
>>> for i, stream in enumerate(stream_list):
... st_list = []
... for tr in stream:
- ... if (tr.stats.station, tr.stats.channel) not in
- ... [('WHAT2', 'SH1'), ('WV04', 'SHZ'), ('GCSZ', 'EHZ')]:
+ ... if (tr.stats.station, tr.stats.channel) not in remove_list:
... stream.remove(tr)
... continue
... tr.detrend('simple')
@@ -1106,7 +1106,7 @@ def svd_moments(u, s, v, stachans, event_list, n_svs=2):
K_width = len(K[0])
# Add an extra row to K, so average moment = 1
K.append(np.ones(K_width) * (1. / K_width))
- print("\nCreated Kernel matrix: ")
+ print("Created Kernel matrix: ")
del row
print('\n'.join([''.join([str(round(float(item), 3)).ljust(6)
for item in row]) for row in K]))
diff --git a/eqcorrscan/utils/pre_processing.py b/eqcorrscan/utils/pre_processing.py
index 4d7dc3bca..b9fd2542b 100644
--- a/eqcorrscan/utils/pre_processing.py
+++ b/eqcorrscan/utils/pre_processing.py
@@ -171,6 +171,8 @@ def shortproc(st, lowcut, highcut, filt_order, samp_rate, debug=0,
if parallel:
if not num_cores:
num_cores = cpu_count()
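+ # no point in spawning more workers than there are traces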
+ if num_cores > len(st):
+ num_cores = len(st)
pool = Pool(processes=num_cores)
results = [pool.apply_async(process, (tr,), {
'lowcut': lowcut, 'highcut': highcut, 'filt_order': filt_order,
@@ -183,10 +185,10 @@ def shortproc(st, lowcut, highcut, filt_order, samp_rate, debug=0,
st = Stream(stream_list)
else:
for tr in st:
- process(tr=tr, lowcut=lowcut, highcut=highcut,
- filt_order=filt_order, samp_rate=samp_rate, debug=debug,
- starttime=False, clip=False,
- seisan_chan_names=seisan_chan_names)
+ process(
+ tr=tr, lowcut=lowcut, highcut=highcut, filt_order=filt_order,
+ samp_rate=samp_rate, debug=debug, starttime=False, clip=False,
+ seisan_chan_names=seisan_chan_names)
if tracein:
st.merge()
return st[0]
@@ -244,7 +246,7 @@ def dayproc(st, lowcut, highcut, filt_order, samp_rate, starttime, debug=0,
warn any-time it has to pad data - if you see strange artifacts in your
detections, check whether the data have gaps.
- .. rubric:: Example, bandpass
+ .. rubric:: Example
>>> import obspy
>>> if int(obspy.__version__.split('.')[0]) >= 1:
@@ -253,60 +255,30 @@ def dayproc(st, lowcut, highcut, filt_order, samp_rate, starttime, debug=0,
... from obspy.fdsn import Client
>>> from obspy import UTCDateTime
>>> from eqcorrscan.utils.pre_processing import dayproc
- >>> client = Client('GEONET')
+ >>> client = Client('NCEDC')
>>> t1 = UTCDateTime(2012, 3, 26)
>>> t2 = t1 + 86400
- >>> bulk_info = [('NZ', 'FOZ', '10', 'HHE', t1, t2),
- ... ('NZ', 'FOZ', '10', 'HHE', t1, t2)]
+ >>> bulk_info = [('BP', 'JCNB', '40', 'SP1', t1, t2)]
>>> st = client.get_waveforms_bulk(bulk_info)
+ >>> st_keep = st.copy() # Copy the stream for later examples
+ >>> # Example of bandpass filtering
>>> st = dayproc(st=st, lowcut=2, highcut=9, filt_order=3, samp_rate=20,
... starttime=t1, debug=0, parallel=True, num_cores=2)
>>> print(st[0])
- NZ.FOZ.10.HHE | 2012-03-25T23:59:59.998393Z - 2012-03-26T23:59:59.\
-948393Z | 20.0 Hz, 1728000 samples
-
-
- .. rubric:: Example, low-pass
-
- >>> import obspy
- >>> if int(obspy.__version__.split('.')[0]) >= 1:
- ... from obspy.clients.fdsn import Client
- ... else:
- ... from obspy.fdsn import Client
- >>> from obspy import UTCDateTime
- >>> from eqcorrscan.utils.pre_processing import dayproc
- >>> client = Client('GEONET')
- >>> t1 = UTCDateTime(2012, 3, 26)
- >>> t2 = t1 + 86400
- >>> bulk_info = [('NZ', 'FOZ', '10', 'HHE', t1, t2),
- ... ('NZ', 'FOZ', '10', 'HHE', t1, t2)]
- >>> st = client.get_waveforms_bulk(bulk_info)
+ BP.JCNB.40.SP1 | 2012-03-26T00:00:00.000000Z - 2012-03-26T23:59:59.\
+950000Z | 20.0 Hz, 1728000 samples
+ >>> # Example of lowpass filtering
>>> st = dayproc(st=st, lowcut=None, highcut=9, filt_order=3, samp_rate=20,
... starttime=t1, debug=0, parallel=True, num_cores=2)
>>> print(st[0])
- NZ.FOZ.10.HHE | 2012-03-25T23:59:59.998393Z - 2012-03-26T23:59:59.\
-948393Z | 20.0 Hz, 1728000 samples
-
- .. rubric:: Example, high-pass
-
- >>> import obspy
- >>> if int(obspy.__version__.split('.')[0]) >= 1:
- ... from obspy.clients.fdsn import Client
- ... else:
- ... from obspy.fdsn import Client
- >>> from obspy import UTCDateTime
- >>> from eqcorrscan.utils.pre_processing import dayproc
- >>> client = Client('GEONET')
- >>> t1 = UTCDateTime(2012, 3, 26)
- >>> t2 = t1 + 86400
- >>> bulk_info = [('NZ', 'FOZ', '10', 'HHE', t1, t2),
- ... ('NZ', 'FOZ', '10', 'HHE', t1, t2)]
- >>> st = client.get_waveforms_bulk(bulk_info)
+ BP.JCNB.40.SP1 | 2012-03-26T00:00:00.000000Z - 2012-03-26T23:59:59.\
+950000Z | 20.0 Hz, 1728000 samples
+ >>> # Example of highpass filtering
>>> st = dayproc(st=st, lowcut=2, highcut=None, filt_order=3, samp_rate=20,
... starttime=t1, debug=0, parallel=True, num_cores=2)
>>> print(st[0])
- NZ.FOZ.10.HHE | 2012-03-25T23:59:59.998393Z - 2012-03-26T23:59:59.\
-948393Z | 20.0 Hz, 1728000 samples
+ BP.JCNB.40.SP1 | 2012-03-26T00:00:00.000000Z - 2012-03-26T23:59:59.\
+950000Z | 20.0 Hz, 1728000 samples
"""
# Add sanity check for filter
if isinstance(st, Trace):
@@ -339,6 +311,8 @@ def dayproc(st, lowcut, highcut, filt_order, samp_rate, starttime, debug=0,
if parallel:
if not num_cores:
num_cores = cpu_count()
+ if num_cores > len(st):
+ num_cores = len(st)
pool = Pool(processes=num_cores)
results = [pool.apply_async(process, (tr,), {
'lowcut': lowcut, 'highcut': highcut, 'filt_order': filt_order,
@@ -352,11 +326,11 @@ def dayproc(st, lowcut, highcut, filt_order, samp_rate, starttime, debug=0,
st = Stream(stream_list)
else:
for tr in st:
- process(tr=tr, lowcut=lowcut, highcut=highcut,
- filt_order=filt_order, samp_rate=samp_rate, debug=debug,
- starttime=starttime, clip=True, length=86400,
- ignore_length=ignore_length,
- seisan_chan_names=seisan_chan_names)
+ process(
+ tr=tr, lowcut=lowcut, highcut=highcut, filt_order=filt_order,
+ samp_rate=samp_rate, debug=debug, starttime=starttime,
+ clip=True, length=86400, ignore_length=ignore_length,
+ seisan_chan_names=seisan_chan_names)
if tracein:
st.merge()
return st[0]
diff --git a/eqcorrscan/utils/timer.py b/eqcorrscan/utils/timer.py
index 9f50a51d0..3d2de137a 100644
--- a/eqcorrscan/utils/timer.py
+++ b/eqcorrscan/utils/timer.py
@@ -11,7 +11,20 @@
class Timer(object):
- """Simple wrapper for timing objects, used for debug."""
+ """
+ Simple wrapper for timing objects, used for debug.
+
+ .. rubric:: Example
+
+ >>> from time import sleep
+ >>> with Timer() as t:
+ ... sleep(0.1)
+ >>> print("%.1f" % t.secs) # doctest:+SKIP
+ 0.1
+ >>> with Timer(verbose=True) as t:
+ ... sleep(0.1) # doctest:+ELLIPSIS
+ elapsed time: ...
+ """
def __init__(self, verbose=False):
"""Create timer object."""
@@ -28,7 +41,7 @@ def __exit__(self, *args):
self.secs = self.end - self.start
self.msecs = self.secs * 1000 # millisecs
if self.verbose:
- print('elapsed time: %f ms' % self.msecs)
+ print('elapsed time: %.0f ms' % self.msecs)
if __name__ == "__main__":
diff --git a/pytest.ini b/pytest.ini
index 8c3f8d4a7..63110dbd1 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,3 +1,3 @@
[pytest]
norecursedirs=eqcorrscan/doc .* eqcorrscan/scripts build pyasdf eqcorrscan/lib eqcorrscan/utils/src
-addopts = --cov=eqcorrscan --cov-config .coveragerc --ignore=setup.py --doctest-modules -n 2
+addopts = --cov=eqcorrscan --cov-config .coveragerc --ignore=setup.py --doctest-modules -p no:warnings
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 000000000..140df563f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,13 @@
+numpy>=1.12
+matplotlib>=1.3.0
+scipy>=0.18
+LatLon
+bottleneck
+obspy>=1.0.3
+h5py
+pytest>=2.0.0
+pytest-cov
+pytest-pep8
+pytest-xdist
+pytest-rerunfailures
+codecov
\ No newline at end of file
diff --git a/setup.py b/setup.py
index c771ab0c2..34e12b1a2 100644
--- a/setup.py
+++ b/setup.py
@@ -12,6 +12,7 @@
import os
import sys
+import shutil
import glob
import eqcorrscan
@@ -55,7 +56,7 @@ def get_package_data():
if get_build_platform() in ('win32', 'win-amd64'):
package_data['eqcorrscan.lib'] = [
'libfftw3-3.dll', 'libfftw3f-3.dll', 'libfftw3l-3.dll']
-
+
return package_data
@@ -132,7 +133,8 @@ def get_extensions():
'include_dirs': get_include_dirs(),
'library_dirs': get_library_dirs()}
- sources = [os.path.join('eqcorrscan', 'lib', 'multi_corr.c')]
+ sources = [os.path.join('eqcorrscan', 'lib', 'multi_corr.c'),
+ os.path.join('eqcorrscan', 'lib', 'time_corr.c')]
exp_symbols = export_symbols("eqcorrscan/lib/libutils.def")
if get_build_platform() not in ('win32', 'win-amd64'):
@@ -238,7 +240,8 @@ def setup_package():
if not READ_THE_DOCS:
install_requires = ['matplotlib>=1.3.0', 'scipy>=0.18', 'LatLon',
- 'bottleneck', 'obspy>=1.0.3', 'numpy>=1.12']
+ 'bottleneck', 'obspy>=1.0.3', 'numpy>=1.12',
+ 'h5py']
else:
install_requires = ['matplotlib>=1.3.0', 'LatLon', 'obspy>=1.0.3',
'mock']
@@ -257,7 +260,7 @@ def setup_package():
'Development Status :: 4 - Beta',
'Intended Audience :: Science/Research',
'Topic :: Scientific/Engineering',
- 'License :: OSI Approved :: GNU Library or Lesser General Public ' +
+ 'License :: OSI Approved :: GNU Library or Lesser General Public '
'License (LGPL)',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3.4',
@@ -268,8 +271,8 @@ def setup_package():
'scripts': scriptfiles,
'install_requires': install_requires,
'setup_requires': ['pytest-runner'],
- 'tests_require': ['pytest', 'pytest-cov', 'pytest-pep8',
- 'pytest-xdist'],
+ 'tests_require': ['pytest>=2.0.0', 'pytest-cov', 'pytest-pep8',
+ 'pytest-xdist', 'pytest-rerunfailures'],
'cmdclass': {'build_ext': CustomBuildExt}
}
@@ -290,6 +293,8 @@ def setup_package():
setup_args['ext_modules'] = get_extensions()
setup_args['package_data'] = get_package_data()
setup_args['package_dir'] = get_package_dir()
+ if os.path.isdir("build"):
+ shutil.rmtree("build")
setup(**setup_args)
if __name__ == '__main__':