diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
new file mode 100644
index 0000000..8b48ddf
--- /dev/null
+++ b/.github/workflows/black.yml
@@ -0,0 +1,11 @@
+name: Lint
+
+on: [pull_request]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+      - uses: psf/black@stable
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 4e1ef42..e54ad87 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -11,11 +11,13 @@ jobs:
   deploy:
 
     runs-on: ubuntu-latest
-
+    name: upload release to PyPI
+    permissions:
+      id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v5
       with:
         python-version: '3.x'
     - name: Install dependencies
@@ -23,9 +25,8 @@ jobs:
         python -m pip install --upgrade pip
         pip install setuptools wheel twine
     - name: Build and publish
-      env:
-        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
-        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
       run: |
         python setup.py sdist bdist_wheel
-        twine upload dist/*
+    - name: Publish package distributions to PyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
+
diff --git a/.github/workflows/run-codecov.yml b/.github/workflows/run-codecov.yml
new file mode 100644
index 0000000..a41a1fd
--- /dev/null
+++ b/.github/workflows/run-codecov.yml
@@ -0,0 +1,21 @@
+name: Run codecov
+
+on:
+  pull_request:
+    branches: [master]
+
+jobs:
+  pytest:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        python-version: [3.9]
+        os: [ubuntu-latest]
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Upload coverage to Codecov
+      uses: codecov/codecov-action@v2
+      with:
+        file: ./coverage.xml
+        name: py-${{ matrix.python-version }}-${{ matrix.os }}
diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml
new file mode 100644
index 0000000..a4fb5fe
--- /dev/null
+++ b/.github/workflows/run-pytest.yml
@@ -0,0 +1,30 @@
+name: Run pytests
+
+on:
+  pull_request:
+    branches: [master, dev]
+
+jobs:
+  pytest:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.11"]
+        os: [ubuntu-latest]
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install test dependencies
+      run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi
+
+    - name: Install package
+      run: python -m pip install .
+
+    - name: Run pytest tests
+      run: pytest tests -x -vv --cov=./ --cov-report=xml
diff --git a/.gitignore b/.gitignore
index 7523f45..55ae73f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -71,9 +71,12 @@ open_pipelines/
 *RESERVE*
 
 # Build-related stuff
+build
 dist/
 refget.egg-info/
 
 #ipynm checkpoints
 *ipynb_checkpoints*
 *.egg-info*
+
+
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 9e5fd48..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-language: python
-python:
-  - "3.5"
-  - "3.6"
-  - "3.8"
-os:
-  - linux
-before_install:
-  - pip install git+https://github.com/databio/henge.git@master
-install:
-  - pip install .
-  - pip install -r requirements/requirements-test.txt
-script: pytest tests -x -vv
-branches:
-  only:
-    - dev
-    - master
diff --git a/LICENSE.txt b/LICENSE.txt
index 1b78bad..ec94eeb 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,4 +1,4 @@
-Copyright 2017 Nathan Sheffield
+Copyright 2024 Nathan Sheffield
 
 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 
diff --git a/README.md b/README.md
index b855130..7d9660d 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,8 @@
 
 [![Build Status](https://travis-ci.com/refgenie/refget.svg?branch=master)](https://travis-ci.com/refgenie/refget)
 
-The refget package provides a Python interface to both remote and local use of the refget protocol. 
+The refget package provides a Python interface to both remote and local use of the refget protocol.
 
-Documentation is hosted at [refget.databio.org](http://refget.databio.org).
+This package provides clients and functions for both refget sequences and refget sequence collections (seqcol).
+
+Documentation is hosted at [refgenie.org/refget](https://refgenie.org/refget/).
diff --git a/array_overlap.py b/array_overlap.py
new file mode 100644
index 0000000..4012c1a
--- /dev/null
+++ b/array_overlap.py
@@ -0,0 +1,55 @@
+# from itertools import compress
+# from collections import Counter
+# overlap = sum((Counter(A) & Counter(B)).values())
+# A=example1
+# B=example4
+# overlap
+# all = list(A) + list(set(B) - set(list(A)))
+# A_compressed = list(compress(A, B))
+# B_compressed = list(compress(B, A))
+# A_filtered = list(filter(lambda x: x in B, A))
+# B_filtered = list(filter(lambda x: x in A, B))
+# len(A_compressed) == len(B_compressed)
+# C_compressed = list(compress(C, A))
+
+# from itertools import filter
+
+
+def array_overlap(A, B):
+    A_filtered = list(filter(lambda x: x in B, A))
+    B_filtered = list(filter(lambda x: x in A, B))
+    A_count = len(A_filtered)
+    B_count = len(B_filtered)
+    overlap = min(len(A_filtered), len(B_filtered))
+    if A_count + B_count < 1:
+        # order match requires at least 2 matching elements
+        order = None
+    elif not (A_count == B_count):
+        # duplicated matches means order match is undefined
+        order = None
+    else:
+        order = A_filtered == B_filtered
+    return {"overlap": overlap, "order-match": order}
+
+
+example1 = ["A", "B", "C", "D"]
+example2 = ["A", "B", "C"]
+example3 = ["A", "B", "C", "B"]
+example4 = ["B", "B", "B", "B"]
+example5 = ["X", "A", "B", "Y", "C", "D", "E"]
+example6 = ["A", "B", "C", "D", "B"]
+example7 = ["A", "B", "C", "D", "A"]
+example8 = ["A", "B", "C", "D", "B", "A"]
+
+
+compatoverlap(example1, example2)
+compatoverlap(example1, example3)
+compatoverlap(example2, example3)
+compatoverlap(example1, example4)
+compatoverlap(example1, example5)
+compatoverlap(example3, example5)
+compatoverlap(example5, example3)
+compatoverlap(example3, example6)
+compatoverlap(example6, example7)
+compatoverlap(example7, example8)
+compatoverlap(example8, example8)
diff --git a/demo_fasta/compare-0vs1.json b/demo_fasta/compare-0vs1.json
new file mode 100644
index 0000000..e0d9ae6
--- /dev/null
+++ b/demo_fasta/compare-0vs1.json
@@ -0,0 +1,38 @@
+{
+  "array_elements":{
+    "a":{
+      "lengths":3,
+      "names":3,
+      "sequences":3,
+      "sorted_name_length_pairs":3
+    },
+    "a_and_b":{
+      "lengths":2,
+      "names":2,
+      "sequences":0,
+      "sorted_name_length_pairs":2
+    },
+    "a_and_b_same_order":{
+      "lengths":true,
+      "names":true,
+      "sequences":null,
+      "sorted_name_length_pairs":true
+    },
+    "b":{
+      "lengths":2,
+      "names":2,
+      "sequences":2,
+      "sorted_name_length_pairs":2
+    }
+  },
+  "attributes":{
+    "a_and_b":[
+      "lengths",
+      "names",
+      "sequences",
+      "sorted_name_length_pairs"
+    ],
+    "a_only":[],
+    "b_only":[]
+  }
+}
\ No newline at end of file
diff --git a/demo_fasta/compare-1vs1.json b/demo_fasta/compare-1vs1.json
new file mode 100644
index 0000000..6c01b0c
--- /dev/null
+++ b/demo_fasta/compare-1vs1.json
@@ -0,0 +1,38 @@
+{
+  "array_elements":{
+    "a":{
+      "lengths":2,
+      "names":2,
+      "sequences":2,
+      "sorted_name_length_pairs":2
+    },
+    "a_and_b":{
+      "lengths":2,
+      "names":2,
+      "sequences":2,
+      "sorted_name_length_pairs":2
+    },
+    "a_and_b_same_order":{
+      "lengths":true,
+      "names":true,
+      "sequences":true,
+      "sorted_name_length_pairs":true
+    },
+    "b":{
+      "lengths":2,
+      "names":2,
+      "sequences":2,
+      "sorted_name_length_pairs":2
+    }
+  },
+  "attributes":{
+    "a_and_b":[
+      "lengths",
+      "names",
+      "sequences",
+      "sorted_name_length_pairs"
+    ],
+    "a_only":[],
+    "b_only":[]
+  }
+}
\ No newline at end of file
diff --git a/demo_fasta/compare-5vs6.json b/demo_fasta/compare-5vs6.json
new file mode 100644
index 0000000..347f738
--- /dev/null
+++ b/demo_fasta/compare-5vs6.json
@@ -0,0 +1,38 @@
+{
+  "array_elements":{
+    "a":{
+      "lengths":3,
+      "names":3,
+      "sequences":3,
+      "sorted_name_length_pairs":3
+    },
+    "a_and_b":{
+      "lengths":3,
+      "names":3,
+      "sequences":3,
+      "sorted_name_length_pairs":3
+    },
+    "a_and_b_same_order":{
+      "lengths":false,
+      "names":false,
+      "sequences":false,
+      "sorted_name_length_pairs":true
+    },
+    "b":{
+      "lengths":3,
+      "names":3,
+      "sequences":3,
+      "sorted_name_length_pairs":3
+    }
+  },
+  "attributes":{
+    "a_and_b":[
+      "lengths",
+      "names",
+      "sequences",
+      "sorted_name_length_pairs"
+    ],
+    "a_only":[],
+    "b_only":[]
+  }
+}
\ No newline at end of file
diff --git a/demo_fasta/demo0.fa b/demo_fasta/demo0.fa
new file mode 100644
index 0000000..dd08063
--- /dev/null
+++ b/demo_fasta/demo0.fa
@@ -0,0 +1,6 @@
+>chrX
+TTGGGGAA
+>chr1
+GGAA
+>chr2
+GCGC
diff --git a/demo_fasta/demo0.fa.fai b/demo_fasta/demo0.fa.fai
new file mode 100644
index 0000000..dc37dcd
--- /dev/null
+++ b/demo_fasta/demo0.fa.fai
@@ -0,0 +1,3 @@
+chrX	8	6	8	9
+chr1	4	21	4	5
+chr2	4	32	4	5
diff --git a/demo_fasta/demo1.fa.fai b/demo_fasta/demo1.fa.fai
new file mode 100644
index 0000000..c55c6c1
--- /dev/null
+++ b/demo_fasta/demo1.fa.fai
@@ -0,0 +1,2 @@
+chr1	4	6	4	5
+chr2	4	17	4	5
diff --git a/demo_fasta/demo1.fa.gz b/demo_fasta/demo1.fa.gz
new file mode 100644
index 0000000..c2174d6
Binary files /dev/null and b/demo_fasta/demo1.fa.gz differ
diff --git a/demo_fasta/demo5.fa.gz b/demo_fasta/demo5.fa.gz
new file mode 100644
index 0000000..be98af2
Binary files /dev/null and b/demo_fasta/demo5.fa.gz differ
diff --git a/demo_fasta/demo6.fa b/demo_fasta/demo6.fa
new file mode 100644
index 0000000..dd08063
--- /dev/null
+++ b/demo_fasta/demo6.fa
@@ -0,0 +1,6 @@
+>chrX
+TTGGGGAA
+>chr1
+GGAA
+>chr2
+GCGC
diff --git a/demo_fasta/demo6.fa.fai b/demo_fasta/demo6.fa.fai
new file mode 100644
index 0000000..dc37dcd
--- /dev/null
+++ b/demo_fasta/demo6.fa.fai
@@ -0,0 +1,3 @@
+chrX	8	6	8	9
+chr1	4	21	4	5
+chr2	4	32	4	5
diff --git a/deprecated.py b/deprecated.py
new file mode 100644
index 0000000..6038709
--- /dev/null
+++ b/deprecated.py
@@ -0,0 +1,106 @@
+from itertools import compress
+from functools import reduce
+
+import seqcol
+
+sc1 = {"names": ["chr1", "chr2"], "sequences": ["ay89fw", "we4f9x"]}
+
+sc2 = {"names": ["1", "2", "3"], "sequences": ["ay89fw", "we4f9x", "3n20xk2"]}
+
+sc3 = {"names": ["2", "3", "1"], "sequences": ["we4f9x", "3n20xk2", "ay89fw"]}
+
+sc4 = {"names": ["chr1", "3", "2"], "sequences": ["zyt2fw", "9snm23k", "fsd2x3"]}
+
+sc5 = {
+    "names": ["chr1", "3", "2"],
+    "sequences": ["zyt2fw", "9snm23k", "fsd2x3"],
+    "topologies": ["circular", "linear", "linear"],
+}
+
+
+def compat(A, B):
+    ainb = [x in B for x in A]
+    bina = [x in A for x in B]
+    if any(ainb):
+        order = list(compress(B, bina)) == list(compress(A, ainb))
+    else:
+        order = False
+
+    any(ainb)
+
+    flag = 0
+    flag += 2 if all(ainb) else 0
+    flag += 4 if all(bina) else 0
+    flag += 8 if order else 0
+    flag += 1 if any(ainb) else 0
+
+    return flag
+
+
+# New compat function that adds true/false
+def compat(A, B):
+    ainb = [x in B for x in A]
+    bina = [x in A for x in B]
+    if any(ainb):
+        order = list(compress(B, bina)) == list(compress(A, ainb))
+    else:
+        order = False
+
+    any(ainb)
+
+    flag = 0
+    flag += 2 if all(ainb) else 0
+    flag += 4 if all(bina) else 0
+    flag += 8 if order else 0
+    flag += 1 if any(ainb) else 0
+    result = {
+        "any-elements-shared": any(ainb),
+        "a-subset-of-b": all(ainb),
+        "b-subset-of-a": all(bina),
+        "order-match": order,
+        "flag": flag,
+    }
+    return result
+
+
+# For each array:
+# - any-elements-shared (1) (0001)
+# - all-a-in-b        (2) (0010)
+# - all-b-in-a        (4) (0100)
+# - order-match       (8) (1000)
+
+# no match: 0000 = 0
+# one or more shared elements: 0001 = 1
+# all-a-in-b (a is a subset of b, in different order) = 0011 = 3
+# all-b-in-a (b is a subset of a, in different order) = 0101 = 5
+# same content, different order: 0111 = 7
+# a is a subset of b, in same order: 1011 = 11
+# b is a subset of a, in same order: 1101 = 13
+# identity: 1111 = 15
+
+
+compat(sc1["sequences"], sc2["sequences"])
+compat(sc1["names"], sc2["names"])
+
+compat(sc3["sequences"], sc2["sequences"])
+compat(sc3["names"], sc2["names"])
+
+compat(sc1["sequences"], sc3["sequences"])
+
+
+def compat_all(A, B):
+    all_keys = list(A.keys()) + list(set(B.keys()) - set(list(A.keys())))
+    result = {}
+    for k in all_keys:
+        if k not in A or k not in B:
+            result[k] = {"flag": -1}
+        else:
+            result[k] = compat(A[k], B[k])
+    # result["all"] = reduce(lambda x,y: x['flag']&y['flag'], list(result.values()))
+    return result
+
+
+compat_all(sc1, sc2)
+compat_all(sc3, sc2)
+compat_all(sc1, sc3)
+compat_all(sc1, sc5)
diff --git a/docs/README.md b/docs/README.md
deleted file mode 100644
index 0d94560..0000000
--- a/docs/README.md
+++ /dev/null
@@ -1,46 +0,0 @@
-# Refget
-
-## Introduction
-
-The refget package provides a Python interface to both remote and local use of the refget protocol. This package serves 4 functions:
-
-1. A lightweight python interface to a remote refget API.
-
-2. Local caching of retrieved results, improving performance for applications that require repeated lookups.
-
-3. A fully functioning local implementation of the refget protocol for local analysis backed by either memory, SQLite, or MongoDB.
-
-4. Convenience functions for computing refget checksums from python and handling FASTA files directly.
-
-## Install
-
-```console
-pip install refget
-```
-
-## Basic use
-
-### Retrieve results from a RESTful API
-
-```python
-import refget
-
-rgc = refget.RefGetClient("https://refget.herokuapp.com/sequence/")
-rgc.refget("6681ac2f62509cfc220d78751b8dc524", start=0, end=10)
-
-```
-
-### Compute digests locally
-
-```python
-refget.trunc512_digest("TCGA")
-```
-
-### Insert and retrieve sequences with a local database
-
-```python
-checksum = rgc.load_seq("GGAA")
-rgc.refget(checksum)
-```
-
-For more details, see the [tutorial](tutorial.md).
diff --git a/docs/autodoc_build/refget.md b/docs/autodoc_build/refget.md
deleted file mode 100644
index 6f38ada..0000000
--- a/docs/autodoc_build/refget.md
+++ /dev/null
@@ -1,110 +0,0 @@
-<script>
-document.addEventListener('DOMContentLoaded', (event) => {
-  document.querySelectorAll('h3 code').forEach((block) => {
-    hljs.highlightBlock(block);
-  });
-});
-</script>
-
-<style>
-h3 .content { 
-    padding-left: 22px;
-    text-indent: -15px;
- }
-h3 .hljs .content {
-    padding-left: 20px;
-    margin-left: 0px;
-    text-indent: -15px;
-    martin-bottom: 0px;
-}
-h4 .content, table .content, p .content, li .content { margin-left: 30px; }
-h4 .content { 
-    font-style: italic;
-    font-size: 1em;
-    margin-bottom: 0px;
-}
-
-</style>
-
-
-# Package `refget` Documentation
-
-## <a name="RefGetClient"></a> Class `RefGetClient`
-```python
-def __init__(self, api_url_base=None, database={}, schemas=None, henges=None, checksum_function=<function md5 at 0x7f9a95a74710>, suppress_connect=True)
-```
-
-A user interface to insert and retrieve decomposable recursive unique identifiers (DRUIDs).
-#### Parameters:
-
-- `database` (`dict`):  Dict-like lookup database with sequences and hashes.
-- `schemas` (`dict`):  One or more jsonschema schemas describing thedata types stored by this Henge
-- `checksum_function` (`function(str) -> str`):  Default function to handle the digest of theserialized items stored in this henge.
-
-
-
-
-```python
-def get_service_info(self)
-```
-
-
-
-```python
-def item_types(self)
-```
-
-A list of item types handled by this Henge instance
-
-
-
-```python
-def load_fasta(self, fa_file, lengths_only=False)
-```
-
-Calculates checksums and loads each sequence in a fasta file into the database.
-
-
-
-```python
-def load_seq(self, seq)
-```
-
-
-
-```python
-def load_sequence_dict(self, seqset)
-```
-
-Convert a 'seqset', which is a dict with names as sequence names and values as sequences, into the 'asdlist' required for henge insert.
-
-
-
-```python
-def meta(self, digest)
-```
-
-
-
-```python
-def refget(self, digest, start=None, end=None)
-```
-
-
-
-```python
-def refget_remote(self, digest, start=None, end=None)
-```
-
-
-
-```python
-def service_info(self)
-```
-
-
-
-
-
-
-*Version Information: `refget` v0.0.1, generated by `lucidoc` v0.4.2*
\ No newline at end of file
diff --git a/docs/changelog.md b/docs/changelog.md
deleted file mode 100644
index a067237..0000000
--- a/docs/changelog.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# Changelog
-
-This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. 
-
-## [0.1.0] - 2021-06-17
-
-First public version, backed by henge version 0.1.1.
-
-## [0.0.1] - 2020-06-25
-
-Beta version for testing
diff --git a/docs_jupyter/build/.gitignore b/docs_jupyter/build/.gitignore
deleted file mode 100644
index d6b7ef3..0000000
--- a/docs_jupyter/build/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*
-!.gitignore
diff --git a/docs_jupyter/tutorial.ipynb b/docs_jupyter/tutorial.ipynb
deleted file mode 100644
index acff965..0000000
--- a/docs_jupyter/tutorial.ipynb
+++ /dev/null
@@ -1,690 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Refget python package tutorial"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Record some versions:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'3.8.5'"
-      ]
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from platform import python_version \n",
-    "python_version()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'0.1.0'"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "import refget\n",
-    "refget.__version__"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Computing digests locally"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from refget import trunc512_digest"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Show some results for sequence digests:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36'"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "trunc512_digest('ACGT')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce'"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "trunc512_digest('TCGA')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36cf70'"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "trunc512_digest('ACGT', 26)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Connecting to a remote API\n",
-    "\n",
-    "The refget package provides a simple python wrapper around a remote hosted refget RESTful API. Provide the base url when construction a RefGetClient object and you can retrieve sequences from the remote server."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "rgc = refget.RefGetClient(\"https://refget.herokuapp.com/sequence/\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'CCACACCACA'"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=10)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACC'"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=50)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "You can also hit the `{digest}/metadata` and `service_info` API endpoints described in the refget API specification:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'metadata': {'aliases': [{'alias': 'ga4gh:SQ.lZyxiD_ByprhOUzrR1o1bq0ezO_1gkrn',\n",
-       "    'naming_authority': 'ga4gh'},\n",
-       "   {'alias': 'I', 'naming_authority': 'unknown'}],\n",
-       "  'length': 230218,\n",
-       "  'md5': '6681ac2f62509cfc220d78751b8dc524',\n",
-       "  'trunc512': '959cb1883fc1ca9ae1394ceb475a356ead1ecceff5824ae7'}}"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "rgc.meta(\"6681ac2f62509cfc220d78751b8dc524\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'service': {'algorithms': ['ga4gh', 'md5', 'trunc512'],\n",
-       "  'circular_supported': True,\n",
-       "  'subsequence_limit': None,\n",
-       "  'supported_api_versions': ['1.0.0']}}"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "rgc.service_info"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "When requesting a sequence that is not found, the service responds appropriately:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Not Found'"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "rgc.refget(trunc512_digest('TCGATCGA'))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Use a local database for caching\n",
-    "\n",
-    "By default, any full-sequences retrieved from an API are cached locally in memory (in a Python Dict). This data will not persist past a current session, but is useful if you have an application that requires repeated requests. here, we re-request the sequence requested above. It is much faster this time because it uses a local cache:\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'CCACACCACA'"
-      ]
-     },
-     "execution_count": 13,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=10)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We can also add new sequences into the database:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Not Found'"
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "rgc.refget(refget.md5('TCGATCGA'))  # This sequence is not found in our database yet"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "checksum = rgc.load_seq(\"TCGATCGA\")  # So, let's add it into database"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'TCGATCGA'"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "rgc.refget(checksum)  # This time it returns"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Keep in mind that sequences added in this way are added to your *local* database, not to the remote API, so when we restart, they will be gone:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "del rgc"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Not Found'"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "rgc = refget.RefGetClient(\"https://refget.herokuapp.com/sequence/\")\n",
-    "rgc.refget(refget.md5('TCGA'))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Making data persist\n",
-    "\n",
-    "If you want to retain your local cache, you can use a Dict that is backed by some persistent storage, such as a database on disk or another running process. There are many ways to do this, for example, you can use an sqlite database, a Redis database, or a MongoDB database. Here we'll show you how to use the `sqlitedict` package to back your local database.\n",
-    "\n",
-    "To start, you need to create a dict object and pass that to the RefGetClient constructor."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import refget\n",
-    "from sqlitedict import SqliteDict\n",
-    "mydict = SqliteDict('./my_db.sqlite', autocommit=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "rgc = refget.RefGetClient(\"https://refget.herokuapp.com/sequence/\", mydict)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Now when we retrieve a sequence it will be added to the local sqlite database automatically."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACC'"
-      ]
-     },
-     "execution_count": 21,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=50)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Look, we can see that this object has been added to our sqlite database:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'CACACCACACCCACACACCCACACACCACACCACACACCACACCACACC'"
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "mydict[\"6681ac2f62509cfc220d78751b8dc524\"][1:50]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "So now if we kill this object and start it up again *without the API connection*, but with the mydict local backend, we can still retrieve it:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "del rgc"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "rgc = refget.RefGetClient(database=mydict)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACC'"
-      ]
-     },
-     "execution_count": 25,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=50)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Loading a fasta file\n",
-    "\n",
-    "The package also comes with a helper function for computing checksums for an entire fasta file."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fa_file = \"../demo_fasta/demo.fa\"\n",
-    "content = rgc.load_fasta(fa_file)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[{'name': 'chr1',\n",
-       "  'length': 4,\n",
-       "  'sequence_digest': 'f1f8f4bf413b16ad135722aa4591043e'},\n",
-       " {'name': 'chr2',\n",
-       "  'length': 4,\n",
-       "  'sequence_digest': '45d0ff9f1a9504cf2039f89c1ffb4c32'}]"
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "content"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "metadata": {
-    "scrolled": false
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'ACGT'"
-      ]
-     },
-     "execution_count": 28,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "rgc.refget(content[0]['sequence_digest'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "No remote URL connected\n"
-     ]
-    }
-   ],
-   "source": [
-    "rgc.refget(\"blah\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "rgc.api_url_base = \"https://refget.herokuapp.com/sequence/\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Not Found'"
-      ]
-     },
-     "execution_count": 31,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "rgc.refget(\"blah\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# You can show the complete contents of the database like this:\n",
-    "# rgc.show()\n"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/interactive_tests.py b/interactive_tests.py
new file mode 100644
index 0000000..e16cdaa
--- /dev/null
+++ b/interactive_tests.py
@@ -0,0 +1,168 @@
+import seqcol
+from seqcol import SeqColHenge
+
+
+scc = SeqColHenge(database={}, schemas=["seqcol/schemas/SeqColArraySet.yaml"])
+scc
+
+fa_file = "demo_fasta/demo0.fa"
+fa_object = seqcol.parse_fasta(fa_file)
+
+skip_seq = False
+aslist = []
+names = []
+lengths = []
+lengthsi = []
+sequences = []
+for k in fa_object.keys():
+    seq = str(fa_object[k])
+    names.append(k)
+    lengths.append(str(len(seq)))
+    lengthsi.append(len(seq))
+    sequences.append(seq)
+
+array_set = {"names": names, "lengths": lengths, "sequences": sequences}
+
+collection_checksum = scc.insert(array_set, "SeqColArraySet")
+collection_checksum
+
+scc.retrieve(collection_checksum)
+scc.retrieve(collection_checksum, reclimit=1)
+
+# scc.retrieve("5c4b07f08319d3d0815f5ee25c45916a01f9d1519f0112e8")
+
+scc.retrieve(collection_checksum, reclimit=1)
+scc.retrieve(collection_checksum, reclimit=2)
+scc.retrieve(collection_checksum)
+scc.supports_inherent_attrs
+
+# Now a test of inherent attributes
+import seqcol
+
+scci = seqcol.SeqColHenge(database={}, schemas=["seqcol/schemas/SeqColArraySetInherent.yaml"])
+scci
+scci.schemas
+
+
+fa_file = "demo_fasta/demo0.fa"
+fa_object = seqcol.parse_fasta(fa_file)
+
+array_set_i = {"names": names, "lengths": lengthsi, "sequences": sequences, "author": "urkel"}
+array_set_i2 = {"names": names, "lengths": lengthsi, "sequences": sequences, "author": "nathan"}
+
+
+di = scci.insert(array_set_i, "SeqColArraySet")
+di = scci.insert(array_set_i2, "SeqColArraySet")
+di
+# scc.retrieve(di)
+scci.retrieve(di)
+fasta_path = "demo_fasta"
+fasta1 = "demo2.fa"
+fasta2 = "demo3.fa"
+fasta5 = "demo5.fa.gz"
+fasta6 = "demo6.fa"
+
+import os
+
+d = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta1))
+d2 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta2))
+d2 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta2))
+d5 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta5))
+d6 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta6))
+scci.retrieve(d["digest"])
+
+scci.retrieve(d5["digest"])
+
+fa_object = seqcol.parse_fasta(os.path.join(fasta_path, fasta1))
+SCAS = seqcol.fasta_to_csc(fa_object)
+digest = scci.insert(SCAS, "SeqColArraySet", reclimit=1)
+
+
+d["digest"]
+d2["digest"]
+
+scci.compare_digests(d["digest"], d2["digest"])
+scci.compare(d["SCAS"], d2["SCAS"])
+
+
+json.dumps(scci.compare(d5["SCAS"], d6["SCAS"]))
+print(
+    json.dumps(
+        scci.compare(d5["SCAS"], d6["SCAS"]),
+        separators=(",", ":"),
+        ensure_ascii=False,
+        allow_nan=False,
+        sort_keys=True,
+        indent=2,
+    )
+)
+
+build_sorted_name_length_pairs(array_set_i)
+
+# reorder
+
+array_set_reordered = {}
+for k, v in array_set.items():
+    print(k, v)
+    array_set_reordered[k] = list(reversed(v))
+
+array_set
+array_set_reordered
+
+build_sorted_name_length_pairs(array_set)
+build_sorted_name_length_pairs(array_set_reordered)
+
+
+import henge
+
+
+from henge import md5
+
+names = []
+lengths = []
+seq_digests = []
+for k in fa_object.keys():
+    seq = str(fa_object[k])
+    names.append(k)
+    lengths.append(str(len(seq)))
+    seq_digests.append(scc.checksum_function(seq))
+
+
+array_set = {"names": names, "lengths": lengths, "sequences": seq_digests}
+array_set
+collection_checksum = scc.insert(array_set, "SeqColArraySet", reclimit=1)
+
+scc.database = {}
+scc.retrieve("d229d5c16b3a1b3788f01aa439f01e682ba84bc9935ad08a", reclimit=1)
+scc.retrieve("d229d5c16b3a1b3788f01aa439f01e682ba84bc9935ad08a", reclimit=2)
+scc.database[collection_checksum]
+scc.checksum_function
+
+
+scc.database["d229d5c16b3a1b3788f01aa439f01e682ba84bc9935ad08a"]
+
+scc.database["a99fe2e92875099c84f73b20ef8e7dd2f2d12f063383bed0"]
+scc.database["ca82b053295b6f49923d0b2cedb83de49c6be59688c3dfd9"]
+
+
+import os
+
+os.getcwd()
+
+
+## standalone functions
+
+import seqcol
+
+fa_file = "demo_fasta/demo0.fa"
+fa_object = seqcol.parse_fasta(fa_file)
+
+# get a canonical seqcol object
+csc = seqcol.fasta_to_csc(fa_object)
+csc
+import json
+
+print(json.dumps(csc, indent=2))
+
+
+seqcol.seqcol_digest(csc)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..9348158
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,4 @@
+[tool.black]
+line-length = 99
+target-version = ['py38', 'py311']
+include = '\.pyi?$'
diff --git a/refget/__init__.py b/refget/__init__.py
index 3841f46..897c9fa 100644
--- a/refget/__init__.py
+++ b/refget/__init__.py
@@ -1,11 +1,12 @@
 # Project configuration, particularly for logging.
 
 import logging
+
 from ._version import __version__
+from .const import *
 from .hash_functions import *
 from .refget import RefGetClient
 from .refget import parse_fasta
-
-
-__classes__ = ["RefGetClient"]
-__all__ = __classes__ + []
+from .seqcol import *
+from .utilities import *
+from .seqcol_client import *
diff --git a/refget/_version.py b/refget/_version.py
index 3dc1f76..d3ec452 100644
--- a/refget/_version.py
+++ b/refget/_version.py
@@ -1 +1 @@
-__version__ = "0.1.0"
+__version__ = "0.2.0"
diff --git a/refget/const.py b/refget/const.py
new file mode 100644
index 0000000..987b644
--- /dev/null
+++ b/refget/const.py
@@ -0,0 +1,51 @@
+import os
+
+
+def _schema_path(name):
+    return os.path.join(SCHEMA_FILEPATH, name)
+
+
+CONTENT_ALL_A_IN_B = 2**0
+CONTENT_ALL_B_IN_A = 2**1
+LENGTHS_ALL_A_IN_B = 2**2
+LENGTHS_ALL_B_IN_A = 2**3
+NAMES_ALL_A_IN_B = 2**4
+NAMES_ALL_B_IN_A = 2**5
+TOPO_ALL_A_IN_B = 2**6
+TOPO_ALL_B_IN_A = 2**7
+CONTENT_ANY_SHARED = 2**8
+LENGTHS_ANY_SHARED = 2**9
+NAMES_ANY_SHARED = 2**10
+CONTENT_A_ORDER = 2**11
+CONTENT_B_ORDER = 2**12
+
+FLAGS = {
+    CONTENT_ALL_A_IN_B: "CONTENT_ALL_A_IN_B",
+    CONTENT_ALL_B_IN_A: "CONTENT_ALL_B_IN_A",
+    LENGTHS_ALL_A_IN_B: "LENGTHS_ALL_A_IN_B",
+    LENGTHS_ALL_B_IN_A: "LENGTHS_ALL_B_IN_A",
+    NAMES_ALL_A_IN_B: "NAMES_ALL_A_IN_B",
+    NAMES_ALL_B_IN_A: "NAMES_ALL_B_IN_A",
+    TOPO_ALL_A_IN_B: "TOPO_ALL_A_IN_B",
+    TOPO_ALL_B_IN_A: "TOPO_ALL_B_IN_A",
+    CONTENT_ANY_SHARED: "CONTENT_ANY_SHARED",
+    LENGTHS_ANY_SHARED: "LENGTHS_ANY_SHARED",
+    NAMES_ANY_SHARED: "NAMES_ANY_SHARED",
+    CONTENT_A_ORDER: "CONTENT_A_ORDER",
+    CONTENT_B_ORDER: "CONTENT_B_ORDER",
+}
+
+KNOWN_TOPOS = ["linear", "circular"]
+NAME_KEY = "name"
+SEQ_KEY = "sequence"
+TOPO_KEY = "topology"
+LEN_KEY = "length"
+
+# internal schemas paths determination
+ASL_NAME = "AnnotatedSequenceList"
+SCHEMA_NAMES = [ASL_NAME + ".yaml"]
+SCHEMA_FILEPATH = os.path.join(os.path.dirname(__file__), "schemas")
+INTERNAL_SCHEMAS = [_schema_path(s) for s in SCHEMA_NAMES]
+
+# Alias dict to make typehinting clearer
+SeqCol = dict
diff --git a/refget/exceptions.py b/refget/exceptions.py
new file mode 100644
index 0000000..fd7bde0
--- /dev/null
+++ b/refget/exceptions.py
@@ -0,0 +1,10 @@
+class InvalidSeqColError(Exception):
+    """Object was not validated successfully according to schema."""
+
+    def __init__(self, message, errors):
+        super().__init__(message)
+        self.message = message
+        self.errors = errors
+
+    def __str__(self):
+        return f"InvalidSeqColError ({self.message}): {self.errors}"
diff --git a/refget/hash_functions.py b/refget/hash_functions.py
index 234d98a..34a29b3 100644
--- a/refget/hash_functions.py
+++ b/refget/hash_functions.py
@@ -6,32 +6,38 @@
 import hashlib
 import binascii
 
+
 def trunc512_digest(seq, offset=24):
-    digest = hashlib.sha512(seq.encode('utf-8')).digest()
+    digest = hashlib.sha512(seq.encode("utf-8")).digest()
     hex_digest = binascii.hexlify(digest[:offset])
-    return hex_digest.decode("utf-8") 
+    return hex_digest.decode("utf-8")
+
 
 def ga4gh_digest(seq, digest_size=24):
     # b64 encoding results in 4/3 size expansion of data and padded if
     # not multiple of 3, which doesn't make sense for this use
     assert digest_size % 3 == 0, "digest size must be multiple of 3"
-    digest = hashlib.sha512(seq.encode('utf-8')).digest()
+    digest = hashlib.sha512(seq.encode("utf-8")).digest()
     return _ga4gh_format(digest, digest_size)
 
+
 def _ga4gh_format(digest, digest_size=24):
     tdigest_b64us = base64.urlsafe_b64encode(digest[:digest_size])
     return "ga4gh:SQ.{}".format(tdigest_b64us.decode("utf-8"))
 
+
 def ga4gh_to_trunc512(vmc):
-    base64_strip = vmc.replace("ga4gh:SQ.","")
+    base64_strip = vmc.replace("ga4gh:SQ.", "")
     digest = base64.urlsafe_b64decode(base64_strip)
     hex_digest = binascii.hexlify(digest)
-    return hex_digest.decode("utf-8") 
+    return hex_digest.decode("utf-8")
+
 
 def trunc512_to_ga4gh(trunc512):
-    digest_length = len(trunc512)*2
+    digest_length = len(trunc512) * 2
     digest = binascii.unhexlify(trunc512)
     return _ga4gh_format(digest, digest_length)
 
+
 def md5(seq):
     return hashlib.md5(seq.encode()).hexdigest()
diff --git a/refget/refget.py b/refget/refget.py
index 08468d6..7bc351b 100644
--- a/refget/refget.py
+++ b/refget/refget.py
@@ -12,9 +12,7 @@
 
 _LOGGER = logging.getLogger(__name__)
 henge.ITEM_TYPE = "_item_type"
-SCHEMA_FILEPATH = os.path.join(
-        os.path.dirname(__file__),
-        "schemas")
+SCHEMA_FILEPATH = os.path.join(os.path.dirname(__file__), "schemas")
 
 sequence_schema = """description: "Schema for a single raw sequence"
 henge_class: sequence
@@ -23,9 +21,17 @@
 """
 
 
-
 class RefGetClient(henge.Henge):
-    def __init__(self, api_url_base=None, database={}, schemas=[], schemas_str=[], henges=None, checksum_function=henge.md5, suppress_connect=True):
+    def __init__(
+        self,
+        api_url_base=None,
+        database={},
+        schemas=[],
+        schemas_str=[],
+        henges=None,
+        checksum_function=henge.md5,
+        suppress_connect=True,
+    ):
         """
         A user interface to insert and retrieve decomposable recursive unique
         identifiers (DRUIDs).
@@ -35,29 +41,26 @@ def __init__(self, api_url_base=None, database={}, schemas=[], schemas_str=[], h
             data types stored by this Henge
         :param function(str) -> str checksum_function: Default function to handle the digest of the
             serialized items stored in this henge.
-        """      
+        """
         self.api_url_base = api_url_base
         self.local_cache = database
         self.info = None
         if not suppress_connect:
             self.info = self.get_service_info()
 
-
         # These are the item types that this henge can understand.
-        if len(schemas) ==0 and len(schemas_str) == 0:
+        if len(schemas) == 0 and len(schemas_str) == 0:
             schemas_str = [sequence_schema]
-        super(RefGetClient, self).__init__(database, schemas, schemas_str, henges=henges,
-                                          checksum_function=checksum_function)
+        super(RefGetClient, self).__init__(
+            database, schemas, schemas_str, henges=henges, checksum_function=checksum_function
+        )
 
     def refget_remote(self, digest, start=None, end=None):
         if not self.api_url_base:
             print("No remote URL connected")
             return
 
-        url = "{base}{digest}?accept=text/plain".format(
-            base=self.api_url_base,
-            digest=digest)
-
+        url = "{base}{digest}?accept=text/plain".format(base=self.api_url_base, digest=digest)
 
         if start is not None and end is not None:
             full_retrieved = False
@@ -66,11 +69,11 @@ def refget_remote(self, digest, start=None, end=None):
             full_retrieved = True
 
         r = requests.get(url)
-        result = r.content.decode('utf-8')
+        result = r.content.decode("utf-8")
 
         if full_retrieved:
             self.load_seq(result)
-                
+
         return result
 
     def refget(self, digest, start=None, end=None):
@@ -81,7 +84,7 @@ def refget(self, digest, start=None, end=None):
                 result = full_data[start:end]
             else:
                 result = full_data
-            
+
             return result
         except henge.NotFoundException:
             return self.refget_remote(digest, start, end)
@@ -96,22 +99,18 @@ def service_info(self):
         if not self.info:
             self.info = self.get_service_info()
         return self.info
-    
+
     def meta(self, digest):
-        url = "{base}{digest}/metadata".format(
-            base=self.api_url_base,
-            digest=digest)
-        
+        url = "{base}{digest}/metadata".format(base=self.api_url_base, digest=digest)
+
         r = requests.get(url)
-        return json.loads(r.content.decode('utf-8'))
-    
+        return json.loads(r.content.decode("utf-8"))
+
     def get_service_info(self):
-        url = "{base}service-info".format(
-            base=self.api_url_base)
-        
-        r = requests.get(url)
-        return json.loads(r.content.decode('utf-8'))     
+        url = "{base}service-info".format(base=self.api_url_base)
 
+        r = requests.get(url)
+        return json.loads(r.content.decode("utf-8"))
 
     def load_fasta(self, fa_file, lengths_only=False):
         """
@@ -123,9 +122,7 @@ def load_fasta(self, fa_file, lengths_only=False):
         for k in fa_object.keys():
             seq = str(fa_object[k])
             seq_digest = self.load_seq(seq)
-            asdlist.append({'name': k,
-                          'length': len(seq), 
-                          'sequence_digest': seq_digest})
+            asdlist.append({"name": k, "length": len(seq), "sequence_digest": seq_digest})
 
         _LOGGER.debug(asdlist)
         return asdlist
@@ -139,30 +136,29 @@ def load_sequence_dict(self, seqset):
         for k, v in seqset.items():
             if isinstance(v, str):
                 seq = v
-                v = {'sequence': seq}
-            if 'length' not in v.keys():
-                if 'sequence' not in v.keys():
-                    _LOGGER.warning(
-                        "Each sequence must have either length or a sequence.")
+                v = {"sequence": seq}
+            if "length" not in v.keys():
+                if "sequence" not in v.keys():
+                    _LOGGER.warning("Each sequence must have either length or a sequence.")
                 else:
-                    v['length'] = len(v['sequence'])
-            if 'sequence' in v.keys():
-                v['sequence_digest'] = self.load_seq(seq)
-                del v['sequence']
-            if 'name' not in v.keys():
-                v['name'] = k
-            if 'toplogy' not in v.keys():
-                v['toplogy'] = 'linear'
+                    v["length"] = len(v["sequence"])
+            if "sequence" in v.keys():
+                v["sequence_digest"] = self.load_seq(seq)
+                del v["sequence"]
+            if "name" not in v.keys():
+                v["name"] = k
+            if "toplogy" not in v.keys():
+                v["toplogy"] = "linear"
 
             seqset_new[k] = v
 
-        collection_checksum = self.insert(list(seqset_new.values()), 'ASDList')
+        collection_checksum = self.insert(list(seqset_new.values()), "ASDList")
         return collection_checksum, seqset_new
 
 
-
 # Static functions below (these don't require a database)
 
+
 def parse_fasta(fa_file):
     _LOGGER.debug("Hashing {}".format(fa_file))
     try:
@@ -176,4 +172,4 @@ def parse_fasta(fa_file):
         fa_file_unzipped = fa_file.replace(".gz", "")
         fa_object = pyfaidx.Fasta(fa_file_unzipped)
         os.system("gzip {}".format(fa_file_unzipped))
-    return fa_object
\ No newline at end of file
+    return fa_object
diff --git a/refget/schemas/AnnotatedSequenceList.yaml b/refget/schemas/AnnotatedSequenceList.yaml
new file mode 100644
index 0000000..320fdf9
--- /dev/null
+++ b/refget/schemas/AnnotatedSequenceList.yaml
@@ -0,0 +1,33 @@
+description: "Schema for List of ASDs"
+henge_class: "AnnotatedSequenceList"
+recursive: true
+type: array
+items: 
+  description: "Schema for an Annotated Sequence Digest; a digested Sequence plus metadata"
+  type: object
+  henge_class: ASD
+  properties:
+    name:
+      type: string
+    length:
+      type: "integer"
+    topology:
+      type: string
+      enum: ["circular", "linear"]
+      default: "linear"
+    sequence:
+      description: "Schema for a single raw sequence"
+      henge_class: sequence
+      type: object
+      properties:
+        sequence:
+          type: string
+          description: "Actual sequence content"
+      required:
+        - sequence
+  required:
+    - length
+    - name
+    - topology
+  recursive:
+    - sequence
diff --git a/refget/schemas/RawSeqCol.yaml b/refget/schemas/RawSeqCol.yaml
new file mode 100644
index 0000000..1881280
--- /dev/null
+++ b/refget/schemas/RawSeqCol.yaml
@@ -0,0 +1,19 @@
+description: "Schema for List of ASDs"
+type: array
+henge_class: "RawSeqCol"
+items: 
+  description: "Schema for an Annotated Sequence Digest; a digested Sequence plus metadata"
+  type: object
+  henge_class: ASD
+  properties:
+    name:
+      type: string
+    length:
+      type: "integer"
+    sequence:
+      description: "Actual sequence content for a single raw sequence"
+      henge_class: sequence
+      type: string
+  required:
+    - length
+    - name
diff --git a/refget/schemas/SeqColArraySet.yaml b/refget/schemas/SeqColArraySet.yaml
new file mode 100644
index 0000000..d37158f
--- /dev/null
+++ b/refget/schemas/SeqColArraySet.yaml
@@ -0,0 +1,27 @@
+description: "SeqColArraySet"
+type: object
+henge_class: "SeqColArraySet"
+properties:
+  topologies:
+    type: array
+    henge_class: "array"
+    items:
+      type: string
+      enum: ["circular", "linear"]
+      default: "linear"  
+  names:
+    type: array
+    henge_class: "array"
+    items:
+      type: string    
+  lengths:
+    type: array
+    henge_class: "array"
+    items:
+      type: string    
+  sequences:
+    type: array
+    henge_class: "seqarray"
+    items:
+      type: string
+      henge_class: sequence
diff --git a/refget/schemas/SeqColArraySetInherent.yaml b/refget/schemas/SeqColArraySetInherent.yaml
new file mode 100644
index 0000000..743cc28
--- /dev/null
+++ b/refget/schemas/SeqColArraySetInherent.yaml
@@ -0,0 +1,36 @@
+description: "SeqColArraySet"
+type: object
+henge_class: "SeqColArraySet"
+properties:
+  topologies:
+    type: array
+    henge_class: "strarray"
+    items:
+      type: string
+      enum: ["circular", "linear"]
+      default: "linear"  
+  names:
+    type: array
+    henge_class: "strarray"
+    items:
+      type: string    
+  lengths:
+    type: array
+    henge_class: "intarray"
+    items:
+      type: integer
+  sequences:
+    type: array
+    henge_class: "seqarray"
+    items:
+      type: string
+      henge_class: sequence
+  sorted_name_length_pairs:
+    type: array
+    henge_class: "strarray"
+    items:
+      type: string
+inherent:
+  - names
+  - lengths
+  - sequences
diff --git a/refget/schemas/TASeqCol.yaml b/refget/schemas/TASeqCol.yaml
new file mode 100644
index 0000000..19a4eb7
--- /dev/null
+++ b/refget/schemas/TASeqCol.yaml
@@ -0,0 +1,31 @@
+description: "TAseqcol"
+type: object
+henge_class: "TASeqCol"
+properties:
+  topology:
+    type: array
+    henge_class: "TopologyVec"
+    items:
+      type: string
+      enum: ["circular", "linear"]
+      default: "linear"  
+  rawseqcol:
+    description: "Schema for List of ASDs"
+    type: array
+    henge_class: "RawSeqCol"
+    items: 
+      description: "Schema for an Annotated Sequence Digest; a digested Sequence plus metadata"
+      type: object
+      henge_class: ASD
+      properties:
+        name:
+          type: string
+        length:
+          type: "integer"
+        sequence:
+          type: string
+          henge_class: sequence
+          description: "Actual sequence content"
+      required:
+        - length
+        - name
diff --git a/refget/schemas/seqcol.yaml b/refget/schemas/seqcol.yaml
new file mode 100644
index 0000000..42a722b
--- /dev/null
+++ b/refget/schemas/seqcol.yaml
@@ -0,0 +1,28 @@
+description: "A collection of biological sequences."
+type: object
+properties:
+  lengths:
+    type: array
+    collated: true
+    description: "Number of elements, such as nucleotides or amino acids, in each sequence."
+    items:
+      type: integer
+  names:
+    type: array
+    collated: true
+    description: "Human-readable identifiers of each sequence (chromosome names)."
+    items:
+      type: string
+  sequences:
+    type: array
+    collated: true
+    items:
+      type: string
+      description: "Digests of sequences computed using the GA4GH digest algorithm (sha512t24u)."
+required:
+  - lengths
+  - names
+inherent:
+  - lengths
+  - names
+  - sequences
\ No newline at end of file
diff --git a/refget/seqcol.py b/refget/seqcol.py
new file mode 100644
index 0000000..ba6ff56
--- /dev/null
+++ b/refget/seqcol.py
@@ -0,0 +1,199 @@
+import henge
+import logging
+import yacman
+
+from itertools import compress
+
+from .const import *
+from .utilities import *
+
+
+_LOGGER = logging.getLogger(__name__)
+henge.ITEM_TYPE = "_item_type"
+
+
+class SeqColConf(yacman.YAMLConfigManager):
+    """
+    Simple configuration manager object for SeqColHenge.
+    """
+
+    def __init__(
+        self,
+        entries={},
+        filepath=None,
+        yamldata=None,
+        writable=False,
+        wait_max=60,
+        skip_read_lock=False,
+    ):
+        filepath = yacman.select_config(
+            config_filepath=filepath, config_env_vars=["SEQCOLAPI_CONFIG"], config_name="seqcol"
+        )
+        super(SeqColConf, self).__init__(entries, filepath, yamldata, writable)
+
+
+class SeqColHenge(henge.Henge):
+    """
+    Extension of henge that accommodates collections of sequences.
+    """
+
+    def __init__(
+        self,
+        database={},
+        schemas=None,
+        henges=None,
+        checksum_function=sha512t24u_digest,
+    ):
+        """
+        A user interface to insert and retrieve decomposable recursive unique
+        identifiers (DRUIDs).
+
+        :param dict database: Dict-like lookup database with sequences
+            and hashes
+        :param dict schemas: One or more jsonschema schemas describing the
+            data types stored by this Henge
+        :param function(str) -> str checksum_function: Default function to
+            handle the digest of the
+            serialized items stored in this henge.
+        """
+        super(SeqColHenge, self).__init__(
+            database=database,
+            schemas=schemas or INTERNAL_SCHEMAS,
+            henges=henges,
+            checksum_function=checksum_function,
+        )
+        _LOGGER.info("Initializing SeqColHenge")
+
+    def load_fasta(self, fa_file, skip_seq=False, topology_default="linear"):
+        """
+        Load a sequence collection into the database
+
+        :param str fa_file: path to the FASTA file to parse and load
+        :param bool skip_seq: whether to disregard the actual sequences,
+            load just the names and lengths and topology
+        :param bool skip_seq: whether to disregard the actual sequences,
+            load just the names and lengths and topology
+        :param str topology_default: the default topology assigned to
+            every sequence
+        """
+        # TODO: any systematic way infer topology from a FASTA file?
+        if topology_default not in KNOWN_TOPOS:
+            raise ValueError(
+                f"Invalid topology ({topology_default}). " f"Choose from: {','.join(KNOWN_TOPOS)}"
+            )
+        fa_object = parse_fasta(fa_file)
+        aslist = []
+        for k in fa_object.keys():
+            seq = str(fa_object[k])
+            aslist.append(
+                {
+                    NAME_KEY: k,
+                    LEN_KEY: len(seq),
+                    TOPO_KEY: topology_default,
+                    SEQ_KEY: {"" if skip_seq else SEQ_KEY: seq},
+                }
+            )
+        collection_checksum = self.insert(aslist, ASL_NAME)
+        _LOGGER.debug(f"Loaded {ASL_NAME}: {aslist}")
+        return collection_checksum, aslist
+
+    def load_fasta2(self, fa_file, skip_seq=False, topology_default="linear"):
+        """
+        Load a sequence collection into the database
+
+        :param str fa_file: path to the FASTA file to parse and load
+        :param bool skip_seq: whether to disregard the actual sequences,
+            load just the names and lengths and topology
+        :param bool skip_seq: whether to disregard the actual sequences,
+            load just the names and lengths and topology
+        :param str topology_default: the default topology assigned to
+            every sequence
+        """
+        # TODO: any systematic way infer topology from a FASTA file?
+        _LOGGER.info("Loading fasta file...")
+        fa_object = parse_fasta(fa_file)
+        aslist = []
+        for k in fa_object.keys():
+            seq = str(fa_object[k])
+            _LOGGER.info("Loading key: {k} / Length: {l}...".format(k=k, l=len(seq)))
+            aslist.append(
+                {
+                    NAME_KEY: k,
+                    LEN_KEY: len(seq),
+                    TOPO_KEY: topology_default,
+                    SEQ_KEY: "" if skip_seq else seq,
+                }
+            )
+        _LOGGER.info("Inserting into database...")
+        collection_checksum = self.insert(aslist, "RawSeqCol")
+        _LOGGER.debug(f"Loaded {ASL_NAME}: {aslist}")
+        return collection_checksum, aslist
+
+    def compare_digests(self, digestA, digestB):
+        A = self.retrieve(digestA, reclimit=1)
+        B = self.retrieve(digestB, reclimit=1)
+        # _LOGGER.info(A)
+        # _LOGGER.info(B)
+        return compare_seqcols(A, B)
+
+    def retrieve(self, druid, reclimit=None, raw=False):
+        try:
+            return super(SeqColHenge, self).retrieve(druid, reclimit, raw)
+        except henge.NotFoundException as e:
+            _LOGGER.debug(e)
+            raise e
+            try:
+                return self.refget(druid)
+            except Exception as e:
+                _LOGGER.debug(e)
+                raise e
+                return henge.NotFoundException(
+                    "{} not found in database, or in refget.".format(druid)
+                )
+
+    def load_fasta_from_refgenie(self, rgc, refgenie_key):
+        """
+        @param rgc RefGenConf object
+        @param refgenie_key key of genome to load
+        """
+        filepath = rgc.seek(refgenie_key, "fasta")
+        return self.load_fasta_from_filepath(filepath)
+
+    def load_fasta_from_filepath(self, filepath):
+        """
+        @param filepath Path to fasta file
+        """
+        fa_object = parse_fasta(filepath)
+        SCAS = fasta_obj_to_seqcol(fa_object, digest_function=self.checksum_function)
+        digest = self.insert(SCAS, "SeqColArraySet", reclimit=1)
+        return {
+            "fa_file": filepath,
+            "fa_object": fa_object,
+            "SCAS": SCAS,
+            "digest": digest,
+        }
+
+    def load_from_chromsizes(self, chromsizes):
+        """
+        @param chromsizes Path to chromsizes file
+        """
+        SCAS = chrom_sizes_to_seqcol(chromsizes, digest_function=self.checksum_function)
+        digest = self.insert(SCAS, "SeqColArraySet", reclimit=1)
+        return {
+            "chromsizes_file": chromsizes,
+            "SCAS": SCAS,
+            "digest": digest,
+        }
+
+    def load_multiple_fastas(self, fasta_dict):
+        """
+        Wrapper for load_fasta_from_filepath
+
+        @param fasta_list
+        """
+        results = {}
+        for name in fasta_dict.keys():
+            path = fasta_dict[name]["fasta"]
+            print(f"Processing fasta '{name}'' at path '{path}'...")
+            results[name] = self.load_fasta_from_filepath(path)
+        return results
diff --git a/refget/seqcol_client.py b/refget/seqcol_client.py
new file mode 100644
index 0000000..58f4e40
--- /dev/null
+++ b/refget/seqcol_client.py
@@ -0,0 +1,49 @@
+import requests
+
+
+class SeqColClient(object):
+    """
+    A client for interacting with a sequence collection API.
+
+    Args:
+        url (str, optional): The base URL of the sequence collection API. Defaults to "http://seqcolapi.databio.org".
+
+    Attributes:
+        url (str): The base URL of the sequence collection API.
+
+    Methods:
+        get_collection(accession, level=2): Retrieves a sequence collection for a given accession and level.
+    """
+
+    def __init__(self, url="http://seqcolapi.databio.org"):
+        self.url = url
+
+    def get_collection(self, accession, level=2):
+        """
+        Retrieves a sequence collection for a given accession and detail level.
+
+        Args:
+            accession (str): The accession of the sequence collection.
+            level (int, optional): The level of detail for the sequence collection. Defaults to 2.
+
+        Returns:
+            dict: The JSON response containing the sequence collection.
+        """
+        url = f"{self.url}/collection/{accession}?level={level}"
+        response = requests.get(url)
+        return response.json()
+
+    def compare(self, accession1, accession2):
+        """
+        Compares two sequence collections.
+
+        Args:
+            accession1 (str): The accession of the first sequence collection.
+            accession2 (str): The accession of the second sequence collection.
+
+        Returns:
+            dict: The JSON response containing the comparison of the two sequence collections.
+        """
+        url = f"{self.url}/comparison/{accession1}/{accession2}"
+        response = requests.get(url)
+        return response.json()
diff --git a/refget/utilities.py b/refget/utilities.py
new file mode 100644
index 0000000..573e3ef
--- /dev/null
+++ b/refget/utilities.py
@@ -0,0 +1,312 @@
+import base64
+import binascii
+import hashlib
+import json
+import logging
+import os
+import pyfaidx
+
+from jsonschema import Draft7Validator
+from typing import Optional, Callable
+from yacman import load_yaml
+
+from .const import SeqCol
+from .exceptions import *
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def trunc512_digest(seq, offset=24) -> str:
+    """Deprecated GA4GH digest function"""
+    digest = hashlib.sha512(seq.encode()).digest()
+    hex_digest = binascii.hexlify(digest[:offset])
+    return hex_digest.decode()
+
+
+def sha512t24u_digest(seq: str, offset: int = 24) -> str:
+    """GA4GH digest function"""
+    digest = hashlib.sha512(seq.encode()).digest()
+    tdigest_b64us = base64.urlsafe_b64encode(digest[:offset])
+    return tdigest_b64us.decode("ascii")
+
+
+def canonical_str(item: dict) -> str:
+    """Convert a dict into a canonical string representation"""
+    return json.dumps(
+        item, separators=(",", ":"), ensure_ascii=False, allow_nan=False, sort_keys=True
+    )
+
+
+def print_csc(csc: dict) -> str:
+    """Convenience function to pretty-print a canonical sequence collection"""
+    return print(json.dumps(csc, indent=2))
+
+
+def validate_seqcol_bool(seqcol_obj: SeqCol, schema=None) -> bool:
+    """
+    Validate a seqcol object against the seqcol schema. Returns True if valid, False if not.
+
+    To enumerate the errors, use validate_seqcol instead.
+    """
+    schema_path = os.path.join(os.path.dirname(__file__), "schemas", "seqcol.yaml")
+    schema = load_yaml(schema_path)
+    validator = Draft7Validator(schema)
+    return validator.is_valid(seqcol_obj)
+
+
+def validate_seqcol(seqcol_obj: SeqCol, schema=None) -> Optional[dict]:
+    """Validate a seqcol object against the seqcol schema.
+    Returns True if valid, raises InvalidSeqColError if not, which enumerates the errors.
+    Retrieve individual errors with exception.errors
+    """
+    schema_path = os.path.join(os.path.dirname(__file__), "schemas", "seqcol.yaml")
+    schema = load_yaml(schema_path)
+    validator = Draft7Validator(schema)
+    if not validator.is_valid(seqcol_obj):
+        errors = sorted(validator.iter_errors(seqcol_obj), key=lambda e: e.path)
+        raise InvalidSeqColError("Validation failed", errors)
+    return True
+
+
+def format_itemwise(csc: SeqCol) -> list:
+    """
+    Format a SeqCol object into a list of dicts, one per sequence.
+    """
+    list_of_dicts = []
+    # TODO: handle all properties, not just these 3
+    # TODO: handle non-collated attributes, somehow
+    for i in range(len(csc["lengths"])):
+        list_of_dicts.append(
+            {
+                "name": csc["names"][i],
+                "length": csc["lengths"][i],
+                "sequence": csc["sequences"][i],
+            }
+        )
+    return {"sequences": list_of_dicts}
+
+
+def parse_fasta(fa_file) -> pyfaidx.Fasta:
+    """
+    Read in a gzipped or not gzipped FASTA file
+    """
+    try:
+        return pyfaidx.Fasta(fa_file)
+    except pyfaidx.UnsupportedCompressionFormat:
+        # pyfaidx can handle bgzip but not gzip; so we just hack it here and
+        # gunzip the file into a temporary one and read it in not to interfere
+        # with the original one.
+        from gzip import open as gzopen
+        from tempfile import NamedTemporaryFile
+
+        with gzopen(fa_file, "rt") as f_in, NamedTemporaryFile(mode="w+t", suffix=".fa") as f_out:
+            f_out.writelines(f_in.read())
+            f_out.seek(0)
+            return pyfaidx.Fasta(f_out.name)
+
+
+def chrom_sizes_to_digest(chrom_sizes_file_path: str) -> str:
+    """Given a chrom.sizes file, return a digest"""
+    seqcol_obj = chrom_sizes_to_seqcol(chrom_sizes_file_path)
+    return seqcol_digest(seqcol_obj)
+
+
+def chrom_sizes_to_seqcol(
+    chrom_sizes_file_path: str,
+    digest_function: Callable[[str], str] = sha512t24u_digest,
+) -> dict:
+    """Given a chrom.sizes file, return a canonical seqcol object"""
+    with open(chrom_sizes_file_path, "r") as f:
+        lines = f.readlines()
+    CSC = {"lengths": [], "names": [], "sequences": [], "sorted_name_length_pairs": []}
+    for line in lines:
+        line = line.strip()
+        if line == "":
+            continue
+        seq_name, seq_length, ga4gh_digest, md5_digest = line.split("\t")
+        snlp = {"length": seq_length, "name": seq_name}  # sorted_name_length_pairs
+        snlp_digest = digest_function(canonical_str(snlp))
+        CSC["lengths"].append(int(seq_length))
+        CSC["names"].append(seq_name)
+        CSC["sequences"].append(ga4gh_digest)
+        CSC["sorted_name_length_pairs"].append(snlp_digest)
+    CSC["sorted_name_length_pairs"].sort()
+    return CSC
+
+
+def fasta_file_to_digest(fa_file_path: str) -> str:
+    """Given a fasta, return a digest"""
+    seqcol_obj = fasta_file_to_seqcol(fa_file_path)
+    return seqcol_digest(seqcol_obj)
+
+
+def fasta_file_to_seqcol(fa_file_path: str) -> dict:
+    """Given a fasta, return a canonical seqcol object"""
+    fa_obj = parse_fasta(fa_file_path)
+    return fasta_obj_to_seqcol(fa_obj)
+
+
+def fasta_obj_to_seqcol(
+    fa_object: pyfaidx.Fasta,
+    verbose: bool = True,
+    digest_function: Callable[[str], str] = sha512t24u_digest,
+) -> dict:
+    """
+    Given a fasta object, return a CSC (Canonical Sequence Collection object)
+    """
+    # CSC = SeqColArraySet
+    # Or equivalently, a "Level 1 SeqCol"
+
+    CSC = {"lengths": [], "names": [], "sequences": [], "sorted_name_length_pairs": []}
+    seqs = fa_object.keys()
+    nseqs = len(seqs)
+    print(f"Found {nseqs} chromosomes")
+    i = 1
+    for k in fa_object.keys():
+        if verbose:
+            print(f"Processing ({i} of {nseqs}) {k}...")
+        seq = str(fa_object[k])
+        seq_length = len(seq)
+        seq_name = fa_object[k].name
+        seq_digest = "SQ." + digest_function(seq.upper())
+        snlp = {"length": seq_length, "name": seq_name}  # sorted_name_length_pairs
+        snlp_digest = digest_function(canonical_str(snlp))
+        CSC["lengths"].append(seq_length)
+        CSC["names"].append(seq_name)
+        CSC["sorted_name_length_pairs"].append(snlp_digest)
+        CSC["sequences"].append(seq_digest)
+        i += 1
+    CSC["sorted_name_length_pairs"].sort()
+    return CSC
+
+
+def build_sorted_name_length_pairs(obj: dict, digest_function):
+    """Builds the sorted_name_length_pairs attribute, which corresponds to the coordinate system"""
+    sorted_name_length_pairs = []
+    for i in range(len(obj["names"])):
+        sorted_name_length_pairs.append({"length": obj["lengths"][i], "name": obj["names"][i]})
+    nl_digests = []  # name-length digests
+    for i in range(len(sorted_name_length_pairs)):
+        nl_digests.append(digest_function(canonical_str(sorted_name_length_pairs[i])))
+
+    nl_digests.sort()
+    return nl_digests
+
+
+def compare_seqcols(A: SeqCol, B: SeqCol):
+    """
+    Workhorse comparison function
+
+    @param A Sequence collection A
+    @param B Sequence collection B
+    @return dict Following formal seqcol specification comparison function return value
+    """
+    validate_seqcol(A)  # First ensure these are the right structure
+    validate_seqcol(B)
+
+    all_keys = list(A.keys()) + list(set(B.keys()) - set(list(A.keys())))
+    result = {}
+
+    # Compute lengths of each array; only do this for array attributes
+    a_lengths = {}
+    b_lengths = {}
+    for k in A.keys():
+        a_lengths[k] = len(A[k])
+    for k in B.keys():
+        b_lengths[k] = len(B[k])
+
+    return_obj = {
+        "attributes": {"a_only": [], "b_only": [], "a_and_b": []},
+        "array_elements": {
+            "a": a_lengths,
+            "b": b_lengths,
+            "a_and_b": {},
+            "a_and_b_same_order": {},
+        },
+    }
+
+    for k in all_keys:
+        _LOGGER.info(k)
+        if k not in A:
+            result[k] = {"flag": -1}
+            return_obj["attributes"]["b_only"].append(k)
+            # return_obj["array_elements"]["total"][k] = {"a": None, "b": len(B[k])}
+        elif k not in B:
+            return_obj["attributes"]["a_only"].append(k)
+            # return_obj["array_elements"]["total"][k] = {"a": len(A[k]), "b": None}
+        else:
+            return_obj["attributes"]["a_and_b"].append(k)
+            res = _compare_elements(A[k], B[k])
+            # return_obj["array_elements"]["total"][k] = {"a": len(A[k]), "b": len(B[k])}
+            return_obj["array_elements"]["a_and_b"][k] = res["a_and_b"]
+            return_obj["array_elements"]["a_and_b_same_order"][k] = res["a_and_b_same_order"]
+    return return_obj
+
+
+def _compare_elements(A: list, B: list):
+    """
+    Compare elements between two arrays. Helper function for individual elements used by workhorse compare_seqcols function
+    """
+
+    A_filtered = list(filter(lambda x: x in B, A))
+    B_filtered = list(filter(lambda x: x in A, B))
+    A_count = len(A_filtered)
+    B_count = len(B_filtered)
+    overlap = min(len(A_filtered), len(B_filtered))  # counts duplicates
+
+    if A_count + B_count < 1:
+        # order match requires at least 2 matching elements
+        order = None
+    elif not (A_count == B_count == overlap):
+        # duplicated matches means order match is undefined
+        order = None
+    else:
+        order = A_filtered == B_filtered
+    return {"a_and_b": overlap, "a_and_b_same_order": order}
+
+
+def seqcol_digest(seqcol_obj: SeqCol, schema: dict = None) -> str:
+    """
+    Given a canonical sequence collection, compute its digest.
+
+    :param dict seqcol_obj: Dictionary representation of a canonical sequence collection object
+    :param dict schema: Schema defining the inherent attributes to digest
+    :return str: The sequence collection digest
+    """
+
+    validate_seqcol(seqcol_obj)
+    # Step 1a: Remove any non-inherent attributes,
+    # so that only the inherent attributes contribute to the digest.
+    seqcol_obj2 = {}
+    if schema:
+        for k in schema["inherent"]:
+            # Step 2: Apply RFC-8785 to canonicalize the value
+            # associated with each attribute individually.
+            seqcol_obj2[k] = canonical_str(seqcol_obj[k])
+    else:  # no schema provided, so assume all attributes are inherent
+        for k in seqcol_obj:
+            seqcol_obj2[k] = canonical_str(seqcol_obj[k])
+    # Step 3: Digest each canonicalized attribute value
+    # using the GA4GH digest algorithm.
+
+    seqcol_obj3 = {}
+    for attribute in seqcol_obj2:
+        seqcol_obj3[attribute] = sha512t24u_digest(seqcol_obj2[attribute])
+    # print(json.dumps(seqcol_obj3, indent=2))  # visualize the result
+
+    # Step 4: Apply RFC-8785 again to canonicalize the JSON
+    # of new seqcol object representation.
+
+    seqcol_obj4 = canonical_str(seqcol_obj3)
+
+    # Step 5: Digest the final canonical representation again.
+    seqcol_digest = sha512t24u_digest(seqcol_obj4)
+    return seqcol_digest
+
+
+def explain_flag(flag):
+    """Explains a compare flag"""
+    print(f"Flag: {flag}\nBinary: {bin(flag)}\n")
+    for e in range(0, 13):
+        if flag & 2**e:
+            print(FLAGS[2**e])
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index 0470f30..1afd5a4 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -1,5 +1,6 @@
-henge==0.1.1
+biopython
+henge>=0.2.1
 pyfaidx
+pyyaml
 requests
 yacman>=0.6.7
-pyyaml
diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt
new file mode 100644
index 0000000..60c9958
--- /dev/null
+++ b/requirements/requirements-dev.txt
@@ -0,0 +1 @@
+-e git+git://github.com/databio/henge@master#egg=henge
\ No newline at end of file
diff --git a/setup.py b/setup.py
index f45eb89..b7cf1d9 100644
--- a/setup.py
+++ b/setup.py
@@ -19,10 +19,10 @@
 
 extra["install_requires"] = DEPENDENCIES
 
-with open("{}/_version.py".format(PACKAGE), 'r') as versionfile:
+with open("{}/_version.py".format(PACKAGE), "r") as versionfile:
     version = versionfile.readline().split()[-1].strip("\"'\n")
 
-long_description = open('README.md').read()
+long_description = open("README.md").read()
 
 setup(
     name=PACKAGE,
@@ -30,28 +30,28 @@
     version=version,
     description="Python client for refget",
     long_description=long_description,
-    long_description_content_type='text/markdown',
+    long_description_content_type="text/markdown",
     classifiers=[
         "Development Status :: 4 - Beta",
         "License :: OSI Approved :: BSD License",
-        "Programming Language :: Python :: 2.7",
-        "Programming Language :: Python :: 3.5",
-        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
     ],
     keywords="genome, assembly, bioinformatics, reference, sequence",
     url="https://github.com/refgenie/refget",
-    author=u"Nathan Sheffield, Michal Stolarczyk",
-    author_email=u"nathan@code.databio.org", 
+    author="Nathan Sheffield, Michal Stolarczyk",
+    author_email="nathan@code.databio.org",
     license="BSD2",
     entry_points={
-        "console_scripts": [
-            'refget = refget.refget:main'
-        ],
-    },    
+        "console_scripts": ["refget = refget.refget:main"],
+    },
     # package_data={"refget": [os.path.join("refget", "*")]},
     include_package_data=True,
     test_suite="tests",
     tests_require=(["mock", "pytest"]),
     setup_requires=(["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else []),
-    **extra
+    **extra,
 )
diff --git a/tests/conftest.py b/tests/conftest.py
index 37d6922..129ff0b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,10 +1,13 @@
 """ Test suite shared objects and setup """
+
 import os
 import pytest
 import oyaml as yaml
+from refget.const import _schema_path
 
 
 def ly(n, data_path):
+    """Load YAML"""
     with open(os.path.join(data_path, n), "r") as f:
         return yaml.safe_load(f)
 
@@ -14,9 +17,19 @@ def schema_path():
     return os.path.join(os.path.dirname(os.path.abspath(__file__)), "schemas")
 
 
+@pytest.fixture
+def fa_root():
+    return os.path.join(
+        os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir),
+        "demo_fasta",
+    )
+
+
 @pytest.fixture
 def fasta_path():
-    return os.path.join(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir), "demo_fasta")
+    return os.path.join(
+        os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir), "demo_fasta"
+    )
 
 
 @pytest.fixture
diff --git a/tests/test_refget.py b/tests/test_refget.py
index 5f3a2ae..e53f37a 100644
--- a/tests/test_refget.py
+++ b/tests/test_refget.py
@@ -10,11 +10,10 @@ class TestEmptyConstructor:
     def test_no_schemas_required(self):
         assert isinstance(RefGetClient(), RefGetClient)
 
+
 class TestInsert:
     @pytest.mark.parametrize("fasta_name", DEMO_FILES)
     def test_insert_works(self, fasta_name, fasta_path):
         rgc = RefGetClient()
         d = rgc.load_seq("TCGA")
-        assert(rgc.refget(d) == "TCGA")
-
-
+        assert rgc.refget(d) == "TCGA"
diff --git a/tests/test_seqcol.py b/tests/test_seqcol.py
new file mode 100644
index 0000000..85555f9
--- /dev/null
+++ b/tests/test_seqcol.py
@@ -0,0 +1,124 @@
+import json
+import os
+import pytest
+import refget
+
+# from seqcol import SeqColHenge, validate_seqcol, compare
+# from seqcol.const import *
+
+DEMO_FILES = [
+    "demo0.fa",
+    "demo1.fa.gz",
+    "demo2.fa",
+    "demo3.fa",
+    "demo4.fa",
+    "demo5.fa.gz",
+    "demo6.fa",
+]
+
+# Pairs of files to compare, with the "correct" compare response
+COMPARE_TESTS = [
+    (DEMO_FILES[1], DEMO_FILES[1], "demo_fasta/compare-1vs1.json"),
+    (DEMO_FILES[0], DEMO_FILES[1], "demo_fasta/compare-0vs1.json"),
+]
+SNLP_TESTS = [
+    (DEMO_FILES[5], DEMO_FILES[6], "demo_fasta/compare-5vs6.json"),
+]  # sorted_name_length_pairs
+
+
+class TestGeneral:
+    def test_no_schemas_required(self):
+        """
+        In contrast to the generic Henge object, SeqColHenge does not
+        require schemas as input, they are predefined in the constructor
+        """
+        assert isinstance(refget.SeqColHenge(database={}), refget.SeqColHenge)
+
+
+class TestFastaInserting:
+    @pytest.mark.parametrize("fasta_name", DEMO_FILES)
+    def test_fasta_loading_works(self, fasta_name, fa_root):
+        scc = refget.SeqColHenge(database={})
+        f = os.path.join(fa_root, fasta_name)
+        print("Fasta file to be loaded: {}".format(f))
+        res = scc.load_fasta(f)
+        assert len(res) == 2  # returns digest and list of AnnotatedSequencesList
+
+
+class TestRetrieval:
+    @pytest.mark.parametrize("fasta_name", DEMO_FILES)
+    def test_retrieval_works(self, fasta_name, fa_root):
+        scc = refget.SeqColHenge(database={})
+        f = os.path.join(fa_root, fasta_name)
+        print("Fasta file to be loaded: {}".format(f))
+        d, asds = scc.load_fasta(f)
+        # convert integers in the dicts to strings
+        # {k: str(v) if isinstance(v, int) else v for k, v in asd.items()}
+        lst = [{k: v for k, v in asd.items()} for asd in asds]
+        assert scc.retrieve(d) == lst
+
+
+def check_comparison(fasta1, fasta2, expected_comparison):
+    print(f"Comparison: Fasta1: {fasta1} vs Fasta2: {fasta2}. Expected: {expected_comparison}")
+    scc = refget.SeqColHenge(database={})
+    d = scc.load_fasta_from_filepath(fasta1)
+    d2 = scc.load_fasta_from_filepath(fasta2)
+    with open(expected_comparison) as fp:
+        correct_compare_response = json.load(fp)
+        proposed_compare_response = refget.compare_seqcols(d["SCAS"], d2["SCAS"])
+        print(
+            json.dumps(
+                proposed_compare_response,
+                separators=(",", ":"),
+                ensure_ascii=False,
+                allow_nan=False,
+                sort_keys=True,
+                indent=2,
+            )
+        )
+        assert proposed_compare_response == correct_compare_response
+
+
+class TestCompare:
+    """
+    Test the compare function, using demo fasta files, and pre-computed
+    compare function results stored as answer files.
+    """
+
+    @pytest.mark.parametrize(["fasta1", "fasta2", "answer_file"], COMPARE_TESTS)
+    def test_fasta_compare(self, fasta1, fasta2, answer_file, fa_root):
+        check_comparison(os.path.join(fa_root, fasta1), os.path.join(fa_root, fasta2), answer_file)
+
+    @pytest.mark.parametrize(["fasta1", "fasta2", "answer_file"], SNLP_TESTS)
+    def test_names_lengths_order(self, fasta1, fasta2, answer_file, fa_root):
+        """Does the names_lengths array correctly identify order variants"""
+        check_comparison(os.path.join(fa_root, fasta1), os.path.join(fa_root, fasta2), answer_file)
+
+
+seqcol_obj = {
+    "lengths": [248956422, 133797422, 135086622],
+    "names": ["chr1", "chr2", "chr3"],
+    "sequences": [
+        "2648ae1bacce4ec4b6cf337dcae37816",
+        "907112d17fcb73bcab1ed1c72b97ce68",
+        "1511375dc2dd1b633af8cf439ae90cec",
+    ],
+}
+
+bad_seqcol = {"bogus": True}
+
+
+class TestValidate:
+    """
+    Test validation
+    """
+
+    @pytest.mark.parametrize(["seqcol_obj"], [[seqcol_obj]])
+    def test_validate(self, seqcol_obj):
+        is_valid = refget.validate_seqcol(seqcol_obj)
+        assert is_valid
+
+    @pytest.mark.parametrize(["seqcol_obj"], [[bad_seqcol]])
+    def test_failure(self, seqcol_obj):
+        with pytest.raises(Exception):
+            refget.validate_seqcol(seqcol_obj)