diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
new file mode 100644
index 0000000..8b48ddf
--- /dev/null
+++ b/.github/workflows/black.yml
@@ -0,0 +1,11 @@
+name: Lint
+
+on: [pull_request]
+
+jobs:
+ lint:
+ runs-on: ubuntu-latest
+ steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+ - uses: psf/black@stable
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 4e1ef42..e54ad87 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -11,11 +11,13 @@ jobs:
deploy:
runs-on: ubuntu-latest
-
+ name: upload release to PyPI
+ permissions:
+ id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v5
with:
python-version: '3.x'
- name: Install dependencies
@@ -23,9 +25,8 @@ jobs:
python -m pip install --upgrade pip
pip install setuptools wheel twine
- name: Build and publish
- env:
- TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
- TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
run: |
python setup.py sdist bdist_wheel
- twine upload dist/*
+ - name: Publish package distributions to PyPI
+ uses: pypa/gh-action-pypi-publish@release/v1
+
diff --git a/.github/workflows/run-codecov.yml b/.github/workflows/run-codecov.yml
new file mode 100644
index 0000000..a41a1fd
--- /dev/null
+++ b/.github/workflows/run-codecov.yml
@@ -0,0 +1,21 @@
+name: Run codecov
+
+on:
+ pull_request:
+ branches: [master]
+
+jobs:
+ pytest:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ python-version: [3.9]
+ os: [ubuntu-latest]
+
+ steps:
+      - uses: actions/checkout@v4
+ - name: Upload coverage to Codecov
+ uses: codecov/codecov-action@v2
+ with:
+ file: ./coverage.xml
+ name: py-${{ matrix.python-version }}-${{ matrix.os }}
diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml
new file mode 100644
index 0000000..a4fb5fe
--- /dev/null
+++ b/.github/workflows/run-pytest.yml
@@ -0,0 +1,30 @@
+name: Run pytests
+
+on:
+ pull_request:
+ branches: [master, dev]
+
+jobs:
+ pytest:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ python-version: ["3.8", "3.11"]
+ os: [ubuntu-latest]
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Install test dependencies
+ run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi
+
+ - name: Install package
+ run: python -m pip install .
+
+ - name: Run pytest tests
+ run: pytest tests -x -vv --cov=./ --cov-report=xml
diff --git a/.gitignore b/.gitignore
index 7523f45..55ae73f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -71,9 +71,12 @@ open_pipelines/
*RESERVE*
# Build-related stuff
+build
dist/
refget.egg-info/
#ipynm checkpoints
*ipynb_checkpoints*
*.egg-info*
+
+
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 9e5fd48..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-language: python
-python:
- - "3.5"
- - "3.6"
- - "3.8"
-os:
- - linux
-before_install:
- - pip install git+https://github.com/databio/henge.git@master
-install:
- - pip install .
- - pip install -r requirements/requirements-test.txt
-script: pytest tests -x -vv
-branches:
- only:
- - dev
- - master
diff --git a/LICENSE.txt b/LICENSE.txt
index 1b78bad..ec94eeb 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,4 +1,4 @@
-Copyright 2017 Nathan Sheffield
+Copyright 2024 Nathan Sheffield
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
diff --git a/README.md b/README.md
index b855130..7d9660d 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,8 @@
[![Build Status](https://travis-ci.com/refgenie/refget.svg?branch=master)](https://travis-ci.com/refgenie/refget)
-The refget package provides a Python interface to both remote and local use of the refget protocol.
+The refget package provides a Python interface to both remote and local use of the refget protocol.
-Documentation is hosted at [refget.databio.org](http://refget.databio.org).
+This package provides clients and functions for both refget sequences and refget sequence collections (seqcol).
+
+Documentation is hosted at [refgenie.org/refget](https://refgenie.org/refget/).
diff --git a/array_overlap.py b/array_overlap.py
new file mode 100644
index 0000000..4012c1a
--- /dev/null
+++ b/array_overlap.py
@@ -0,0 +1,55 @@
+# from itertools import compress
+# from collections import Counter
+# overlap = sum((Counter(A) & Counter(B)).values())
+# A=example1
+# B=example4
+# overlap
+# all = list(A) + list(set(B) - set(list(A)))
+# A_compressed = list(compress(A, B))
+# B_compressed = list(compress(B, A))
+# A_filtered = list(filter(lambda x: x in B, A))
+# B_filtered = list(filter(lambda x: x in A, B))
+# len(A_compressed) == len(B_compressed)
+# C_compressed = list(compress(C, A))
+
+# from itertools import filter
+
+
+def array_overlap(A, B):
+ A_filtered = list(filter(lambda x: x in B, A))
+ B_filtered = list(filter(lambda x: x in A, B))
+ A_count = len(A_filtered)
+ B_count = len(B_filtered)
+ overlap = min(len(A_filtered), len(B_filtered))
+    if overlap < 2:
+        # order match requires at least 2 matching elements
+ order = None
+ elif not (A_count == B_count):
+ # duplicated matches means order match is undefined
+ order = None
+ else:
+ order = A_filtered == B_filtered
+ return {"overlap": overlap, "order-match": order}
+
+
+example1 = ["A", "B", "C", "D"]
+example2 = ["A", "B", "C"]
+example3 = ["A", "B", "C", "B"]
+example4 = ["B", "B", "B", "B"]
+example5 = ["X", "A", "B", "Y", "C", "D", "E"]
+example6 = ["A", "B", "C", "D", "B"]
+example7 = ["A", "B", "C", "D", "A"]
+example8 = ["A", "B", "C", "D", "B", "A"]
+
+
+array_overlap(example1, example2)
+array_overlap(example1, example3)
+array_overlap(example2, example3)
+array_overlap(example1, example4)
+array_overlap(example1, example5)
+array_overlap(example3, example5)
+array_overlap(example5, example3)
+array_overlap(example3, example6)
+array_overlap(example6, example7)
+array_overlap(example7, example8)
+array_overlap(example8, example8)
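A quick sanity check of the semantics above (a sketch; the expected values assume the `array_overlap` function and example lists defined in this file):

```python
array_overlap(example1, example2)  # {'overlap': 3, 'order-match': True}
array_overlap(example1, example4)  # {'overlap': 1, 'order-match': None} (fewer than 2 matches)
array_overlap(example3, example5)  # {'overlap': 3, 'order-match': None} (duplicate "B" in A)
```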
diff --git a/demo_fasta/compare-0vs1.json b/demo_fasta/compare-0vs1.json
new file mode 100644
index 0000000..e0d9ae6
--- /dev/null
+++ b/demo_fasta/compare-0vs1.json
@@ -0,0 +1,38 @@
+{
+ "array_elements":{
+ "a":{
+ "lengths":3,
+ "names":3,
+ "sequences":3,
+ "sorted_name_length_pairs":3
+ },
+ "a_and_b":{
+ "lengths":2,
+ "names":2,
+ "sequences":0,
+ "sorted_name_length_pairs":2
+ },
+ "a_and_b_same_order":{
+ "lengths":true,
+ "names":true,
+ "sequences":null,
+ "sorted_name_length_pairs":true
+ },
+ "b":{
+ "lengths":2,
+ "names":2,
+ "sequences":2,
+ "sorted_name_length_pairs":2
+ }
+ },
+ "attributes":{
+ "a_and_b":[
+ "lengths",
+ "names",
+ "sequences",
+ "sorted_name_length_pairs"
+ ],
+ "a_only":[],
+ "b_only":[]
+ }
+}
\ No newline at end of file
diff --git a/demo_fasta/compare-1vs1.json b/demo_fasta/compare-1vs1.json
new file mode 100644
index 0000000..6c01b0c
--- /dev/null
+++ b/demo_fasta/compare-1vs1.json
@@ -0,0 +1,38 @@
+{
+ "array_elements":{
+ "a":{
+ "lengths":2,
+ "names":2,
+ "sequences":2,
+ "sorted_name_length_pairs":2
+ },
+ "a_and_b":{
+ "lengths":2,
+ "names":2,
+ "sequences":2,
+ "sorted_name_length_pairs":2
+ },
+ "a_and_b_same_order":{
+ "lengths":true,
+ "names":true,
+ "sequences":true,
+ "sorted_name_length_pairs":true
+ },
+ "b":{
+ "lengths":2,
+ "names":2,
+ "sequences":2,
+ "sorted_name_length_pairs":2
+ }
+ },
+ "attributes":{
+ "a_and_b":[
+ "lengths",
+ "names",
+ "sequences",
+ "sorted_name_length_pairs"
+ ],
+ "a_only":[],
+ "b_only":[]
+ }
+}
\ No newline at end of file
diff --git a/demo_fasta/compare-5vs6.json b/demo_fasta/compare-5vs6.json
new file mode 100644
index 0000000..347f738
--- /dev/null
+++ b/demo_fasta/compare-5vs6.json
@@ -0,0 +1,38 @@
+{
+ "array_elements":{
+ "a":{
+ "lengths":3,
+ "names":3,
+ "sequences":3,
+ "sorted_name_length_pairs":3
+ },
+ "a_and_b":{
+ "lengths":3,
+ "names":3,
+ "sequences":3,
+ "sorted_name_length_pairs":3
+ },
+ "a_and_b_same_order":{
+ "lengths":false,
+ "names":false,
+ "sequences":false,
+ "sorted_name_length_pairs":true
+ },
+ "b":{
+ "lengths":3,
+ "names":3,
+ "sequences":3,
+ "sorted_name_length_pairs":3
+ }
+ },
+ "attributes":{
+ "a_and_b":[
+ "lengths",
+ "names",
+ "sequences",
+ "sorted_name_length_pairs"
+ ],
+ "a_only":[],
+ "b_only":[]
+ }
+}
\ No newline at end of file
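These three fixtures illustrate the comparison return format: `compare-0vs1` shows partial overlap (shared names and lengths but no shared sequences, so sequence order is `null`); `compare-1vs1` is self-identity; `compare-5vs6` shows identical content in a different order, where `sorted_name_length_pairs` still matches because that array is sorted by construction.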
diff --git a/demo_fasta/demo0.fa b/demo_fasta/demo0.fa
new file mode 100644
index 0000000..dd08063
--- /dev/null
+++ b/demo_fasta/demo0.fa
@@ -0,0 +1,6 @@
+>chrX
+TTGGGGAA
+>chr1
+GGAA
+>chr2
+GCGC
diff --git a/demo_fasta/demo0.fa.fai b/demo_fasta/demo0.fa.fai
new file mode 100644
index 0000000..dc37dcd
--- /dev/null
+++ b/demo_fasta/demo0.fa.fai
@@ -0,0 +1,3 @@
+chrX 8 6 8 9
+chr1 4 21 4 5
+chr2 4 32 4 5
diff --git a/demo_fasta/demo1.fa.fai b/demo_fasta/demo1.fa.fai
new file mode 100644
index 0000000..c55c6c1
--- /dev/null
+++ b/demo_fasta/demo1.fa.fai
@@ -0,0 +1,2 @@
+chr1 4 6 4 5
+chr2 4 17 4 5
diff --git a/demo_fasta/demo1.fa.gz b/demo_fasta/demo1.fa.gz
new file mode 100644
index 0000000..c2174d6
Binary files /dev/null and b/demo_fasta/demo1.fa.gz differ
diff --git a/demo_fasta/demo5.fa.gz b/demo_fasta/demo5.fa.gz
new file mode 100644
index 0000000..be98af2
Binary files /dev/null and b/demo_fasta/demo5.fa.gz differ
diff --git a/demo_fasta/demo6.fa b/demo_fasta/demo6.fa
new file mode 100644
index 0000000..dd08063
--- /dev/null
+++ b/demo_fasta/demo6.fa
@@ -0,0 +1,6 @@
+>chrX
+TTGGGGAA
+>chr1
+GGAA
+>chr2
+GCGC
diff --git a/demo_fasta/demo6.fa.fai b/demo_fasta/demo6.fa.fai
new file mode 100644
index 0000000..dc37dcd
--- /dev/null
+++ b/demo_fasta/demo6.fa.fai
@@ -0,0 +1,3 @@
+chrX 8 6 8 9
+chr1 4 21 4 5
+chr2 4 32 4 5
diff --git a/deprecated.py b/deprecated.py
new file mode 100644
index 0000000..6038709
--- /dev/null
+++ b/deprecated.py
@@ -0,0 +1,106 @@
+from itertools import compress
+from functools import reduce
+
+import seqcol
+
+sc1 = {"names": ["chr1", "chr2"], "sequences": ["ay89fw", "we4f9x"]}
+
+sc2 = {"names": ["1", "2", "3"], "sequences": ["ay89fw", "we4f9x", "3n20xk2"]}
+
+sc3 = {"names": ["2", "3", "1"], "sequences": ["we4f9x", "3n20xk2", "ay89fw"]}
+
+sc4 = {"names": ["chr1", "3", "2"], "sequences": ["zyt2fw", "9snm23k", "fsd2x3"]}
+
+sc5 = {
+ "names": ["chr1", "3", "2"],
+ "sequences": ["zyt2fw", "9snm23k", "fsd2x3"],
+ "topologies": ["circular", "linear", "linear"],
+}
+
+
+def compat(A, B):
+ ainb = [x in B for x in A]
+ bina = [x in A for x in B]
+ if any(ainb):
+ order = list(compress(B, bina)) == list(compress(A, ainb))
+ else:
+ order = False
+
+ flag = 0
+ flag += 2 if all(ainb) else 0
+ flag += 4 if all(bina) else 0
+ flag += 8 if order else 0
+ flag += 1 if any(ainb) else 0
+
+ return flag
+
+
+# New compat function that adds true/false
+def compat(A, B):
+ ainb = [x in B for x in A]
+ bina = [x in A for x in B]
+ if any(ainb):
+ order = list(compress(B, bina)) == list(compress(A, ainb))
+ else:
+ order = False
+
+ flag = 0
+ flag += 2 if all(ainb) else 0
+ flag += 4 if all(bina) else 0
+ flag += 8 if order else 0
+ flag += 1 if any(ainb) else 0
+ result = {
+ "any-elements-shared": any(ainb),
+ "a-subset-of-b": all(ainb),
+ "b-subset-of-a": all(bina),
+ "order-match": order,
+ "flag": flag,
+ }
+ return result
+
+
+# For each array:
+# - any-elements-shared (1) (0001)
+# - all-a-in-b (2) (0010)
+# - all-b-in-a (4) (0100)
+# - order-match (8) (1000)
+
+# no match: 0000 = 0
+# one or more shared elements: 0001 = 1
+# all-a-in-b (a is a subset of b, in different order) = 0011 = 3
+# all-b-in-a (b is a subset of a, in different order) = 0101 = 5
+# same content, different order: 0111 = 7
+# a is a subset of b, in same order: 1011 = 11
+# b is a subset of a, in same order: 1101 = 13
+# identity: 1111 = 15
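+#
+# Worked example, using sc1 and sc2 from above:
+#   compat(sc1["sequences"], sc2["sequences"])
+#   ainb = [True, True]         -> all of A in B: +2
+#   bina = [True, True, False]  -> not all of B in A
+#   orders match: +8; at least one shared element: +1
+#   flag = 1 + 2 + 8 = 11 (a is a subset of b, in same order)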
+
+
+compat(sc1["sequences"], sc2["sequences"])
+compat(sc1["names"], sc2["names"])
+
+compat(sc3["sequences"], sc2["sequences"])
+compat(sc3["names"], sc2["names"])
+
+compat(sc1["sequences"], sc3["sequences"])
+
+
+def compat_all(A, B):
+ all_keys = list(A.keys()) + list(set(B.keys()) - set(list(A.keys())))
+ result = {}
+ for k in all_keys:
+ if k not in A or k not in B:
+ result[k] = {"flag": -1}
+ else:
+ result[k] = compat(A[k], B[k])
+ # result["all"] = reduce(lambda x,y: x['flag']&y['flag'], list(result.values()))
+ return result
+
+
+compat_all(sc1, sc2)
+compat_all(sc3, sc2)
+compat_all(sc1, sc3)
+compat_all(sc1, sc5)
diff --git a/docs/README.md b/docs/README.md
deleted file mode 100644
index 0d94560..0000000
--- a/docs/README.md
+++ /dev/null
@@ -1,46 +0,0 @@
-# Refget
-
-## Introduction
-
-The refget package provides a Python interface to both remote and local use of the refget protocol. This package serves 4 functions:
-
-1. A lightweight python interface to a remote refget API.
-
-2. Local caching of retrieved results, improving performance for applications that require repeated lookups.
-
-3. A fully functioning local implementation of the refget protocol for local analysis backed by either memory, SQLite, or MongoDB.
-
-4. Convenience functions for computing refget checksums from python and handling FASTA files directly.
-
-## Install
-
-```console
-pip install refget
-```
-
-## Basic use
-
-### Retrieve results from a RESTful API
-
-```python
-import refget
-
-rgc = refget.RefGetClient("https://refget.herokuapp.com/sequence/")
-rgc.refget("6681ac2f62509cfc220d78751b8dc524", start=0, end=10)
-
-```
-
-### Compute digests locally
-
-```python
-refget.trunc512_digest("TCGA")
-```
-
-### Insert and retrieve sequences with a local database
-
-```python
-checksum = rgc.load_seq("GGAA")
-rgc.refget(checksum)
-```
-
-For more details, see the [tutorial](tutorial.md).
diff --git a/docs/autodoc_build/refget.md b/docs/autodoc_build/refget.md
deleted file mode 100644
index 6f38ada..0000000
--- a/docs/autodoc_build/refget.md
+++ /dev/null
@@ -1,110 +0,0 @@
-
-
-
-
-
-# Package `refget` Documentation
-
-## Class `RefGetClient`
-```python
-def __init__(self, api_url_base=None, database={}, schemas=None, henges=None, checksum_function=, suppress_connect=True)
-```
-
-A user interface to insert and retrieve decomposable recursive unique identifiers (DRUIDs).
-#### Parameters:
-
-- `database` (`dict`): Dict-like lookup database with sequences and hashes.
-- `schemas` (`dict`): One or more jsonschema schemas describing thedata types stored by this Henge
-- `checksum_function` (`function(str) -> str`): Default function to handle the digest of theserialized items stored in this henge.
-
-
-
-
-```python
-def get_service_info(self)
-```
-
-
-
-```python
-def item_types(self)
-```
-
-A list of item types handled by this Henge instance
-
-
-
-```python
-def load_fasta(self, fa_file, lengths_only=False)
-```
-
-Calculates checksums and loads each sequence in a fasta file into the database.
-
-
-
-```python
-def load_seq(self, seq)
-```
-
-
-
-```python
-def load_sequence_dict(self, seqset)
-```
-
-Convert a 'seqset', which is a dict with names as sequence names and values as sequences, into the 'asdlist' required for henge insert.
-
-
-
-```python
-def meta(self, digest)
-```
-
-
-
-```python
-def refget(self, digest, start=None, end=None)
-```
-
-
-
-```python
-def refget_remote(self, digest, start=None, end=None)
-```
-
-
-
-```python
-def service_info(self)
-```
-
-
-
-
-
-
-*Version Information: `refget` v0.0.1, generated by `lucidoc` v0.4.2*
\ No newline at end of file
diff --git a/docs/changelog.md b/docs/changelog.md
deleted file mode 100644
index a067237..0000000
--- a/docs/changelog.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# Changelog
-
-This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format.
-
-## [0.1.0] - 2021-06-17
-
-First public version, backed by henge version 0.1.1.
-
-## [0.0.1] - 2020-06-25
-
-Beta version for testing
diff --git a/docs_jupyter/build/.gitignore b/docs_jupyter/build/.gitignore
deleted file mode 100644
index d6b7ef3..0000000
--- a/docs_jupyter/build/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*
-!.gitignore
diff --git a/docs_jupyter/tutorial.ipynb b/docs_jupyter/tutorial.ipynb
deleted file mode 100644
index acff965..0000000
--- a/docs_jupyter/tutorial.ipynb
+++ /dev/null
@@ -1,690 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Refget python package tutorial"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Record some versions:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'3.8.5'"
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from platform import python_version \n",
- "python_version()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'0.1.0'"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import refget\n",
- "refget.__version__"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Computing digests locally"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "from refget import trunc512_digest"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Show some results for sequence digests:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36'"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "trunc512_digest('ACGT')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce'"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "trunc512_digest('TCGA')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36cf70'"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "trunc512_digest('ACGT', 26)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Connecting to a remote API\n",
- "\n",
- "The refget package provides a simple python wrapper around a remote hosted refget RESTful API. Provide the base url when construction a RefGetClient object and you can retrieve sequences from the remote server."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "rgc = refget.RefGetClient(\"https://refget.herokuapp.com/sequence/\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'CCACACCACA'"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=10)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACC'"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=50)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can also hit the `{digest}/metadata` and `service_info` API endpoints described in the refget API specification:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'metadata': {'aliases': [{'alias': 'ga4gh:SQ.lZyxiD_ByprhOUzrR1o1bq0ezO_1gkrn',\n",
- " 'naming_authority': 'ga4gh'},\n",
- " {'alias': 'I', 'naming_authority': 'unknown'}],\n",
- " 'length': 230218,\n",
- " 'md5': '6681ac2f62509cfc220d78751b8dc524',\n",
- " 'trunc512': '959cb1883fc1ca9ae1394ceb475a356ead1ecceff5824ae7'}}"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "rgc.meta(\"6681ac2f62509cfc220d78751b8dc524\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'service': {'algorithms': ['ga4gh', 'md5', 'trunc512'],\n",
- " 'circular_supported': True,\n",
- " 'subsequence_limit': None,\n",
- " 'supported_api_versions': ['1.0.0']}}"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "rgc.service_info"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "When requesting a sequence that is not found, the service responds appropriately:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'Not Found'"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "rgc.refget(trunc512_digest('TCGATCGA'))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Use a local database for caching\n",
- "\n",
- "By default, any full-sequences retrieved from an API are cached locally in memory (in a Python Dict). This data will not persist past a current session, but is useful if you have an application that requires repeated requests. here, we re-request the sequence requested above. It is much faster this time because it uses a local cache:\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'CCACACCACA'"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=10)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can also add new sequences into the database:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'Not Found'"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "rgc.refget(refget.md5('TCGATCGA')) # This sequence is not found in our database yet"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [],
- "source": [
- "checksum = rgc.load_seq(\"TCGATCGA\") # So, let's add it into database"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'TCGATCGA'"
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "rgc.refget(checksum) # This time it returns"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Keep in mind that sequences added in this way are added to your *local* database, not to the remote API, so when we restart, they will be gone:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [],
- "source": [
- "del rgc"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'Not Found'"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "rgc = refget.RefGetClient(\"https://refget.herokuapp.com/sequence/\")\n",
- "rgc.refget(refget.md5('TCGA'))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Making data persist\n",
- "\n",
- "If you want to retain your local cache, you can use a Dict that is backed by some persistent storage, such as a database on disk or another running process. There are many ways to do this, for example, you can use an sqlite database, a Redis database, or a MongoDB database. Here we'll show you how to use the `sqlitedict` package to back your local database.\n",
- "\n",
- "To start, you need to create a dict object and pass that to the RefGetClient constructor."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [],
- "source": [
- "import refget\n",
- "from sqlitedict import SqliteDict\n",
- "mydict = SqliteDict('./my_db.sqlite', autocommit=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "rgc = refget.RefGetClient(\"https://refget.herokuapp.com/sequence/\", mydict)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now when we retrieve a sequence it will be added to the local sqlite database automatically."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACC'"
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=50)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Look, we can see that this object has been added to our sqlite database:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'CACACCACACCCACACACCCACACACCACACCACACACCACACCACACC'"
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "mydict[\"6681ac2f62509cfc220d78751b8dc524\"][1:50]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "So now if we kill this object and start it up again *without the API connection*, but with the mydict local backend, we can still retrieve it:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [],
- "source": [
- "del rgc"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [],
- "source": [
- "rgc = refget.RefGetClient(database=mydict)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACC'"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=50)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Loading a fasta file\n",
- "\n",
- "The package also comes with a helper function for computing checksums for an entire fasta file."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [],
- "source": [
- "fa_file = \"../demo_fasta/demo.fa\"\n",
- "content = rgc.load_fasta(fa_file)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[{'name': 'chr1',\n",
- " 'length': 4,\n",
- " 'sequence_digest': 'f1f8f4bf413b16ad135722aa4591043e'},\n",
- " {'name': 'chr2',\n",
- " 'length': 4,\n",
- " 'sequence_digest': '45d0ff9f1a9504cf2039f89c1ffb4c32'}]"
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "content"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'ACGT'"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "rgc.refget(content[0]['sequence_digest'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "No remote URL connected\n"
- ]
- }
- ],
- "source": [
- "rgc.refget(\"blah\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [],
- "source": [
- "rgc.api_url_base = \"https://refget.herokuapp.com/sequence/\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'Not Found'"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "rgc.refget(\"blah\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [],
- "source": [
- "# You can show the complete contents of the database like this:\n",
- "# rgc.show()\n"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/interactive_tests.py b/interactive_tests.py
new file mode 100644
index 0000000..e16cdaa
--- /dev/null
+++ b/interactive_tests.py
@@ -0,0 +1,168 @@
+import seqcol
+from seqcol import SeqColHenge
+
+
+scc = SeqColHenge(database={}, schemas=["seqcol/schemas/SeqColArraySet.yaml"])
+scc
+
+fa_file = "demo_fasta/demo0.fa"
+fa_object = seqcol.parse_fasta(fa_file)
+
+skip_seq = False
+aslist = []
+names = []
+lengths = []
+lengthsi = []
+sequences = []
+for k in fa_object.keys():
+ seq = str(fa_object[k])
+ names.append(k)
+ lengths.append(str(len(seq)))
+ lengthsi.append(len(seq))
+ sequences.append(seq)
+
+array_set = {"names": names, "lengths": lengths, "sequences": sequences}
+
+collection_checksum = scc.insert(array_set, "SeqColArraySet")
+collection_checksum
+
+scc.retrieve(collection_checksum)
+scc.retrieve(collection_checksum, reclimit=1)
+
+# scc.retrieve("5c4b07f08319d3d0815f5ee25c45916a01f9d1519f0112e8")
+
+scc.retrieve(collection_checksum, reclimit=1)
+scc.retrieve(collection_checksum, reclimit=2)
+scc.retrieve(collection_checksum)
+scc.supports_inherent_attrs
+
+# Now a test of inherent attributes
+import seqcol
+
+scci = seqcol.SeqColHenge(database={}, schemas=["seqcol/schemas/SeqColArraySetInherent.yaml"])
+scci
+scci.schemas
+
+
+fa_file = "demo_fasta/demo0.fa"
+fa_object = seqcol.parse_fasta(fa_file)
+
+array_set_i = {"names": names, "lengths": lengthsi, "sequences": sequences, "author": "urkel"}
+array_set_i2 = {"names": names, "lengths": lengthsi, "sequences": sequences, "author": "nathan"}
+
+
+di = scci.insert(array_set_i, "SeqColArraySet")
+di = scci.insert(array_set_i2, "SeqColArraySet")
+di
+# scc.retrieve(di)
+scci.retrieve(di)
+fasta_path = "demo_fasta"
+fasta1 = "demo2.fa"
+fasta2 = "demo3.fa"
+fasta5 = "demo5.fa.gz"
+fasta6 = "demo6.fa"
+
+import os
+
+d = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta1))
+d2 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta2))
+d2 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta2))
+d5 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta5))
+d6 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta6))
+scci.retrieve(d["digest"])
+
+scci.retrieve(d5["digest"])
+
+fa_object = seqcol.parse_fasta(os.path.join(fasta_path, fasta1))
+SCAS = seqcol.fasta_to_csc(fa_object)
+digest = scci.insert(SCAS, "SeqColArraySet", reclimit=1)
+
+
+d["digest"]
+d2["digest"]
+
+scci.compare_digests(d["digest"], d2["digest"])
+scci.compare(d["SCAS"], d2["SCAS"])
+
+import json
+
+json.dumps(scci.compare(d5["SCAS"], d6["SCAS"]))
+print(
+ json.dumps(
+ scci.compare(d5["SCAS"], d6["SCAS"]),
+ separators=(",", ":"),
+ ensure_ascii=False,
+ allow_nan=False,
+ sort_keys=True,
+ indent=2,
+ )
+)
+
+build_sorted_name_length_pairs(array_set_i)
+
+# reorder
+
+array_set_reordered = {}
+for k, v in array_set.items():
+ print(k, v)
+ array_set_reordered[k] = list(reversed(v))
+
+array_set
+array_set_reordered
+
+build_sorted_name_length_pairs(array_set)
+build_sorted_name_length_pairs(array_set_reordered)
+
+
+import henge
+
+
+from henge import md5
+
+names = []
+lengths = []
+seq_digests = []
+for k in fa_object.keys():
+ seq = str(fa_object[k])
+ names.append(k)
+ lengths.append(str(len(seq)))
+ seq_digests.append(scc.checksum_function(seq))
+
+
+array_set = {"names": names, "lengths": lengths, "sequences": seq_digests}
+array_set
+collection_checksum = scc.insert(array_set, "SeqColArraySet", reclimit=1)
+
+scc.database = {}
+scc.retrieve("d229d5c16b3a1b3788f01aa439f01e682ba84bc9935ad08a", reclimit=1)
+scc.retrieve("d229d5c16b3a1b3788f01aa439f01e682ba84bc9935ad08a", reclimit=2)
+scc.database[collection_checksum]
+scc.checksum_function
+
+
+scc.database["d229d5c16b3a1b3788f01aa439f01e682ba84bc9935ad08a"]
+
+scc.database["a99fe2e92875099c84f73b20ef8e7dd2f2d12f063383bed0"]
+scc.database["ca82b053295b6f49923d0b2cedb83de49c6be59688c3dfd9"]
+
+
+import os
+
+os.getcwd()
+
+
+## standalone functions
+
+import seqcol
+
+fa_file = "demo_fasta/demo0.fa"
+fa_object = seqcol.parse_fasta(fa_file)
+
+# get a canonical seqcol object
+csc = seqcol.fasta_to_csc(fa_object)
+csc
+import json
+
+print(json.dumps(csc, indent=2))
+
+
+seqcol.seqcol_digest(csc)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..9348158
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,4 @@
+[tool.black]
+line-length = 99
+target-version = ['py38', 'py311']
+include = '\.pyi?$'
diff --git a/refget/__init__.py b/refget/__init__.py
index 3841f46..897c9fa 100644
--- a/refget/__init__.py
+++ b/refget/__init__.py
@@ -1,11 +1,12 @@
# Project configuration, particularly for logging.
import logging
+
from ._version import __version__
+from .const import *
from .hash_functions import *
from .refget import RefGetClient
from .refget import parse_fasta
-
-
-__classes__ = ["RefGetClient"]
-__all__ = __classes__ + []
+from .seqcol import *
+from .utilities import *
+from .seqcol_client import *
diff --git a/refget/_version.py b/refget/_version.py
index 3dc1f76..d3ec452 100644
--- a/refget/_version.py
+++ b/refget/_version.py
@@ -1 +1 @@
-__version__ = "0.1.0"
+__version__ = "0.2.0"
diff --git a/refget/const.py b/refget/const.py
new file mode 100644
index 0000000..987b644
--- /dev/null
+++ b/refget/const.py
@@ -0,0 +1,51 @@
+import os
+
+
+def _schema_path(name):
+ return os.path.join(SCHEMA_FILEPATH, name)
+
+
+CONTENT_ALL_A_IN_B = 2**0
+CONTENT_ALL_B_IN_A = 2**1
+LENGTHS_ALL_A_IN_B = 2**2
+LENGTHS_ALL_B_IN_A = 2**3
+NAMES_ALL_A_IN_B = 2**4
+NAMES_ALL_B_IN_A = 2**5
+TOPO_ALL_A_IN_B = 2**6
+TOPO_ALL_B_IN_A = 2**7
+CONTENT_ANY_SHARED = 2**8
+LENGTHS_ANY_SHARED = 2**9
+NAMES_ANY_SHARED = 2**10
+CONTENT_A_ORDER = 2**11
+CONTENT_B_ORDER = 2**12
+
+FLAGS = {
+ CONTENT_ALL_A_IN_B: "CONTENT_ALL_A_IN_B",
+ CONTENT_ALL_B_IN_A: "CONTENT_ALL_B_IN_A",
+ LENGTHS_ALL_A_IN_B: "LENGTHS_ALL_A_IN_B",
+ LENGTHS_ALL_B_IN_A: "LENGTHS_ALL_B_IN_A",
+ NAMES_ALL_A_IN_B: "NAMES_ALL_A_IN_B",
+ NAMES_ALL_B_IN_A: "NAMES_ALL_B_IN_A",
+ TOPO_ALL_A_IN_B: "TOPO_ALL_A_IN_B",
+ TOPO_ALL_B_IN_A: "TOPO_ALL_B_IN_A",
+ CONTENT_ANY_SHARED: "CONTENT_ANY_SHARED",
+ LENGTHS_ANY_SHARED: "LENGTHS_ANY_SHARED",
+ NAMES_ANY_SHARED: "NAMES_ANY_SHARED",
+ CONTENT_A_ORDER: "CONTENT_A_ORDER",
+ CONTENT_B_ORDER: "CONTENT_B_ORDER",
+}
+
+KNOWN_TOPOS = ["linear", "circular"]
+NAME_KEY = "name"
+SEQ_KEY = "sequence"
+TOPO_KEY = "topology"
+LEN_KEY = "length"
+
+# internal schemas paths determination
+ASL_NAME = "AnnotatedSequenceList"
+SCHEMA_NAMES = [ASL_NAME + ".yaml"]
+SCHEMA_FILEPATH = os.path.join(os.path.dirname(__file__), "schemas")
+INTERNAL_SCHEMAS = [_schema_path(s) for s in SCHEMA_NAMES]
+
+# Alias dict to make typehinting clearer
+SeqCol = dict
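The `FLAGS` map turns a comparison bitmask back into names. A minimal decoding sketch (assuming `refget.const` exports `FLAGS` as defined above):

```python
from refget.const import FLAGS

def explain_flag(flag: int) -> list:
    """Name every flag bit set in a comparison bitmask."""
    return [name for bit, name in FLAGS.items() if flag & bit]

explain_flag(2**0 + 2**8)  # ['CONTENT_ALL_A_IN_B', 'CONTENT_ANY_SHARED']
```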
diff --git a/refget/exceptions.py b/refget/exceptions.py
new file mode 100644
index 0000000..fd7bde0
--- /dev/null
+++ b/refget/exceptions.py
@@ -0,0 +1,10 @@
+class InvalidSeqColError(Exception):
+ """Object was not validated successfully according to schema."""
+
+ def __init__(self, message, errors):
+ super().__init__(message)
+ self.message = message
+ self.errors = errors
+
+ def __str__(self):
+ return f"InvalidSeqColError ({self.message}): {self.errors}"
diff --git a/refget/hash_functions.py b/refget/hash_functions.py
index 234d98a..34a29b3 100644
--- a/refget/hash_functions.py
+++ b/refget/hash_functions.py
@@ -6,32 +6,38 @@
import hashlib
import binascii
+
def trunc512_digest(seq, offset=24):
- digest = hashlib.sha512(seq.encode('utf-8')).digest()
+ digest = hashlib.sha512(seq.encode("utf-8")).digest()
hex_digest = binascii.hexlify(digest[:offset])
- return hex_digest.decode("utf-8")
+ return hex_digest.decode("utf-8")
+
def ga4gh_digest(seq, digest_size=24):
# b64 encoding results in 4/3 size expansion of data and padded if
# not multiple of 3, which doesn't make sense for this use
assert digest_size % 3 == 0, "digest size must be multiple of 3"
- digest = hashlib.sha512(seq.encode('utf-8')).digest()
+ digest = hashlib.sha512(seq.encode("utf-8")).digest()
return _ga4gh_format(digest, digest_size)
+
def _ga4gh_format(digest, digest_size=24):
tdigest_b64us = base64.urlsafe_b64encode(digest[:digest_size])
return "ga4gh:SQ.{}".format(tdigest_b64us.decode("utf-8"))
+
def ga4gh_to_trunc512(vmc):
- base64_strip = vmc.replace("ga4gh:SQ.","")
+ base64_strip = vmc.replace("ga4gh:SQ.", "")
digest = base64.urlsafe_b64decode(base64_strip)
hex_digest = binascii.hexlify(digest)
- return hex_digest.decode("utf-8")
+ return hex_digest.decode("utf-8")
+
def trunc512_to_ga4gh(trunc512):
- digest_length = len(trunc512)*2
+    digest_length = len(trunc512) // 2  # two hex characters per byte
digest = binascii.unhexlify(trunc512)
return _ga4gh_format(digest, digest_length)
+
def md5(seq):
return hashlib.md5(seq.encode()).hexdigest()
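Most of this hunk is black reformatting. For reference, expected outputs (digest values taken from the tutorial notebook removed above):

```python
from refget.hash_functions import trunc512_digest, ga4gh_digest, ga4gh_to_trunc512

trunc512_digest("ACGT")  # '68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36'
trunc512_digest("TCGA")  # '3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce'

# ga4gh digests are the base64url encoding of the same truncated SHA-512 bytes
ga4gh_to_trunc512(ga4gh_digest("ACGT")) == trunc512_digest("ACGT")  # True
```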
diff --git a/refget/refget.py b/refget/refget.py
index 08468d6..7bc351b 100644
--- a/refget/refget.py
+++ b/refget/refget.py
@@ -12,9 +12,7 @@
_LOGGER = logging.getLogger(__name__)
henge.ITEM_TYPE = "_item_type"
-SCHEMA_FILEPATH = os.path.join(
- os.path.dirname(__file__),
- "schemas")
+SCHEMA_FILEPATH = os.path.join(os.path.dirname(__file__), "schemas")
sequence_schema = """description: "Schema for a single raw sequence"
henge_class: sequence
@@ -23,9 +21,17 @@
"""
-
class RefGetClient(henge.Henge):
- def __init__(self, api_url_base=None, database={}, schemas=[], schemas_str=[], henges=None, checksum_function=henge.md5, suppress_connect=True):
+ def __init__(
+ self,
+ api_url_base=None,
+ database={},
+ schemas=[],
+ schemas_str=[],
+ henges=None,
+ checksum_function=henge.md5,
+ suppress_connect=True,
+ ):
"""
A user interface to insert and retrieve decomposable recursive unique
identifiers (DRUIDs).
@@ -35,29 +41,26 @@ def __init__(self, api_url_base=None, database={}, schemas=[], schemas_str=[], h
data types stored by this Henge
:param function(str) -> str checksum_function: Default function to handle the digest of the
serialized items stored in this henge.
- """
+ """
self.api_url_base = api_url_base
self.local_cache = database
self.info = None
if not suppress_connect:
self.info = self.get_service_info()
-
# These are the item types that this henge can understand.
- if len(schemas) ==0 and len(schemas_str) == 0:
+ if len(schemas) == 0 and len(schemas_str) == 0:
schemas_str = [sequence_schema]
- super(RefGetClient, self).__init__(database, schemas, schemas_str, henges=henges,
- checksum_function=checksum_function)
+ super(RefGetClient, self).__init__(
+ database, schemas, schemas_str, henges=henges, checksum_function=checksum_function
+ )
def refget_remote(self, digest, start=None, end=None):
if not self.api_url_base:
print("No remote URL connected")
return
- url = "{base}{digest}?accept=text/plain".format(
- base=self.api_url_base,
- digest=digest)
-
+ url = "{base}{digest}?accept=text/plain".format(base=self.api_url_base, digest=digest)
if start is not None and end is not None:
full_retrieved = False
@@ -66,11 +69,11 @@ def refget_remote(self, digest, start=None, end=None):
full_retrieved = True
r = requests.get(url)
- result = r.content.decode('utf-8')
+ result = r.content.decode("utf-8")
if full_retrieved:
self.load_seq(result)
-
+
return result
def refget(self, digest, start=None, end=None):
@@ -81,7 +84,7 @@ def refget(self, digest, start=None, end=None):
result = full_data[start:end]
else:
result = full_data
-
+
return result
except henge.NotFoundException:
return self.refget_remote(digest, start, end)
@@ -96,22 +99,18 @@ def service_info(self):
if not self.info:
self.info = self.get_service_info()
return self.info
-
+
def meta(self, digest):
- url = "{base}{digest}/metadata".format(
- base=self.api_url_base,
- digest=digest)
-
+ url = "{base}{digest}/metadata".format(base=self.api_url_base, digest=digest)
+
r = requests.get(url)
- return json.loads(r.content.decode('utf-8'))
-
+ return json.loads(r.content.decode("utf-8"))
+
def get_service_info(self):
- url = "{base}service-info".format(
- base=self.api_url_base)
-
- r = requests.get(url)
- return json.loads(r.content.decode('utf-8'))
+ url = "{base}service-info".format(base=self.api_url_base)
+ r = requests.get(url)
+ return json.loads(r.content.decode("utf-8"))
def load_fasta(self, fa_file, lengths_only=False):
"""
@@ -123,9 +122,7 @@ def load_fasta(self, fa_file, lengths_only=False):
for k in fa_object.keys():
seq = str(fa_object[k])
seq_digest = self.load_seq(seq)
- asdlist.append({'name': k,
- 'length': len(seq),
- 'sequence_digest': seq_digest})
+ asdlist.append({"name": k, "length": len(seq), "sequence_digest": seq_digest})
_LOGGER.debug(asdlist)
return asdlist
@@ -139,30 +136,29 @@ def load_sequence_dict(self, seqset):
for k, v in seqset.items():
if isinstance(v, str):
seq = v
- v = {'sequence': seq}
- if 'length' not in v.keys():
- if 'sequence' not in v.keys():
- _LOGGER.warning(
- "Each sequence must have either length or a sequence.")
+ v = {"sequence": seq}
+ if "length" not in v.keys():
+ if "sequence" not in v.keys():
+ _LOGGER.warning("Each sequence must have either length or a sequence.")
else:
- v['length'] = len(v['sequence'])
- if 'sequence' in v.keys():
- v['sequence_digest'] = self.load_seq(seq)
- del v['sequence']
- if 'name' not in v.keys():
- v['name'] = k
- if 'toplogy' not in v.keys():
- v['toplogy'] = 'linear'
+ v["length"] = len(v["sequence"])
+ if "sequence" in v.keys():
+ v["sequence_digest"] = self.load_seq(seq)
+ del v["sequence"]
+ if "name" not in v.keys():
+ v["name"] = k
+ if "toplogy" not in v.keys():
+ v["toplogy"] = "linear"
seqset_new[k] = v
- collection_checksum = self.insert(list(seqset_new.values()), 'ASDList')
+ collection_checksum = self.insert(list(seqset_new.values()), "ASDList")
return collection_checksum, seqset_new
-
# Static functions below (these don't require a database)
+
def parse_fasta(fa_file):
_LOGGER.debug("Hashing {}".format(fa_file))
try:
@@ -176,4 +172,4 @@ def parse_fasta(fa_file):
fa_file_unzipped = fa_file.replace(".gz", "")
fa_object = pyfaidx.Fasta(fa_file_unzipped)
os.system("gzip {}".format(fa_file_unzipped))
- return fa_object
\ No newline at end of file
+ return fa_object
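The client API is unchanged by the reformatting. A usage sketch, with the endpoint and digest taken from the tutorial notebook removed above:

```python
import refget

rgc = refget.RefGetClient("https://refget.herokuapp.com/sequence/")
rgc.refget("6681ac2f62509cfc220d78751b8dc524", start=0, end=10)  # 'CCACACCACA'

# Sequences loaded locally are cached in the client's database
checksum = rgc.load_seq("TCGATCGA")
rgc.refget(checksum)  # 'TCGATCGA'
```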
diff --git a/refget/schemas/AnnotatedSequenceList.yaml b/refget/schemas/AnnotatedSequenceList.yaml
new file mode 100644
index 0000000..320fdf9
--- /dev/null
+++ b/refget/schemas/AnnotatedSequenceList.yaml
@@ -0,0 +1,33 @@
+description: "Schema for List of ASDs"
+henge_class: "AnnotatedSequenceList"
+recursive: true
+type: array
+items:
+ description: "Schema for an Annotated Sequence Digest; a digested Sequence plus metadata"
+ type: object
+ henge_class: ASD
+ properties:
+ name:
+ type: string
+ length:
+ type: "integer"
+ topology:
+ type: string
+ enum: ["circular", "linear"]
+ default: "linear"
+ sequence:
+ description: "Schema for a single raw sequence"
+ henge_class: sequence
+ type: object
+ properties:
+ sequence:
+ type: string
+ description: "Actual sequence content"
+ required:
+ - sequence
+ required:
+ - length
+ - name
+ - topology
+ recursive:
+ - sequence
diff --git a/refget/schemas/RawSeqCol.yaml b/refget/schemas/RawSeqCol.yaml
new file mode 100644
index 0000000..1881280
--- /dev/null
+++ b/refget/schemas/RawSeqCol.yaml
@@ -0,0 +1,19 @@
+description: "Schema for List of ASDs"
+type: array
+henge_class: "RawSeqCol"
+items:
+ description: "Schema for an Annotated Sequence Digest; a digested Sequence plus metadata"
+ type: object
+ henge_class: ASD
+ properties:
+ name:
+ type: string
+ length:
+ type: "integer"
+ sequence:
+ description: "Actual sequence content for a single raw sequence"
+ henge_class: sequence
+ type: string
+ required:
+ - length
+ - name
diff --git a/refget/schemas/SeqColArraySet.yaml b/refget/schemas/SeqColArraySet.yaml
new file mode 100644
index 0000000..d37158f
--- /dev/null
+++ b/refget/schemas/SeqColArraySet.yaml
@@ -0,0 +1,27 @@
+description: "SeqColArraySet"
+type: object
+henge_class: "SeqColArraySet"
+properties:
+ topologies:
+ type: array
+ henge_class: "array"
+ items:
+ type: string
+ enum: ["circular", "linear"]
+ default: "linear"
+ names:
+ type: array
+ henge_class: "array"
+ items:
+ type: string
+ lengths:
+ type: array
+ henge_class: "array"
+ items:
+ type: string
+ sequences:
+ type: array
+ henge_class: "seqarray"
+ items:
+ type: string
+ henge_class: sequence
diff --git a/refget/schemas/SeqColArraySetInherent.yaml b/refget/schemas/SeqColArraySetInherent.yaml
new file mode 100644
index 0000000..743cc28
--- /dev/null
+++ b/refget/schemas/SeqColArraySetInherent.yaml
@@ -0,0 +1,36 @@
+description: "SeqColArraySet"
+type: object
+henge_class: "SeqColArraySet"
+properties:
+ topologies:
+ type: array
+ henge_class: "strarray"
+ items:
+ type: string
+ enum: ["circular", "linear"]
+ default: "linear"
+ names:
+ type: array
+ henge_class: "strarray"
+ items:
+ type: string
+ lengths:
+ type: array
+ henge_class: "intarray"
+ items:
+ type: integer
+ sequences:
+ type: array
+ henge_class: "seqarray"
+ items:
+ type: string
+ henge_class: sequence
+ sorted_name_length_pairs:
+ type: array
+ henge_class: "strarray"
+ items:
+ type: string
+inherent:
+ - names
+ - lengths
+ - sequences
diff --git a/refget/schemas/TASeqCol.yaml b/refget/schemas/TASeqCol.yaml
new file mode 100644
index 0000000..19a4eb7
--- /dev/null
+++ b/refget/schemas/TASeqCol.yaml
@@ -0,0 +1,31 @@
+description: "TAseqcol"
+type: object
+henge_class: "TASeqCol"
+properties:
+ topology:
+ type: array
+ henge_class: "TopologyVec"
+ items:
+ type: string
+ enum: ["circular", "linear"]
+ default: "linear"
+ rawseqcol:
+ description: "Schema for List of ASDs"
+ type: array
+ henge_class: "RawSeqCol"
+ items:
+ description: "Schema for an Annotated Sequence Digest; a digested Sequence plus metadata"
+ type: object
+ henge_class: ASD
+ properties:
+ name:
+ type: string
+ length:
+ type: "integer"
+ sequence:
+ type: string
+ henge_class: sequence
+ description: "Actual sequence content"
+ required:
+ - length
+ - name
diff --git a/refget/schemas/seqcol.yaml b/refget/schemas/seqcol.yaml
new file mode 100644
index 0000000..42a722b
--- /dev/null
+++ b/refget/schemas/seqcol.yaml
@@ -0,0 +1,28 @@
+description: "A collection of biological sequences."
+type: object
+properties:
+ lengths:
+ type: array
+ collated: true
+ description: "Number of elements, such as nucleotides or amino acids, in each sequence."
+ items:
+ type: integer
+ names:
+ type: array
+ collated: true
+ description: "Human-readable identifiers of each sequence (chromosome names)."
+ items:
+ type: string
+ sequences:
+ type: array
+ collated: true
+ items:
+ type: string
+ description: "Digests of sequences computed using the GA4GH digest algorithm (sha512t24u)."
+required:
+ - lengths
+ - names
+inherent:
+ - lengths
+ - names
+ - sequences
\ No newline at end of file
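An object satisfying this schema, using the names and lengths from `demo_fasta/demo0.fa` above (sequence digests omitted, since only `lengths` and `names` are required; a sketch only):

```python
seqcol_obj = {
    "lengths": [8, 4, 4],
    "names": ["chrX", "chr1", "chr2"],
    # "sequences": [...]  # sha512t24u digests, one per sequence
}
```

The `inherent` list marks which attributes contribute to the collection's digest; non-inherent attributes (such as `sorted_name_length_pairs` in `SeqColArraySetInherent.yaml` above) are stored but do not affect identity.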
diff --git a/refget/seqcol.py b/refget/seqcol.py
new file mode 100644
index 0000000..ba6ff56
--- /dev/null
+++ b/refget/seqcol.py
@@ -0,0 +1,199 @@
+import henge
+import logging
+import yacman
+
+from itertools import compress
+
+from .const import *
+from .utilities import *
+
+
+_LOGGER = logging.getLogger(__name__)
+henge.ITEM_TYPE = "_item_type"
+
+
+class SeqColConf(yacman.YAMLConfigManager):
+ """
+ Simple configuration manager object for SeqColHenge.
+ """
+
+ def __init__(
+ self,
+ entries={},
+ filepath=None,
+ yamldata=None,
+ writable=False,
+ wait_max=60,
+ skip_read_lock=False,
+ ):
+ filepath = yacman.select_config(
+ config_filepath=filepath, config_env_vars=["SEQCOLAPI_CONFIG"], config_name="seqcol"
+ )
+ super(SeqColConf, self).__init__(entries, filepath, yamldata, writable)
+
+
+class SeqColHenge(henge.Henge):
+ """
+ Extension of henge that accommodates collections of sequences.
+ """
+
+ def __init__(
+ self,
+ database={},
+ schemas=None,
+ henges=None,
+ checksum_function=sha512t24u_digest,
+ ):
+ """
+ A user interface to insert and retrieve decomposable recursive unique
+ identifiers (DRUIDs).
+
+ :param dict database: Dict-like lookup database with sequences
+ and hashes
+ :param dict schemas: One or more jsonschema schemas describing the
+ data types stored by this Henge
+ :param function(str) -> str checksum_function: Default function to
+ handle the digest of the
+ serialized items stored in this henge.
+ """
+ super(SeqColHenge, self).__init__(
+ database=database,
+ schemas=schemas or INTERNAL_SCHEMAS,
+ henges=henges,
+ checksum_function=checksum_function,
+ )
+ _LOGGER.info("Initializing SeqColHenge")
+
+ def load_fasta(self, fa_file, skip_seq=False, topology_default="linear"):
+ """
+ Load a sequence collection into the database
+
+ :param str fa_file: path to the FASTA file to parse and load
+        :param bool skip_seq: whether to disregard the actual sequences,
+            load just the names and lengths and topology
+ :param str topology_default: the default topology assigned to
+ every sequence
+ """
+        # TODO: any systematic way to infer topology from a FASTA file?
+ if topology_default not in KNOWN_TOPOS:
+ raise ValueError(
+ f"Invalid topology ({topology_default}). " f"Choose from: {','.join(KNOWN_TOPOS)}"
+ )
+ fa_object = parse_fasta(fa_file)
+ aslist = []
+ for k in fa_object.keys():
+ seq = str(fa_object[k])
+ aslist.append(
+ {
+ NAME_KEY: k,
+ LEN_KEY: len(seq),
+ TOPO_KEY: topology_default,
+ SEQ_KEY: {"" if skip_seq else SEQ_KEY: seq},
+ }
+ )
+ collection_checksum = self.insert(aslist, ASL_NAME)
+ _LOGGER.debug(f"Loaded {ASL_NAME}: {aslist}")
+ return collection_checksum, aslist
+
+ def load_fasta2(self, fa_file, skip_seq=False, topology_default="linear"):
+ """
+ Load a sequence collection into the database
+
+ :param str fa_file: path to the FASTA file to parse and load
+        :param bool skip_seq: whether to disregard the actual sequences,
+            load just the names and lengths and topology
+ :param str topology_default: the default topology assigned to
+ every sequence
+ """
+        # TODO: any systematic way to infer topology from a FASTA file?
+ _LOGGER.info("Loading fasta file...")
+ fa_object = parse_fasta(fa_file)
+ aslist = []
+ for k in fa_object.keys():
+ seq = str(fa_object[k])
+ _LOGGER.info("Loading key: {k} / Length: {l}...".format(k=k, l=len(seq)))
+ aslist.append(
+ {
+ NAME_KEY: k,
+ LEN_KEY: len(seq),
+ TOPO_KEY: topology_default,
+ SEQ_KEY: "" if skip_seq else seq,
+ }
+ )
+ _LOGGER.info("Inserting into database...")
+ collection_checksum = self.insert(aslist, "RawSeqCol")
+ _LOGGER.debug(f"Loaded {ASL_NAME}: {aslist}")
+ return collection_checksum, aslist
+
+ def compare_digests(self, digestA, digestB):
+ A = self.retrieve(digestA, reclimit=1)
+ B = self.retrieve(digestB, reclimit=1)
+ # _LOGGER.info(A)
+ # _LOGGER.info(B)
+ return compare_seqcols(A, B)
+
+    def retrieve(self, druid, reclimit=None, raw=False):
+        try:
+            return super(SeqColHenge, self).retrieve(druid, reclimit, raw)
+        except henge.NotFoundException as e:
+            _LOGGER.debug(e)
+            try:
+                # Fall back to refget if the collection is not in the local database
+                return self.refget(druid)
+            except Exception as e2:
+                _LOGGER.debug(e2)
+                raise henge.NotFoundException(
+                    "{} not found in database, or in refget.".format(druid)
+                ) from e2
+
+ def load_fasta_from_refgenie(self, rgc, refgenie_key):
+ """
+ @param rgc RefGenConf object
+ @param refgenie_key key of genome to load
+ """
+ filepath = rgc.seek(refgenie_key, "fasta")
+ return self.load_fasta_from_filepath(filepath)
+
+ def load_fasta_from_filepath(self, filepath):
+ """
+ @param filepath Path to fasta file
+ """
+ fa_object = parse_fasta(filepath)
+ SCAS = fasta_obj_to_seqcol(fa_object, digest_function=self.checksum_function)
+ digest = self.insert(SCAS, "SeqColArraySet", reclimit=1)
+ return {
+ "fa_file": filepath,
+ "fa_object": fa_object,
+ "SCAS": SCAS,
+ "digest": digest,
+ }
+
+ def load_from_chromsizes(self, chromsizes):
+ """
+ @param chromsizes Path to chromsizes file
+ """
+ SCAS = chrom_sizes_to_seqcol(chromsizes, digest_function=self.checksum_function)
+ digest = self.insert(SCAS, "SeqColArraySet", reclimit=1)
+ return {
+ "chromsizes_file": chromsizes,
+ "SCAS": SCAS,
+ "digest": digest,
+ }
+
+ def load_multiple_fastas(self, fasta_dict):
+ """
+ Wrapper for load_fasta_from_filepath
+
+ @param fasta_list
+ """
+ results = {}
+ for name in fasta_dict.keys():
+ path = fasta_dict[name]["fasta"]
+ print(f"Processing fasta '{name}'' at path '{path}'...")
+ results[name] = self.load_fasta_from_filepath(path)
+ return results
diff --git a/refget/seqcol_client.py b/refget/seqcol_client.py
new file mode 100644
index 0000000..58f4e40
--- /dev/null
+++ b/refget/seqcol_client.py
@@ -0,0 +1,49 @@
+import requests
+
+
+class SeqColClient(object):
+ """
+ A client for interacting with a sequence collection API.
+
+ Args:
+ url (str, optional): The base URL of the sequence collection API. Defaults to "http://seqcolapi.databio.org".
+
+ Attributes:
+ url (str): The base URL of the sequence collection API.
+
+ Methods:
+ get_collection(accession, level=2): Retrieves a sequence collection for a given accession and level.
+ """
+
+ def __init__(self, url="http://seqcolapi.databio.org"):
+ self.url = url
+
+ def get_collection(self, accession, level=2):
+ """
+ Retrieves a sequence collection for a given accession and detail level.
+
+ Args:
+ accession (str): The accession of the sequence collection.
+ level (int, optional): The level of detail for the sequence collection. Defaults to 2.
+
+ Returns:
+ dict: The JSON response containing the sequence collection.
+ """
+ url = f"{self.url}/collection/{accession}?level={level}"
+ response = requests.get(url)
+ return response.json()
+
+ def compare(self, accession1, accession2):
+ """
+ Compares two sequence collections.
+
+ Args:
+ accession1 (str): The accession of the first sequence collection.
+ accession2 (str): The accession of the second sequence collection.
+
+ Returns:
+ dict: The JSON response containing the comparison of the two sequence collections.
+ """
+ url = f"{self.url}/comparison/{accession1}/{accession2}"
+ response = requests.get(url)
+ return response.json()
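A usage sketch (the digests below are hypothetical placeholders):

```python
client = SeqColClient()  # defaults to http://seqcolapi.databio.org

collection = client.get_collection("DIGEST_A")  # hypothetical accession
comparison = client.compare("DIGEST_A", "DIGEST_B")  # hypothetical accessions
```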
diff --git a/refget/utilities.py b/refget/utilities.py
new file mode 100644
index 0000000..573e3ef
--- /dev/null
+++ b/refget/utilities.py
@@ -0,0 +1,312 @@
+import base64
+import binascii
+import hashlib
+import json
+import logging
+import os
+import pyfaidx
+
+from jsonschema import Draft7Validator
+from typing import Optional, Callable
+from yacman import load_yaml
+
+from .const import SeqCol
+from .exceptions import *
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def trunc512_digest(seq, offset=24) -> str:
+ """Deprecated GA4GH digest function"""
+ digest = hashlib.sha512(seq.encode()).digest()
+ hex_digest = binascii.hexlify(digest[:offset])
+ return hex_digest.decode()
+
+
+def sha512t24u_digest(seq: str, offset: int = 24) -> str:
+ """GA4GH digest function"""
+ digest = hashlib.sha512(seq.encode()).digest()
+ tdigest_b64us = base64.urlsafe_b64encode(digest[:offset])
+ return tdigest_b64us.decode("ascii")
+
+
+def canonical_str(item: dict) -> str:
+ """Convert a dict into a canonical string representation"""
+ return json.dumps(
+ item, separators=(",", ":"), ensure_ascii=False, allow_nan=False, sort_keys=True
+ )
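+# e.g. canonical_str({"b": 2, "a": 1}) == '{"a":1,"b":2}'
+# (sorted keys, no whitespace, so digests are reproducible)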
+
+
+def print_csc(csc: dict) -> None:
+ """Convenience function to pretty-print a canonical sequence collection"""
+ return print(json.dumps(csc, indent=2))
+
+
+def validate_seqcol_bool(seqcol_obj: SeqCol, schema=None) -> bool:
+ """
+ Validate a seqcol object against the seqcol schema. Returns True if valid, False if not.
+
+ To enumerate the errors, use validate_seqcol instead.
+ """
+ if schema is None:
+ schema_path = os.path.join(os.path.dirname(__file__), "schemas", "seqcol.yaml")
+ schema = load_yaml(schema_path)
+ validator = Draft7Validator(schema)
+ return validator.is_valid(seqcol_obj)
+
+
+def validate_seqcol(seqcol_obj: SeqCol, schema=None) -> Optional[dict]:
+ """Validate a seqcol object against the seqcol schema.
+ Returns True if valid, raises InvalidSeqColError if not, which enumerates the errors.
+ Retrieve individual errors with exception.errors
+ """
+ if schema is None:
+ schema_path = os.path.join(os.path.dirname(__file__), "schemas", "seqcol.yaml")
+ schema = load_yaml(schema_path)
+ validator = Draft7Validator(schema)
+ if not validator.is_valid(seqcol_obj):
+ errors = sorted(validator.iter_errors(seqcol_obj), key=lambda e: e.path)
+ raise InvalidSeqColError("Validation failed", errors)
+ return True
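+
+# Typical usage, per the docstring above (InvalidSeqColError carries the
+# individual jsonschema errors in its .errors attribute):
+# try:
+# validate_seqcol(my_seqcol)
+# except InvalidSeqColError as e:
+# for err in e.errors:
+# print(err.message)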
+
+
+def format_itemwise(csc: SeqCol) -> dict:
+ """
+ Format a SeqCol object into a dict holding a "sequences" list, one dict per sequence.
+ """
+ list_of_dicts = []
+ # TODO: handle all properties, not just these 3
+ # TODO: handle non-collated attributes, somehow
+ for i in range(len(csc["lengths"])):
+ list_of_dicts.append(
+ {
+ "name": csc["names"][i],
+ "length": csc["lengths"][i],
+ "sequence": csc["sequences"][i],
+ }
+ )
+ return {"sequences": list_of_dicts}
+
+
+def parse_fasta(fa_file) -> pyfaidx.Fasta:
+ """
+ Read a FASTA file, gzipped or uncompressed
+ """
+ try:
+ return pyfaidx.Fasta(fa_file)
+ except pyfaidx.UnsupportedCompressionFormat:
+ # pyfaidx can handle bgzip but not gzip, so we decompress into a
+ # temporary file and index that instead, leaving the original file intact.
+ from gzip import open as gzopen
+ from tempfile import NamedTemporaryFile
+
+ with gzopen(fa_file, "rt") as f_in, NamedTemporaryFile(mode="w+t", suffix=".fa") as f_out:
+ f_out.write(f_in.read())
+ f_out.seek(0)
+ return pyfaidx.Fasta(f_out.name)
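+
+# Note: since pyfaidx reads bgzip directly, recompressing inputs with bgzip
+# (rather than gzip) avoids this temporary-copy fallback entirely.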
+
+
+def chrom_sizes_to_digest(chrom_sizes_file_path: str) -> str:
+ """Given a chrom.sizes file, return a digest"""
+ seqcol_obj = chrom_sizes_to_seqcol(chrom_sizes_file_path)
+ return seqcol_digest(seqcol_obj)
+
+
+def chrom_sizes_to_seqcol(
+ chrom_sizes_file_path: str,
+ digest_function: Callable[[str], str] = sha512t24u_digest,
+) -> dict:
+ """Given a chrom.sizes file, return a canonical seqcol object"""
+ with open(chrom_sizes_file_path, "r") as f:
+ lines = f.readlines()
+ CSC = {"lengths": [], "names": [], "sequences": [], "sorted_name_length_pairs": []}
+ for line in lines:
+ line = line.strip()
+ if line == "":
+ continue
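+ # Note: this expects an extended, 4-column chrom.sizes format:
+ # name <TAB> length <TAB> GA4GH digest <TAB> MD5 digest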
+ seq_name, seq_length, ga4gh_digest, md5_digest = line.split("\t")
+ snlp = {"length": seq_length, "name": seq_name} # sorted_name_length_pairs
+ snlp_digest = digest_function(canonical_str(snlp))
+ CSC["lengths"].append(int(seq_length))
+ CSC["names"].append(seq_name)
+ CSC["sequences"].append(ga4gh_digest)
+ CSC["sorted_name_length_pairs"].append(snlp_digest)
+ CSC["sorted_name_length_pairs"].sort()
+ return CSC
+
+
+def fasta_file_to_digest(fa_file_path: str) -> str:
+ """Given a fasta, return a digest"""
+ seqcol_obj = fasta_file_to_seqcol(fa_file_path)
+ return seqcol_digest(seqcol_obj)
+
+
+def fasta_file_to_seqcol(fa_file_path: str) -> dict:
+ """Given a fasta, return a canonical seqcol object"""
+ fa_obj = parse_fasta(fa_file_path)
+ return fasta_obj_to_seqcol(fa_obj)
+
+
+def fasta_obj_to_seqcol(
+ fa_object: pyfaidx.Fasta,
+ verbose: bool = True,
+ digest_function: Callable[[str], str] = sha512t24u_digest,
+) -> dict:
+ """
+ Given a fasta object, return a CSC (Canonical Sequence Collection object)
+ """
+ # CSC = SeqColArraySet
+ # Or equivalently, a "Level 1 SeqCol"
+
+ CSC = {"lengths": [], "names": [], "sequences": [], "sorted_name_length_pairs": []}
+ seqs = fa_object.keys()
+ nseqs = len(seqs)
+ print(f"Found {nseqs} chromosomes")
+ for i, k in enumerate(seqs, 1):
+ if verbose:
+ print(f"Processing ({i} of {nseqs}) {k}...")
+ seq = str(fa_object[k])
+ seq_length = len(seq)
+ seq_name = fa_object[k].name
+ seq_digest = "SQ." + digest_function(seq.upper())
+ snlp = {"length": seq_length, "name": seq_name} # sorted_name_length_pairs
+ snlp_digest = digest_function(canonical_str(snlp))
+ CSC["lengths"].append(seq_length)
+ CSC["names"].append(seq_name)
+ CSC["sorted_name_length_pairs"].append(snlp_digest)
+ CSC["sequences"].append(seq_digest)
+ CSC["sorted_name_length_pairs"].sort()
+ return CSC
+
+
+def build_sorted_name_length_pairs(obj: dict, digest_function):
+ """Builds the sorted_name_length_pairs attribute, which corresponds to the coordinate system"""
+ sorted_name_length_pairs = []
+ for i in range(len(obj["names"])):
+ sorted_name_length_pairs.append({"length": obj["lengths"][i], "name": obj["names"][i]})
+ nl_digests = [] # name-length digests
+ for i in range(len(sorted_name_length_pairs)):
+ nl_digests.append(digest_function(canonical_str(sorted_name_length_pairs[i])))
+
+ nl_digests.sort()
+ return nl_digests
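+
+# For example, given {"names": ["chr1", "chr2"], "lengths": [10, 20], ...}, each
+# pair is digested as canonical_str({"length": 10, "name": "chr1"}) and so on, and
+# the digest strings are returned sorted.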
+
+
+def compare_seqcols(A: SeqCol, B: SeqCol):
+ """
+ Workhorse comparison function
+
+ @param A Sequence collection A
+ @param B Sequence collection B
+ @return dict Following formal seqcol specification comparison function return value
+ """
+ validate_seqcol(A) # First ensure these are the right structure
+ validate_seqcol(B)
+
+ # keys from A first, then any B-only keys, preserving a deterministic order
+ all_keys = list(A.keys()) + [k for k in B.keys() if k not in A]
+
+ # Compute lengths of each array; only do this for array attributes
+ a_lengths = {}
+ b_lengths = {}
+ for k in A.keys():
+ a_lengths[k] = len(A[k])
+ for k in B.keys():
+ b_lengths[k] = len(B[k])
+
+ return_obj = {
+ "attributes": {"a_only": [], "b_only": [], "a_and_b": []},
+ "array_elements": {
+ "a": a_lengths,
+ "b": b_lengths,
+ "a_and_b": {},
+ "a_and_b_same_order": {},
+ },
+ }
+
+ for k in all_keys:
+ _LOGGER.info(k)
+ if k not in A:
+ result[k] = {"flag": -1}
+ return_obj["attributes"]["b_only"].append(k)
+ # return_obj["array_elements"]["total"][k] = {"a": None, "b": len(B[k])}
+ elif k not in B:
+ return_obj["attributes"]["a_only"].append(k)
+ # return_obj["array_elements"]["total"][k] = {"a": len(A[k]), "b": None}
+ else:
+ return_obj["attributes"]["a_and_b"].append(k)
+ res = _compare_elements(A[k], B[k])
+ # return_obj["array_elements"]["total"][k] = {"a": len(A[k]), "b": len(B[k])}
+ return_obj["array_elements"]["a_and_b"][k] = res["a_and_b"]
+ return_obj["array_elements"]["a_and_b_same_order"][k] = res["a_and_b_same_order"]
+ return return_obj
+
+
+def _compare_elements(A: list, B: list):
+ """
+ Compare elements between two arrays. Helper function for individual elements used by workhorse compare_seqcols function
+ """
+
+ A_filtered = list(filter(lambda x: x in B, A))
+ B_filtered = list(filter(lambda x: x in A, B))
+ A_count = len(A_filtered)
+ B_count = len(B_filtered)
+ overlap = min(len(A_filtered), len(B_filtered)) # counts duplicates
+
+ if A_count + B_count < 1:
+ # no matching elements, so an order comparison is undefined
+ order = None
+ elif not (A_count == B_count == overlap):
+ # duplicated matches means order match is undefined
+ order = None
+ else:
+ order = A_filtered == B_filtered
+ return {"a_and_b": overlap, "a_and_b_same_order": order}
+
+
+def seqcol_digest(seqcol_obj: SeqCol, schema: dict = None) -> str:
+ """
+ Given a canonical sequence collection, compute its digest.
+
+ :param dict seqcol_obj: Dictionary representation of a canonical sequence collection object
+ :param dict schema: Schema defining the inherent attributes to digest
+ :return str: The sequence collection digest
+ """
+
+ validate_seqcol(seqcol_obj)
+ # Step 1a: Remove any non-inherent attributes,
+ # so that only the inherent attributes contribute to the digest.
+ seqcol_obj2 = {}
+ if schema:
+ for k in schema["inherent"]:
+ # Step 2: Apply RFC-8785 to canonicalize the value
+ # associated with each attribute individually.
+ seqcol_obj2[k] = canonical_str(seqcol_obj[k])
+ else: # no schema provided, so assume all attributes are inherent
+ for k in seqcol_obj:
+ seqcol_obj2[k] = canonical_str(seqcol_obj[k])
+ # Step 3: Digest each canonicalized attribute value
+ # using the GA4GH digest algorithm.
+
+ seqcol_obj3 = {}
+ for attribute in seqcol_obj2:
+ seqcol_obj3[attribute] = sha512t24u_digest(seqcol_obj2[attribute])
+ # print(json.dumps(seqcol_obj3, indent=2)) # visualize the result
+
+ # Step 4: Apply RFC-8785 again to canonicalize the JSON
+ # of new seqcol object representation.
+
+ seqcol_obj4 = canonical_str(seqcol_obj3)
+
+ # Step 5: Digest the final canonical representation again.
+ return sha512t24u_digest(seqcol_obj4)
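+
+# Sketch of the pipeline on a hypothetical collection (digest values elided; each
+# is a 32-character sha512t24u string):
+# {"names": [...], "lengths": [...], "sequences": [...]}
+# -> canonical_str per attribute -> sha512t24u_digest per attribute
+# -> canonical_str of the digest-valued object -> final sha512t24u_digest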
+
+
+def explain_flag(flag):
+ """Explains a compare flag"""
+ print(f"Flag: {flag}\nBinary: {bin(flag)}\n")
+ for e in range(0, 13):
+ if flag & 2**e:
+ print(FLAGS[2**e])
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index 0470f30..1afd5a4 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -1,5 +1,6 @@
-henge==0.1.1
+biopython
+henge>=0.2.1
pyfaidx
+pyyaml
requests
yacman>=0.6.7
-pyyaml
diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt
new file mode 100644
index 0000000..60c9958
--- /dev/null
+++ b/requirements/requirements-dev.txt
@@ -0,0 +1 @@
+-e git+https://github.com/databio/henge@master#egg=henge
\ No newline at end of file
diff --git a/setup.py b/setup.py
index f45eb89..b7cf1d9 100644
--- a/setup.py
+++ b/setup.py
@@ -19,10 +19,10 @@
extra["install_requires"] = DEPENDENCIES
-with open("{}/_version.py".format(PACKAGE), 'r') as versionfile:
+with open("{}/_version.py".format(PACKAGE), "r") as versionfile:
version = versionfile.readline().split()[-1].strip("\"'\n")
-long_description = open('README.md').read()
+long_description = open("README.md").read()
setup(
name=PACKAGE,
@@ -30,28 +30,28 @@
version=version,
description="Python client for refget",
long_description=long_description,
- long_description_content_type='text/markdown',
+ long_description_content_type="text/markdown",
classifiers=[
"Development Status :: 4 - Beta",
"License :: OSI Approved :: BSD License",
- "Programming Language :: Python :: 2.7",
- "Programming Language :: Python :: 3.5",
- "Programming Language :: Python :: 3.6",
+ "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
],
keywords="genome, assembly, bioinformatics, reference, sequence",
url="https://github.com/refgenie/refget",
- author=u"Nathan Sheffield, Michal Stolarczyk",
- author_email=u"nathan@code.databio.org",
+ author="Nathan Sheffield, Michal Stolarczyk",
+ author_email="nathan@code.databio.org",
license="BSD2",
entry_points={
- "console_scripts": [
- 'refget = refget.refget:main'
- ],
- },
+ "console_scripts": ["refget = refget.refget:main"],
+ },
# package_data={"refget": [os.path.join("refget", "*")]},
include_package_data=True,
test_suite="tests",
tests_require=(["mock", "pytest"]),
setup_requires=(["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else []),
- **extra
+ **extra,
)
diff --git a/tests/conftest.py b/tests/conftest.py
index 37d6922..129ff0b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,10 +1,13 @@
""" Test suite shared objects and setup """
+
import os
import pytest
import oyaml as yaml
+from refget.const import _schema_path
def ly(n, data_path):
+ """Load YAML"""
with open(os.path.join(data_path, n), "r") as f:
return yaml.safe_load(f)
@@ -14,9 +17,19 @@ def schema_path():
return os.path.join(os.path.dirname(os.path.abspath(__file__)), "schemas")
+@pytest.fixture
+def fa_root():
+ return os.path.join(
+ os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir),
+ "demo_fasta",
+ )
+
+
@pytest.fixture
def fasta_path():
- return os.path.join(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir), "demo_fasta")
+ return os.path.join(
+ os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir), "demo_fasta"
+ )
@pytest.fixture
diff --git a/tests/test_refget.py b/tests/test_refget.py
index 5f3a2ae..e53f37a 100644
--- a/tests/test_refget.py
+++ b/tests/test_refget.py
@@ -10,11 +10,10 @@ class TestEmptyConstructor:
def test_no_schemas_required(self):
assert isinstance(RefGetClient(), RefGetClient)
+
class TestInsert:
@pytest.mark.parametrize("fasta_name", DEMO_FILES)
def test_insert_works(self, fasta_name, fasta_path):
rgc = RefGetClient()
d = rgc.load_seq("TCGA")
- assert(rgc.refget(d) == "TCGA")
-
-
+ assert rgc.refget(d) == "TCGA"
diff --git a/tests/test_seqcol.py b/tests/test_seqcol.py
new file mode 100644
index 0000000..85555f9
--- /dev/null
+++ b/tests/test_seqcol.py
@@ -0,0 +1,124 @@
+import json
+import os
+import pytest
+import refget
+
+# from seqcol import SeqColHenge, validate_seqcol, compare
+# from seqcol.const import *
+
+DEMO_FILES = [
+ "demo0.fa",
+ "demo1.fa.gz",
+ "demo2.fa",
+ "demo3.fa",
+ "demo4.fa",
+ "demo5.fa.gz",
+ "demo6.fa",
+]
+
+# Pairs of files to compare, with the "correct" compare response
+COMPARE_TESTS = [
+ (DEMO_FILES[1], DEMO_FILES[1], "demo_fasta/compare-1vs1.json"),
+ (DEMO_FILES[0], DEMO_FILES[1], "demo_fasta/compare-0vs1.json"),
+]
+SNLP_TESTS = [
+ (DEMO_FILES[5], DEMO_FILES[6], "demo_fasta/compare-5vs6.json"),
+] # sorted_name_length_pairs
+
+
+class TestGeneral:
+ def test_no_schemas_required(self):
+ """
+ In contrast to the generic Henge object, SeqColHenge does not
+ require schemas as input; they are predefined in the constructor
+ """
+ assert isinstance(refget.SeqColHenge(database={}), refget.SeqColHenge)
+
+
+class TestFastaInserting:
+ @pytest.mark.parametrize("fasta_name", DEMO_FILES)
+ def test_fasta_loading_works(self, fasta_name, fa_root):
+ scc = refget.SeqColHenge(database={})
+ f = os.path.join(fa_root, fasta_name)
+ print("Fasta file to be loaded: {}".format(f))
+ res = scc.load_fasta(f)
+ assert len(res) == 2  # returns a (digest, AnnotatedSequencesList) pair
+
+
+class TestRetrieval:
+ @pytest.mark.parametrize("fasta_name", DEMO_FILES)
+ def test_retrieval_works(self, fasta_name, fa_root):
+ scc = refget.SeqColHenge(database={})
+ f = os.path.join(fa_root, fasta_name)
+ print("Fasta file to be loaded: {}".format(f))
+ d, asds = scc.load_fasta(f)
+ # plain copies of the dicts; an int-to-str conversion variant is kept for reference:
+ # {k: str(v) if isinstance(v, int) else v for k, v in asd.items()}
+ lst = [dict(asd) for asd in asds]
+ assert scc.retrieve(d) == lst
+
+
+def check_comparison(fasta1, fasta2, expected_comparison):
+ print(f"Comparison: Fasta1: {fasta1} vs Fasta2: {fasta2}. Expected: {expected_comparison}")
+ scc = refget.SeqColHenge(database={})
+ d = scc.load_fasta_from_filepath(fasta1)
+ d2 = scc.load_fasta_from_filepath(fasta2)
+ with open(expected_comparison) as fp:
+ correct_compare_response = json.load(fp)
+ proposed_compare_response = refget.compare_seqcols(d["SCAS"], d2["SCAS"])
+ print(
+ json.dumps(
+ proposed_compare_response,
+ separators=(",", ":"),
+ ensure_ascii=False,
+ allow_nan=False,
+ sort_keys=True,
+ indent=2,
+ )
+ )
+ assert proposed_compare_response == correct_compare_response
+
+
+class TestCompare:
+ """
+ Test the compare function, using demo fasta files, and pre-computed
+ compare function results stored as answer files.
+ """
+
+ @pytest.mark.parametrize(["fasta1", "fasta2", "answer_file"], COMPARE_TESTS)
+ def test_fasta_compare(self, fasta1, fasta2, answer_file, fa_root):
+ check_comparison(os.path.join(fa_root, fasta1), os.path.join(fa_root, fasta2), answer_file)
+
+ @pytest.mark.parametrize(["fasta1", "fasta2", "answer_file"], SNLP_TESTS)
+ def test_names_lengths_order(self, fasta1, fasta2, answer_file, fa_root):
+ """Does the names_lengths array correctly identify order variants"""
+ check_comparison(os.path.join(fa_root, fasta1), os.path.join(fa_root, fasta2), answer_file)
+
+
+seqcol_obj = {
+ "lengths": [248956422, 133797422, 135086622],
+ "names": ["chr1", "chr2", "chr3"],
+ "sequences": [
+ "2648ae1bacce4ec4b6cf337dcae37816",
+ "907112d17fcb73bcab1ed1c72b97ce68",
+ "1511375dc2dd1b633af8cf439ae90cec",
+ ],
+}
+
+bad_seqcol = {"bogus": True}
+
+
+class TestValidate:
+ """
+ Test validation
+ """
+
+ @pytest.mark.parametrize(["seqcol_obj"], [[seqcol_obj]])
+ def test_validate(self, seqcol_obj):
+ is_valid = refget.validate_seqcol(seqcol_obj)
+ assert is_valid
+
+ @pytest.mark.parametrize(["seqcol_obj"], [[bad_seqcol]])
+ def test_failure(self, seqcol_obj):
+ with pytest.raises(Exception):
+ refget.validate_seqcol(seqcol_obj)