diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml new file mode 100644 index 0000000..8b48ddf --- /dev/null +++ b/.github/workflows/black.yml @@ -0,0 +1,11 @@ +name: Lint + +on: [pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + - uses: psf/black@stable diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 4e1ef42..e54ad87 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -11,11 +11,13 @@ jobs: deploy: runs-on: ubuntu-latest - + name: upload release to PyPI + permissions: + id-token: write # IMPORTANT: this permission is mandatory for trusted publishing steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: '3.x' - name: Install dependencies @@ -23,9 +25,8 @@ jobs: python -m pip install --upgrade pip pip install setuptools wheel twine - name: Build and publish - env: - TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} - TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | python setup.py sdist bdist_wheel - twine upload dist/* + - name: Publish package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + diff --git a/.github/workflows/run-codecov.yml b/.github/workflows/run-codecov.yml new file mode 100644 index 0000000..a41a1fd --- /dev/null +++ b/.github/workflows/run-codecov.yml @@ -0,0 +1,21 @@ +name: Run codecov + +on: + pull_request: + branches: [master] + +jobs: + pytest: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.9] + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v2 + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v2 + with: + file: ./coverage.xml + name: py-${{ matrix.python-version }}-${{ matrix.os }} diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml new file mode 100644 index 0000000..a4fb5fe --- /dev/null +++ b/.github/workflows/run-pytest.yml @@ -0,0 +1,30 @@ +name: Run pytests + +on: + pull_request: + branches: [master, dev] + +jobs: + pytest: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: ["3.8", "3.11"] + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install test dependencies + run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi + + - name: Install package + run: python -m pip install . + + - name: Run pytest tests + run: pytest tests -x -vv --cov=./ --cov-report=xml diff --git a/.gitignore b/.gitignore index 7523f45..55ae73f 100644 --- a/.gitignore +++ b/.gitignore @@ -71,9 +71,12 @@ open_pipelines/ *RESERVE* # Build-related stuff +build dist/ refget.egg-info/ #ipynm checkpoints *ipynb_checkpoints* *.egg-info* + + diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 9e5fd48..0000000 --- a/.travis.yml +++ /dev/null @@ -1,17 +0,0 @@ -language: python -python: - - "3.5" - - "3.6" - - "3.8" -os: - - linux -before_install: - - pip install git+https://github.com/databio/henge.git@master -install: - - pip install . - - pip install -r requirements/requirements-test.txt -script: pytest tests -x -vv -branches: - only: - - dev - - master diff --git a/LICENSE.txt b/LICENSE.txt index 1b78bad..ec94eeb 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright 2017 Nathan Sheffield +Copyright 2024 Nathan Sheffield Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/README.md b/README.md index b855130..7d9660d 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ [![Build Status](https://travis-ci.com/refgenie/refget.svg?branch=master)](https://travis-ci.com/refgenie/refget) -The refget package provides a Python interface to both remote and local use of the refget protocol. +The refget package provides a Python interface to both remote and local use of the refget protocol. -Documentation is hosted at [refget.databio.org](http://refget.databio.org). +This package provides clients and functions for both refget sequences and refget sequence collections (seqcol). + +Documentation is hosted at [refgenie.org/refget](https://refgenie.org/refget/). diff --git a/array_overlap.py b/array_overlap.py new file mode 100644 index 0000000..4012c1a --- /dev/null +++ b/array_overlap.py @@ -0,0 +1,55 @@ +# from itertools import compress +# from collections import Counter +# overlap = sum((Counter(A) & Counter(B)).values()) +# A=example1 +# B=example4 +# overlap +# all = list(A) + list(set(B) - set(list(A))) +# A_compressed = list(compress(A, B)) +# B_compressed = list(compress(B, A)) +# A_filtered = list(filter(lambda x: x in B, A)) +# B_filtered = list(filter(lambda x: x in A, B)) +# len(A_compressed) == len(B_compressed) +# C_compressed = list(compress(C, A)) + +# from itertools import filter + + +def array_overlap(A, B): + A_filtered = list(filter(lambda x: x in B, A)) + B_filtered = list(filter(lambda x: x in A, B)) + A_count = len(A_filtered) + B_count = len(B_filtered) + overlap = min(len(A_filtered), len(B_filtered)) + if A_count + B_count < 1: + # order match requires at least 2 matching elements + order = None + elif not (A_count == B_count): + # duplicated matches means order match is undefined + order = None + else: + order = A_filtered == B_filtered + return {"overlap": overlap, "order-match": order} + + +example1 = ["A", "B", "C", "D"] +example2 = ["A", "B", "C"] +example3 = ["A", "B", "C", "B"] +example4 = ["B", "B", "B", "B"] +example5 = ["X", "A", "B", "Y", "C", "D", "E"] +example6 = ["A", "B", "C", "D", "B"] +example7 = ["A", "B", "C", "D", "A"] +example8 = ["A", "B", "C", "D", "B", "A"] + + +compatoverlap(example1, example2) +compatoverlap(example1, example3) +compatoverlap(example2, example3) +compatoverlap(example1, example4) +compatoverlap(example1, example5) +compatoverlap(example3, example5) +compatoverlap(example5, example3) +compatoverlap(example3, example6) +compatoverlap(example6, example7) +compatoverlap(example7, example8) +compatoverlap(example8, example8) diff --git a/demo_fasta/compare-0vs1.json b/demo_fasta/compare-0vs1.json new file mode 100644 index 0000000..e0d9ae6 --- /dev/null +++ b/demo_fasta/compare-0vs1.json @@ -0,0 +1,38 @@ +{ + "array_elements":{ + "a":{ + "lengths":3, + "names":3, + "sequences":3, + "sorted_name_length_pairs":3 + }, + "a_and_b":{ + "lengths":2, + "names":2, + "sequences":0, + "sorted_name_length_pairs":2 + }, + "a_and_b_same_order":{ + "lengths":true, + "names":true, + "sequences":null, + "sorted_name_length_pairs":true + }, + "b":{ + "lengths":2, + "names":2, + "sequences":2, + "sorted_name_length_pairs":2 + } + }, + "attributes":{ + "a_and_b":[ + "lengths", + "names", + "sequences", + "sorted_name_length_pairs" + ], + "a_only":[], + "b_only":[] + } +} \ No newline at end of file diff --git a/demo_fasta/compare-1vs1.json b/demo_fasta/compare-1vs1.json new file mode 100644 index 0000000..6c01b0c --- /dev/null +++ b/demo_fasta/compare-1vs1.json @@ -0,0 +1,38 @@ +{ + "array_elements":{ + "a":{ + "lengths":2, + "names":2, + "sequences":2, + "sorted_name_length_pairs":2 + }, + "a_and_b":{ + "lengths":2, + "names":2, + "sequences":2, + "sorted_name_length_pairs":2 + }, + "a_and_b_same_order":{ + "lengths":true, + "names":true, + "sequences":true, + "sorted_name_length_pairs":true + }, + "b":{ + "lengths":2, + "names":2, + "sequences":2, + "sorted_name_length_pairs":2 + } + }, + "attributes":{ + "a_and_b":[ + "lengths", + "names", + "sequences", + "sorted_name_length_pairs" + ], + "a_only":[], + "b_only":[] + } +} \ No newline at end of file diff --git a/demo_fasta/compare-5vs6.json b/demo_fasta/compare-5vs6.json new file mode 100644 index 0000000..347f738 --- /dev/null +++ b/demo_fasta/compare-5vs6.json @@ -0,0 +1,38 @@ +{ + "array_elements":{ + "a":{ + "lengths":3, + "names":3, + "sequences":3, + "sorted_name_length_pairs":3 + }, + "a_and_b":{ + "lengths":3, + "names":3, + "sequences":3, + "sorted_name_length_pairs":3 + }, + "a_and_b_same_order":{ + "lengths":false, + "names":false, + "sequences":false, + "sorted_name_length_pairs":true + }, + "b":{ + "lengths":3, + "names":3, + "sequences":3, + "sorted_name_length_pairs":3 + } + }, + "attributes":{ + "a_and_b":[ + "lengths", + "names", + "sequences", + "sorted_name_length_pairs" + ], + "a_only":[], + "b_only":[] + } +} \ No newline at end of file diff --git a/demo_fasta/demo0.fa b/demo_fasta/demo0.fa new file mode 100644 index 0000000..dd08063 --- /dev/null +++ b/demo_fasta/demo0.fa @@ -0,0 +1,6 @@ +>chrX +TTGGGGAA +>chr1 +GGAA +>chr2 +GCGC diff --git a/demo_fasta/demo0.fa.fai b/demo_fasta/demo0.fa.fai new file mode 100644 index 0000000..dc37dcd --- /dev/null +++ b/demo_fasta/demo0.fa.fai @@ -0,0 +1,3 @@ +chrX 8 6 8 9 +chr1 4 21 4 5 +chr2 4 32 4 5 diff --git a/demo_fasta/demo1.fa.fai b/demo_fasta/demo1.fa.fai new file mode 100644 index 0000000..c55c6c1 --- /dev/null +++ b/demo_fasta/demo1.fa.fai @@ -0,0 +1,2 @@ +chr1 4 6 4 5 +chr2 4 17 4 5 diff --git a/demo_fasta/demo1.fa.gz b/demo_fasta/demo1.fa.gz new file mode 100644 index 0000000..c2174d6 Binary files /dev/null and b/demo_fasta/demo1.fa.gz differ diff --git a/demo_fasta/demo5.fa.gz b/demo_fasta/demo5.fa.gz new file mode 100644 index 0000000..be98af2 Binary files /dev/null and b/demo_fasta/demo5.fa.gz differ diff --git a/demo_fasta/demo6.fa b/demo_fasta/demo6.fa new file mode 100644 index 0000000..dd08063 --- /dev/null +++ b/demo_fasta/demo6.fa @@ -0,0 +1,6 @@ +>chrX +TTGGGGAA +>chr1 +GGAA +>chr2 +GCGC diff --git a/demo_fasta/demo6.fa.fai b/demo_fasta/demo6.fa.fai new file mode 100644 index 0000000..dc37dcd --- /dev/null +++ b/demo_fasta/demo6.fa.fai @@ -0,0 +1,3 @@ +chrX 8 6 8 9 +chr1 4 21 4 5 +chr2 4 32 4 5 diff --git a/deprecated.py b/deprecated.py new file mode 100644 index 0000000..6038709 --- /dev/null +++ b/deprecated.py @@ -0,0 +1,106 @@ +from itertools import compress +from functools import reduce + +import seqcol + +sc1 = {"names": ["chr1", "chr2"], "sequences": ["ay89fw", "we4f9x"]} + +sc2 = {"names": ["1", "2", "3"], "sequences": ["ay89fw", "we4f9x", "3n20xk2"]} + +sc3 = {"names": ["2", "3", "1"], "sequences": ["we4f9x", "3n20xk2", "ay89fw"]} + +sc4 = {"names": ["chr1", "3", "2"], "sequences": ["zyt2fw", "9snm23k", "fsd2x3"]} + +sc5 = { + "names": ["chr1", "3", "2"], + "sequences": ["zyt2fw", "9snm23k", "fsd2x3"], + "topologies": ["circular", "linear", "linear"], +} + + +def compat(A, B): + ainb = [x in B for x in A] + bina = [x in A for x in B] + if any(ainb): + order = list(compress(B, bina)) == list(compress(A, ainb)) + else: + order = False + + any(ainb) + + flag = 0 + flag += 2 if all(ainb) else 0 + flag += 4 if all(bina) else 0 + flag += 8 if order else 0 + flag += 1 if any(ainb) else 0 + + return flag + + +# New compat function that adds true/false +def compat(A, B): + ainb = [x in B for x in A] + bina = [x in A for x in B] + if any(ainb): + order = list(compress(B, bina)) == list(compress(A, ainb)) + else: + order = False + + any(ainb) + + flag = 0 + flag += 2 if all(ainb) else 0 + flag += 4 if all(bina) else 0 + flag += 8 if order else 0 + flag += 1 if any(ainb) else 0 + result = { + "any-elements-shared": any(ainb), + "a-subset-of-b": all(ainb), + "b-subset-of-a": all(bina), + "order-match": order, + "flag": flag, + } + return result + + +# For each array: +# - any-elements-shared (1) (0001) +# - all-a-in-b (2) (0010) +# - all-b-in-a (4) (0100) +# - order-match (8) (1000) + +# no match: 0000 = 0 +# one or more shared elements: 0001 = 1 +# all-a-in-b (a is a subset of b, in different order) = 0011 = 3 +# all-b-in-a (b is a subset of a, in different order) = 0101 = 5 +# same content, different order: 0111 = 7 +# a is a subset of b, in same order: 1011 = 11 +# b is a subset of a, in same order: 1101 = 13 +# identity: 1111 = 15 + + +compat(sc1["sequences"], sc2["sequences"]) +compat(sc1["names"], sc2["names"]) + +compat(sc3["sequences"], sc2["sequences"]) +compat(sc3["names"], sc2["names"]) + +compat(sc1["sequences"], sc3["sequences"]) + + +def compat_all(A, B): + all_keys = list(A.keys()) + list(set(B.keys()) - set(list(A.keys()))) + result = {} + for k in all_keys: + if k not in A or k not in B: + result[k] = {"flag": -1} + else: + result[k] = compat(A[k], B[k]) + # result["all"] = reduce(lambda x,y: x['flag']&y['flag'], list(result.values())) + return result + + +compat_all(sc1, sc2) +compat_all(sc3, sc2) +compat_all(sc1, sc3) +compat_all(sc1, sc5) diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 0d94560..0000000 --- a/docs/README.md +++ /dev/null @@ -1,46 +0,0 @@ -# Refget - -## Introduction - -The refget package provides a Python interface to both remote and local use of the refget protocol. This package serves 4 functions: - -1. A lightweight python interface to a remote refget API. - -2. Local caching of retrieved results, improving performance for applications that require repeated lookups. - -3. A fully functioning local implementation of the refget protocol for local analysis backed by either memory, SQLite, or MongoDB. - -4. Convenience functions for computing refget checksums from python and handling FASTA files directly. - -## Install - -```console -pip install refget -``` - -## Basic use - -### Retrieve results from a RESTful API - -```python -import refget - -rgc = refget.RefGetClient("https://refget.herokuapp.com/sequence/") -rgc.refget("6681ac2f62509cfc220d78751b8dc524", start=0, end=10) - -``` - -### Compute digests locally - -```python -refget.trunc512_digest("TCGA") -``` - -### Insert and retrieve sequences with a local database - -```python -checksum = rgc.load_seq("GGAA") -rgc.refget(checksum) -``` - -For more details, see the [tutorial](tutorial.md). diff --git a/docs/autodoc_build/refget.md b/docs/autodoc_build/refget.md deleted file mode 100644 index 6f38ada..0000000 --- a/docs/autodoc_build/refget.md +++ /dev/null @@ -1,110 +0,0 @@ - - - - - -# Package `refget` Documentation - -## Class `RefGetClient` -```python -def __init__(self, api_url_base=None, database={}, schemas=None, henges=None, checksum_function=, suppress_connect=True) -``` - -A user interface to insert and retrieve decomposable recursive unique identifiers (DRUIDs). -#### Parameters: - -- `database` (`dict`): Dict-like lookup database with sequences and hashes. -- `schemas` (`dict`): One or more jsonschema schemas describing thedata types stored by this Henge -- `checksum_function` (`function(str) -> str`): Default function to handle the digest of theserialized items stored in this henge. - - - - -```python -def get_service_info(self) -``` - - - -```python -def item_types(self) -``` - -A list of item types handled by this Henge instance - - - -```python -def load_fasta(self, fa_file, lengths_only=False) -``` - -Calculates checksums and loads each sequence in a fasta file into the database. - - - -```python -def load_seq(self, seq) -``` - - - -```python -def load_sequence_dict(self, seqset) -``` - -Convert a 'seqset', which is a dict with names as sequence names and values as sequences, into the 'asdlist' required for henge insert. - - - -```python -def meta(self, digest) -``` - - - -```python -def refget(self, digest, start=None, end=None) -``` - - - -```python -def refget_remote(self, digest, start=None, end=None) -``` - - - -```python -def service_info(self) -``` - - - - - - -*Version Information: `refget` v0.0.1, generated by `lucidoc` v0.4.2* \ No newline at end of file diff --git a/docs/changelog.md b/docs/changelog.md deleted file mode 100644 index a067237..0000000 --- a/docs/changelog.md +++ /dev/null @@ -1,11 +0,0 @@ -# Changelog - -This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. - -## [0.1.0] - 2021-06-17 - -First public version, backed by henge version 0.1.1. - -## [0.0.1] - 2020-06-25 - -Beta version for testing diff --git a/docs_jupyter/build/.gitignore b/docs_jupyter/build/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/docs_jupyter/build/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/docs_jupyter/tutorial.ipynb b/docs_jupyter/tutorial.ipynb deleted file mode 100644 index acff965..0000000 --- a/docs_jupyter/tutorial.ipynb +++ /dev/null @@ -1,690 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Refget python package tutorial" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Record some versions:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'3.8.5'" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from platform import python_version \n", - "python_version()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'0.1.0'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import refget\n", - "refget.__version__" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Computing digests locally" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from refget import trunc512_digest" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Show some results for sequence digests:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36'" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trunc512_digest('ACGT')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce'" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trunc512_digest('TCGA')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36cf70'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trunc512_digest('ACGT', 26)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Connecting to a remote API\n", - "\n", - "The refget package provides a simple python wrapper around a remote hosted refget RESTful API. Provide the base url when construction a RefGetClient object and you can retrieve sequences from the remote server." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "rgc = refget.RefGetClient(\"https://refget.herokuapp.com/sequence/\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'CCACACCACA'" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=10)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACC'" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=50)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also hit the `{digest}/metadata` and `service_info` API endpoints described in the refget API specification:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'metadata': {'aliases': [{'alias': 'ga4gh:SQ.lZyxiD_ByprhOUzrR1o1bq0ezO_1gkrn',\n", - " 'naming_authority': 'ga4gh'},\n", - " {'alias': 'I', 'naming_authority': 'unknown'}],\n", - " 'length': 230218,\n", - " 'md5': '6681ac2f62509cfc220d78751b8dc524',\n", - " 'trunc512': '959cb1883fc1ca9ae1394ceb475a356ead1ecceff5824ae7'}}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.meta(\"6681ac2f62509cfc220d78751b8dc524\")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'service': {'algorithms': ['ga4gh', 'md5', 'trunc512'],\n", - " 'circular_supported': True,\n", - " 'subsequence_limit': None,\n", - " 'supported_api_versions': ['1.0.0']}}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.service_info" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When requesting a sequence that is not found, the service responds appropriately:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Not Found'" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(trunc512_digest('TCGATCGA'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Use a local database for caching\n", - "\n", - "By default, any full-sequences retrieved from an API are cached locally in memory (in a Python Dict). This data will not persist past a current session, but is useful if you have an application that requires repeated requests. here, we re-request the sequence requested above. It is much faster this time because it uses a local cache:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'CCACACCACA'" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also add new sequences into the database:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Not Found'" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(refget.md5('TCGATCGA')) # This sequence is not found in our database yet" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "checksum = rgc.load_seq(\"TCGATCGA\") # So, let's add it into database" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'TCGATCGA'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(checksum) # This time it returns" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Keep in mind that sequences added in this way are added to your *local* database, not to the remote API, so when we restart, they will be gone:" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "del rgc" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Not Found'" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc = refget.RefGetClient(\"https://refget.herokuapp.com/sequence/\")\n", - "rgc.refget(refget.md5('TCGA'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Making data persist\n", - "\n", - "If you want to retain your local cache, you can use a Dict that is backed by some persistent storage, such as a database on disk or another running process. There are many ways to do this, for example, you can use an sqlite database, a Redis database, or a MongoDB database. Here we'll show you how to use the `sqlitedict` package to back your local database.\n", - "\n", - "To start, you need to create a dict object and pass that to the RefGetClient constructor." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "import refget\n", - "from sqlitedict import SqliteDict\n", - "mydict = SqliteDict('./my_db.sqlite', autocommit=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "rgc = refget.RefGetClient(\"https://refget.herokuapp.com/sequence/\", mydict)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now when we retrieve a sequence it will be added to the local sqlite database automatically." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACC'" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=50)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Look, we can see that this object has been added to our sqlite database:" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'CACACCACACCCACACACCCACACACCACACCACACACCACACCACACC'" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mydict[\"6681ac2f62509cfc220d78751b8dc524\"][1:50]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So now if we kill this object and start it up again *without the API connection*, but with the mydict local backend, we can still retrieve it:" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "del rgc" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "rgc = refget.RefGetClient(database=mydict)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACC'" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=50)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Loading a fasta file\n", - "\n", - "The package also comes with a helper function for computing checksums for an entire fasta file." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "fa_file = \"../demo_fasta/demo.fa\"\n", - "content = rgc.load_fasta(fa_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'name': 'chr1',\n", - " 'length': 4,\n", - " 'sequence_digest': 'f1f8f4bf413b16ad135722aa4591043e'},\n", - " {'name': 'chr2',\n", - " 'length': 4,\n", - " 'sequence_digest': '45d0ff9f1a9504cf2039f89c1ffb4c32'}]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "content" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'ACGT'" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(content[0]['sequence_digest'])" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No remote URL connected\n" - ] - } - ], - "source": [ - "rgc.refget(\"blah\")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "rgc.api_url_base = \"https://refget.herokuapp.com/sequence/\"" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Not Found'" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(\"blah\")" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "# You can show the complete contents of the database like this:\n", - "# rgc.show()\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/interactive_tests.py b/interactive_tests.py new file mode 100644 index 0000000..e16cdaa --- /dev/null +++ b/interactive_tests.py @@ -0,0 +1,168 @@ +import seqcol +from seqcol import SeqColHenge + + +scc = SeqColHenge(database={}, schemas=["seqcol/schemas/SeqColArraySet.yaml"]) +scc + +fa_file = "demo_fasta/demo0.fa" +fa_object = seqcol.parse_fasta(fa_file) + +skip_seq = False +aslist = [] +names = [] +lengths = [] +lengthsi = [] +sequences = [] +for k in fa_object.keys(): + seq = str(fa_object[k]) + names.append(k) + lengths.append(str(len(seq))) + lengthsi.append(len(seq)) + sequences.append(seq) + +array_set = {"names": names, "lengths": lengths, "sequences": sequences} + +collection_checksum = scc.insert(array_set, "SeqColArraySet") +collection_checksum + +scc.retrieve(collection_checksum) +scc.retrieve(collection_checksum, reclimit=1) + +# scc.retrieve("5c4b07f08319d3d0815f5ee25c45916a01f9d1519f0112e8") + +scc.retrieve(collection_checksum, reclimit=1) +scc.retrieve(collection_checksum, reclimit=2) +scc.retrieve(collection_checksum) +scc.supports_inherent_attrs + +# Now a test of inherent attributes +import seqcol + +scci = seqcol.SeqColHenge(database={}, schemas=["seqcol/schemas/SeqColArraySetInherent.yaml"]) +scci +scci.schemas + + +fa_file = "demo_fasta/demo0.fa" +fa_object = seqcol.parse_fasta(fa_file) + +array_set_i = {"names": names, "lengths": lengthsi, "sequences": sequences, "author": "urkel"} +array_set_i2 = {"names": names, "lengths": lengthsi, "sequences": sequences, "author": "nathan"} + + +di = scci.insert(array_set_i, "SeqColArraySet") +di = scci.insert(array_set_i2, "SeqColArraySet") +di +# scc.retrieve(di) +scci.retrieve(di) +fasta_path = "demo_fasta" +fasta1 = "demo2.fa" +fasta2 = "demo3.fa" +fasta5 = "demo5.fa.gz" +fasta6 = "demo6.fa" + +import os + +d = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta1)) +d2 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta2)) +d2 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta2)) +d5 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta5)) +d6 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta6)) +scci.retrieve(d["digest"]) + +scci.retrieve(d5["digest"]) + +fa_object = seqcol.parse_fasta(os.path.join(fasta_path, fasta1)) +SCAS = seqcol.fasta_to_csc(fa_object) +digest = scci.insert(SCAS, "SeqColArraySet", reclimit=1) + + +d["digest"] +d2["digest"] + +scci.compare_digests(d["digest"], d2["digest"]) +scci.compare(d["SCAS"], d2["SCAS"]) + + +json.dumps(scci.compare(d5["SCAS"], d6["SCAS"])) +print( + json.dumps( + scci.compare(d5["SCAS"], d6["SCAS"]), + separators=(",", ":"), + ensure_ascii=False, + allow_nan=False, + sort_keys=True, + indent=2, + ) +) + +build_sorted_name_length_pairs(array_set_i) + +# reorder + +array_set_reordered = {} +for k, v in array_set.items(): + print(k, v) + array_set_reordered[k] = list(reversed(v)) + +array_set +array_set_reordered + +build_sorted_name_length_pairs(array_set) +build_sorted_name_length_pairs(array_set_reordered) + + +import henge + + +from henge import md5 + +names = [] +lengths = [] +seq_digests = [] +for k in fa_object.keys(): + seq = str(fa_object[k]) + names.append(k) + lengths.append(str(len(seq))) + seq_digests.append(scc.checksum_function(seq)) + + +array_set = {"names": names, "lengths": lengths, "sequences": seq_digests} +array_set +collection_checksum = scc.insert(array_set, "SeqColArraySet", reclimit=1) + +scc.database = {} +scc.retrieve("d229d5c16b3a1b3788f01aa439f01e682ba84bc9935ad08a", reclimit=1) +scc.retrieve("d229d5c16b3a1b3788f01aa439f01e682ba84bc9935ad08a", reclimit=2) +scc.database[collection_checksum] +scc.checksum_function + + +scc.database["d229d5c16b3a1b3788f01aa439f01e682ba84bc9935ad08a"] + +scc.database["a99fe2e92875099c84f73b20ef8e7dd2f2d12f063383bed0"] +scc.database["ca82b053295b6f49923d0b2cedb83de49c6be59688c3dfd9"] + + +import os + +os.getcwd() + + +## standalone functions + +import seqcol + +fa_file = "demo_fasta/demo0.fa" +fa_object = seqcol.parse_fasta(fa_file) + +# get a canonical seqcol object +csc = seqcol.fasta_to_csc(fa_object) +csc +import json + +print(json.dumps(csc, indent=2)) + + +seqcol.seqcol_digest(csc) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..9348158 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,4 @@ +[tool.black] +line-length = 99 +target-version = ['py38', 'py311'] +include = '\.pyi?$' diff --git a/refget/__init__.py b/refget/__init__.py index 3841f46..897c9fa 100644 --- a/refget/__init__.py +++ b/refget/__init__.py @@ -1,11 +1,12 @@ # Project configuration, particularly for logging. import logging + from ._version import __version__ +from .const import * from .hash_functions import * from .refget import RefGetClient from .refget import parse_fasta - - -__classes__ = ["RefGetClient"] -__all__ = __classes__ + [] +from .seqcol import * +from .utilities import * +from .seqcol_client import * diff --git a/refget/_version.py b/refget/_version.py index 3dc1f76..d3ec452 100644 --- a/refget/_version.py +++ b/refget/_version.py @@ -1 +1 @@ -__version__ = "0.1.0" +__version__ = "0.2.0" diff --git a/refget/const.py b/refget/const.py new file mode 100644 index 0000000..987b644 --- /dev/null +++ b/refget/const.py @@ -0,0 +1,51 @@ +import os + + +def _schema_path(name): + return os.path.join(SCHEMA_FILEPATH, name) + + +CONTENT_ALL_A_IN_B = 2**0 +CONTENT_ALL_B_IN_A = 2**1 +LENGTHS_ALL_A_IN_B = 2**2 +LENGTHS_ALL_B_IN_A = 2**3 +NAMES_ALL_A_IN_B = 2**4 +NAMES_ALL_B_IN_A = 2**5 +TOPO_ALL_A_IN_B = 2**6 +TOPO_ALL_B_IN_A = 2**7 +CONTENT_ANY_SHARED = 2**8 +LENGTHS_ANY_SHARED = 2**9 +NAMES_ANY_SHARED = 2**10 +CONTENT_A_ORDER = 2**11 +CONTENT_B_ORDER = 2**12 + +FLAGS = { + CONTENT_ALL_A_IN_B: "CONTENT_ALL_A_IN_B", + CONTENT_ALL_B_IN_A: "CONTENT_ALL_B_IN_A", + LENGTHS_ALL_A_IN_B: "LENGTHS_ALL_A_IN_B", + LENGTHS_ALL_B_IN_A: "LENGTHS_ALL_B_IN_A", + NAMES_ALL_A_IN_B: "NAMES_ALL_A_IN_B", + NAMES_ALL_B_IN_A: "NAMES_ALL_B_IN_A", + TOPO_ALL_A_IN_B: "TOPO_ALL_A_IN_B", + TOPO_ALL_B_IN_A: "TOPO_ALL_B_IN_A", + CONTENT_ANY_SHARED: "CONTENT_ANY_SHARED", + LENGTHS_ANY_SHARED: "LENGTHS_ANY_SHARED", + NAMES_ANY_SHARED: "NAMES_ANY_SHARED", + CONTENT_A_ORDER: "CONTENT_A_ORDER", + CONTENT_B_ORDER: "CONTENT_B_ORDER", +} + +KNOWN_TOPOS = ["linear", "circular"] +NAME_KEY = "name" +SEQ_KEY = "sequence" +TOPO_KEY = "topology" +LEN_KEY = "length" + +# internal schemas paths determination +ASL_NAME = "AnnotatedSequenceList" +SCHEMA_NAMES = [ASL_NAME + ".yaml"] +SCHEMA_FILEPATH = os.path.join(os.path.dirname(__file__), "schemas") +INTERNAL_SCHEMAS = [_schema_path(s) for s in SCHEMA_NAMES] + +# Alias dict to make typehinting clearer +SeqCol = dict diff --git a/refget/exceptions.py b/refget/exceptions.py new file mode 100644 index 0000000..fd7bde0 --- /dev/null +++ b/refget/exceptions.py @@ -0,0 +1,10 @@ +class InvalidSeqColError(Exception): + """Object was not validated successfully according to schema.""" + + def __init__(self, message, errors): + super().__init__(message) + self.message = message + self.errors = errors + + def __str__(self): + return f"InvalidSeqColError ({self.message}): {self.errors}" diff --git a/refget/hash_functions.py b/refget/hash_functions.py index 234d98a..34a29b3 100644 --- a/refget/hash_functions.py +++ b/refget/hash_functions.py @@ -6,32 +6,38 @@ import hashlib import binascii + def trunc512_digest(seq, offset=24): - digest = hashlib.sha512(seq.encode('utf-8')).digest() + digest = hashlib.sha512(seq.encode("utf-8")).digest() hex_digest = binascii.hexlify(digest[:offset]) - return hex_digest.decode("utf-8") + return hex_digest.decode("utf-8") + def ga4gh_digest(seq, digest_size=24): # b64 encoding results in 4/3 size expansion of data and padded if # not multiple of 3, which doesn't make sense for this use assert digest_size % 3 == 0, "digest size must be multiple of 3" - digest = hashlib.sha512(seq.encode('utf-8')).digest() + digest = hashlib.sha512(seq.encode("utf-8")).digest() return _ga4gh_format(digest, digest_size) + def _ga4gh_format(digest, digest_size=24): tdigest_b64us = base64.urlsafe_b64encode(digest[:digest_size]) return "ga4gh:SQ.{}".format(tdigest_b64us.decode("utf-8")) + def ga4gh_to_trunc512(vmc): - base64_strip = vmc.replace("ga4gh:SQ.","") + base64_strip = vmc.replace("ga4gh:SQ.", "") digest = base64.urlsafe_b64decode(base64_strip) hex_digest = binascii.hexlify(digest) - return hex_digest.decode("utf-8") + return hex_digest.decode("utf-8") + def trunc512_to_ga4gh(trunc512): - digest_length = len(trunc512)*2 + digest_length = len(trunc512) * 2 digest = binascii.unhexlify(trunc512) return _ga4gh_format(digest, digest_length) + def md5(seq): return hashlib.md5(seq.encode()).hexdigest() diff --git a/refget/refget.py b/refget/refget.py index 08468d6..7bc351b 100644 --- a/refget/refget.py +++ b/refget/refget.py @@ -12,9 +12,7 @@ _LOGGER = logging.getLogger(__name__) henge.ITEM_TYPE = "_item_type" -SCHEMA_FILEPATH = os.path.join( - os.path.dirname(__file__), - "schemas") +SCHEMA_FILEPATH = os.path.join(os.path.dirname(__file__), "schemas") sequence_schema = """description: "Schema for a single raw sequence" henge_class: sequence @@ -23,9 +21,17 @@ """ - class RefGetClient(henge.Henge): - def __init__(self, api_url_base=None, database={}, schemas=[], schemas_str=[], henges=None, checksum_function=henge.md5, suppress_connect=True): + def __init__( + self, + api_url_base=None, + database={}, + schemas=[], + schemas_str=[], + henges=None, + checksum_function=henge.md5, + suppress_connect=True, + ): """ A user interface to insert and retrieve decomposable recursive unique identifiers (DRUIDs). @@ -35,29 +41,26 @@ def __init__(self, api_url_base=None, database={}, schemas=[], schemas_str=[], h data types stored by this Henge :param function(str) -> str checksum_function: Default function to handle the digest of the serialized items stored in this henge. - """ + """ self.api_url_base = api_url_base self.local_cache = database self.info = None if not suppress_connect: self.info = self.get_service_info() - # These are the item types that this henge can understand. - if len(schemas) ==0 and len(schemas_str) == 0: + if len(schemas) == 0 and len(schemas_str) == 0: schemas_str = [sequence_schema] - super(RefGetClient, self).__init__(database, schemas, schemas_str, henges=henges, - checksum_function=checksum_function) + super(RefGetClient, self).__init__( + database, schemas, schemas_str, henges=henges, checksum_function=checksum_function + ) def refget_remote(self, digest, start=None, end=None): if not self.api_url_base: print("No remote URL connected") return - url = "{base}{digest}?accept=text/plain".format( - base=self.api_url_base, - digest=digest) - + url = "{base}{digest}?accept=text/plain".format(base=self.api_url_base, digest=digest) if start is not None and end is not None: full_retrieved = False @@ -66,11 +69,11 @@ def refget_remote(self, digest, start=None, end=None): full_retrieved = True r = requests.get(url) - result = r.content.decode('utf-8') + result = r.content.decode("utf-8") if full_retrieved: self.load_seq(result) - + return result def refget(self, digest, start=None, end=None): @@ -81,7 +84,7 @@ def refget(self, digest, start=None, end=None): result = full_data[start:end] else: result = full_data - + return result except henge.NotFoundException: return self.refget_remote(digest, start, end) @@ -96,22 +99,18 @@ def service_info(self): if not self.info: self.info = self.get_service_info() return self.info - + def meta(self, digest): - url = "{base}{digest}/metadata".format( - base=self.api_url_base, - digest=digest) - + url = "{base}{digest}/metadata".format(base=self.api_url_base, digest=digest) + r = requests.get(url) - return json.loads(r.content.decode('utf-8')) - + return json.loads(r.content.decode("utf-8")) + def get_service_info(self): - url = "{base}service-info".format( - base=self.api_url_base) - - r = requests.get(url) - return json.loads(r.content.decode('utf-8')) + url = "{base}service-info".format(base=self.api_url_base) + r = requests.get(url) + return json.loads(r.content.decode("utf-8")) def load_fasta(self, fa_file, lengths_only=False): """ @@ -123,9 +122,7 @@ def load_fasta(self, fa_file, lengths_only=False): for k in fa_object.keys(): seq = str(fa_object[k]) seq_digest = self.load_seq(seq) - asdlist.append({'name': k, - 'length': len(seq), - 'sequence_digest': seq_digest}) + asdlist.append({"name": k, "length": len(seq), "sequence_digest": seq_digest}) _LOGGER.debug(asdlist) return asdlist @@ -139,30 +136,29 @@ def load_sequence_dict(self, seqset): for k, v in seqset.items(): if isinstance(v, str): seq = v - v = {'sequence': seq} - if 'length' not in v.keys(): - if 'sequence' not in v.keys(): - _LOGGER.warning( - "Each sequence must have either length or a sequence.") + v = {"sequence": seq} + if "length" not in v.keys(): + if "sequence" not in v.keys(): + _LOGGER.warning("Each sequence must have either length or a sequence.") else: - v['length'] = len(v['sequence']) - if 'sequence' in v.keys(): - v['sequence_digest'] = self.load_seq(seq) - del v['sequence'] - if 'name' not in v.keys(): - v['name'] = k - if 'toplogy' not in v.keys(): - v['toplogy'] = 'linear' + v["length"] = len(v["sequence"]) + if "sequence" in v.keys(): + v["sequence_digest"] = self.load_seq(seq) + del v["sequence"] + if "name" not in v.keys(): + v["name"] = k + if "toplogy" not in v.keys(): + v["toplogy"] = "linear" seqset_new[k] = v - collection_checksum = self.insert(list(seqset_new.values()), 'ASDList') + collection_checksum = self.insert(list(seqset_new.values()), "ASDList") return collection_checksum, seqset_new - # Static functions below (these don't require a database) + def parse_fasta(fa_file): _LOGGER.debug("Hashing {}".format(fa_file)) try: @@ -176,4 +172,4 @@ def parse_fasta(fa_file): fa_file_unzipped = fa_file.replace(".gz", "") fa_object = pyfaidx.Fasta(fa_file_unzipped) os.system("gzip {}".format(fa_file_unzipped)) - return fa_object \ No newline at end of file + return fa_object diff --git a/refget/schemas/AnnotatedSequenceList.yaml b/refget/schemas/AnnotatedSequenceList.yaml new file mode 100644 index 0000000..320fdf9 --- /dev/null +++ b/refget/schemas/AnnotatedSequenceList.yaml @@ -0,0 +1,33 @@ +description: "Schema for List of ASDs" +henge_class: "AnnotatedSequenceList" +recursive: true +type: array +items: + description: "Schema for an Annotated Sequence Digest; a digested Sequence plus metadata" + type: object + henge_class: ASD + properties: + name: + type: string + length: + type: "integer" + topology: + type: string + enum: ["circular", "linear"] + default: "linear" + sequence: + description: "Schema for a single raw sequence" + henge_class: sequence + type: object + properties: + sequence: + type: string + description: "Actual sequence content" + required: + - sequence + required: + - length + - name + - topology + recursive: + - sequence diff --git a/refget/schemas/RawSeqCol.yaml b/refget/schemas/RawSeqCol.yaml new file mode 100644 index 0000000..1881280 --- /dev/null +++ b/refget/schemas/RawSeqCol.yaml @@ -0,0 +1,19 @@ +description: "Schema for List of ASDs" +type: array +henge_class: "RawSeqCol" +items: + description: "Schema for an Annotated Sequence Digest; a digested Sequence plus metadata" + type: object + henge_class: ASD + properties: + name: + type: string + length: + type: "integer" + sequence: + description: "Actual sequence content for a single raw sequence" + henge_class: sequence + type: string + required: + - length + - name diff --git a/refget/schemas/SeqColArraySet.yaml b/refget/schemas/SeqColArraySet.yaml new file mode 100644 index 0000000..d37158f --- /dev/null +++ b/refget/schemas/SeqColArraySet.yaml @@ -0,0 +1,27 @@ +description: "SeqColArraySet" +type: object +henge_class: "SeqColArraySet" +properties: + topologies: + type: array + henge_class: "array" + items: + type: string + enum: ["circular", "linear"] + default: "linear" + names: + type: array + henge_class: "array" + items: + type: string + lengths: + type: array + henge_class: "array" + items: + type: string + sequences: + type: array + henge_class: "seqarray" + items: + type: string + henge_class: sequence diff --git a/refget/schemas/SeqColArraySetInherent.yaml b/refget/schemas/SeqColArraySetInherent.yaml new file mode 100644 index 0000000..743cc28 --- /dev/null +++ b/refget/schemas/SeqColArraySetInherent.yaml @@ -0,0 +1,36 @@ +description: "SeqColArraySet" +type: object +henge_class: "SeqColArraySet" +properties: + topologies: + type: array + henge_class: "strarray" + items: + type: string + enum: ["circular", "linear"] + default: "linear" + names: + type: array + henge_class: "strarray" + items: + type: string + lengths: + type: array + henge_class: "intarray" + items: + type: integer + sequences: + type: array + henge_class: "seqarray" + items: + type: string + henge_class: sequence + sorted_name_length_pairs: + type: array + henge_class: "strarray" + items: + type: string +inherent: + - names + - lengths + - sequences diff --git a/refget/schemas/TASeqCol.yaml b/refget/schemas/TASeqCol.yaml new file mode 100644 index 0000000..19a4eb7 --- /dev/null +++ b/refget/schemas/TASeqCol.yaml @@ -0,0 +1,31 @@ +description: "TAseqcol" +type: object +henge_class: "TASeqCol" +properties: + topology: + type: array + henge_class: "TopologyVec" + items: + type: string + enum: ["circular", "linear"] + default: "linear" + rawseqcol: + description: "Schema for List of ASDs" + type: array + henge_class: "RawSeqCol" + items: + description: "Schema for an Annotated Sequence Digest; a digested Sequence plus metadata" + type: object + henge_class: ASD + properties: + name: + type: string + length: + type: "integer" + sequence: + type: string + henge_class: sequence + description: "Actual sequence content" + required: + - length + - name diff --git a/refget/schemas/seqcol.yaml b/refget/schemas/seqcol.yaml new file mode 100644 index 0000000..42a722b --- /dev/null +++ b/refget/schemas/seqcol.yaml @@ -0,0 +1,28 @@ +description: "A collection of biological sequences." +type: object +properties: + lengths: + type: array + collated: true + description: "Number of elements, such as nucleotides or amino acids, in each sequence." + items: + type: integer + names: + type: array + collated: true + description: "Human-readable identifiers of each sequence (chromosome names)." + items: + type: string + sequences: + type: array + collated: true + items: + type: string + description: "Digests of sequences computed using the GA4GH digest algorithm (sha512t24u)." +required: + - lengths + - names +inherent: + - lengths + - names + - sequences \ No newline at end of file diff --git a/refget/seqcol.py b/refget/seqcol.py new file mode 100644 index 0000000..ba6ff56 --- /dev/null +++ b/refget/seqcol.py @@ -0,0 +1,199 @@ +import henge +import logging +import yacman + +from itertools import compress + +from .const import * +from .utilities import * + + +_LOGGER = logging.getLogger(__name__) +henge.ITEM_TYPE = "_item_type" + + +class SeqColConf(yacman.YAMLConfigManager): + """ + Simple configuration manager object for SeqColHenge. + """ + + def __init__( + self, + entries={}, + filepath=None, + yamldata=None, + writable=False, + wait_max=60, + skip_read_lock=False, + ): + filepath = yacman.select_config( + config_filepath=filepath, config_env_vars=["SEQCOLAPI_CONFIG"], config_name="seqcol" + ) + super(SeqColConf, self).__init__(entries, filepath, yamldata, writable) + + +class SeqColHenge(henge.Henge): + """ + Extension of henge that accommodates collections of sequences. + """ + + def __init__( + self, + database={}, + schemas=None, + henges=None, + checksum_function=sha512t24u_digest, + ): + """ + A user interface to insert and retrieve decomposable recursive unique + identifiers (DRUIDs). + + :param dict database: Dict-like lookup database with sequences + and hashes + :param dict schemas: One or more jsonschema schemas describing the + data types stored by this Henge + :param function(str) -> str checksum_function: Default function to + handle the digest of the + serialized items stored in this henge. + """ + super(SeqColHenge, self).__init__( + database=database, + schemas=schemas or INTERNAL_SCHEMAS, + henges=henges, + checksum_function=checksum_function, + ) + _LOGGER.info("Initializing SeqColHenge") + + def load_fasta(self, fa_file, skip_seq=False, topology_default="linear"): + """ + Load a sequence collection into the database + + :param str fa_file: path to the FASTA file to parse and load + :param bool skip_seq: whether to disregard the actual sequences, + load just the names and lengths and topology + :param bool skip_seq: whether to disregard the actual sequences, + load just the names and lengths and topology + :param str topology_default: the default topology assigned to + every sequence + """ + # TODO: any systematic way infer topology from a FASTA file? + if topology_default not in KNOWN_TOPOS: + raise ValueError( + f"Invalid topology ({topology_default}). " f"Choose from: {','.join(KNOWN_TOPOS)}" + ) + fa_object = parse_fasta(fa_file) + aslist = [] + for k in fa_object.keys(): + seq = str(fa_object[k]) + aslist.append( + { + NAME_KEY: k, + LEN_KEY: len(seq), + TOPO_KEY: topology_default, + SEQ_KEY: {"" if skip_seq else SEQ_KEY: seq}, + } + ) + collection_checksum = self.insert(aslist, ASL_NAME) + _LOGGER.debug(f"Loaded {ASL_NAME}: {aslist}") + return collection_checksum, aslist + + def load_fasta2(self, fa_file, skip_seq=False, topology_default="linear"): + """ + Load a sequence collection into the database + + :param str fa_file: path to the FASTA file to parse and load + :param bool skip_seq: whether to disregard the actual sequences, + load just the names and lengths and topology + :param bool skip_seq: whether to disregard the actual sequences, + load just the names and lengths and topology + :param str topology_default: the default topology assigned to + every sequence + """ + # TODO: any systematic way infer topology from a FASTA file? + _LOGGER.info("Loading fasta file...") + fa_object = parse_fasta(fa_file) + aslist = [] + for k in fa_object.keys(): + seq = str(fa_object[k]) + _LOGGER.info("Loading key: {k} / Length: {l}...".format(k=k, l=len(seq))) + aslist.append( + { + NAME_KEY: k, + LEN_KEY: len(seq), + TOPO_KEY: topology_default, + SEQ_KEY: "" if skip_seq else seq, + } + ) + _LOGGER.info("Inserting into database...") + collection_checksum = self.insert(aslist, "RawSeqCol") + _LOGGER.debug(f"Loaded {ASL_NAME}: {aslist}") + return collection_checksum, aslist + + def compare_digests(self, digestA, digestB): + A = self.retrieve(digestA, reclimit=1) + B = self.retrieve(digestB, reclimit=1) + # _LOGGER.info(A) + # _LOGGER.info(B) + return compare_seqcols(A, B) + + def retrieve(self, druid, reclimit=None, raw=False): + try: + return super(SeqColHenge, self).retrieve(druid, reclimit, raw) + except henge.NotFoundException as e: + _LOGGER.debug(e) + raise e + try: + return self.refget(druid) + except Exception as e: + _LOGGER.debug(e) + raise e + return henge.NotFoundException( + "{} not found in database, or in refget.".format(druid) + ) + + def load_fasta_from_refgenie(self, rgc, refgenie_key): + """ + @param rgc RefGenConf object + @param refgenie_key key of genome to load + """ + filepath = rgc.seek(refgenie_key, "fasta") + return self.load_fasta_from_filepath(filepath) + + def load_fasta_from_filepath(self, filepath): + """ + @param filepath Path to fasta file + """ + fa_object = parse_fasta(filepath) + SCAS = fasta_obj_to_seqcol(fa_object, digest_function=self.checksum_function) + digest = self.insert(SCAS, "SeqColArraySet", reclimit=1) + return { + "fa_file": filepath, + "fa_object": fa_object, + "SCAS": SCAS, + "digest": digest, + } + + def load_from_chromsizes(self, chromsizes): + """ + @param chromsizes Path to chromsizes file + """ + SCAS = chrom_sizes_to_seqcol(chromsizes, digest_function=self.checksum_function) + digest = self.insert(SCAS, "SeqColArraySet", reclimit=1) + return { + "chromsizes_file": chromsizes, + "SCAS": SCAS, + "digest": digest, + } + + def load_multiple_fastas(self, fasta_dict): + """ + Wrapper for load_fasta_from_filepath + + @param fasta_list + """ + results = {} + for name in fasta_dict.keys(): + path = fasta_dict[name]["fasta"] + print(f"Processing fasta '{name}'' at path '{path}'...") + results[name] = self.load_fasta_from_filepath(path) + return results diff --git a/refget/seqcol_client.py b/refget/seqcol_client.py new file mode 100644 index 0000000..58f4e40 --- /dev/null +++ b/refget/seqcol_client.py @@ -0,0 +1,49 @@ +import requests + + +class SeqColClient(object): + """ + A client for interacting with a sequence collection API. + + Args: + url (str, optional): The base URL of the sequence collection API. Defaults to "http://seqcolapi.databio.org". + + Attributes: + url (str): The base URL of the sequence collection API. + + Methods: + get_collection(accession, level=2): Retrieves a sequence collection for a given accession and level. + """ + + def __init__(self, url="http://seqcolapi.databio.org"): + self.url = url + + def get_collection(self, accession, level=2): + """ + Retrieves a sequence collection for a given accession and detail level. + + Args: + accession (str): The accession of the sequence collection. + level (int, optional): The level of detail for the sequence collection. Defaults to 2. + + Returns: + dict: The JSON response containing the sequence collection. + """ + url = f"{self.url}/collection/{accession}?level={level}" + response = requests.get(url) + return response.json() + + def compare(self, accession1, accession2): + """ + Compares two sequence collections. + + Args: + accession1 (str): The accession of the first sequence collection. + accession2 (str): The accession of the second sequence collection. + + Returns: + dict: The JSON response containing the comparison of the two sequence collections. + """ + url = f"{self.url}/comparison/{accession1}/{accession2}" + response = requests.get(url) + return response.json() diff --git a/refget/utilities.py b/refget/utilities.py new file mode 100644 index 0000000..573e3ef --- /dev/null +++ b/refget/utilities.py @@ -0,0 +1,312 @@ +import base64 +import binascii +import hashlib +import json +import logging +import os +import pyfaidx + +from jsonschema import Draft7Validator +from typing import Optional, Callable +from yacman import load_yaml + +from .const import SeqCol +from .exceptions import * + +_LOGGER = logging.getLogger(__name__) + + +def trunc512_digest(seq, offset=24) -> str: + """Deprecated GA4GH digest function""" + digest = hashlib.sha512(seq.encode()).digest() + hex_digest = binascii.hexlify(digest[:offset]) + return hex_digest.decode() + + +def sha512t24u_digest(seq: str, offset: int = 24) -> str: + """GA4GH digest function""" + digest = hashlib.sha512(seq.encode()).digest() + tdigest_b64us = base64.urlsafe_b64encode(digest[:offset]) + return tdigest_b64us.decode("ascii") + + +def canonical_str(item: dict) -> str: + """Convert a dict into a canonical string representation""" + return json.dumps( + item, separators=(",", ":"), ensure_ascii=False, allow_nan=False, sort_keys=True + ) + + +def print_csc(csc: dict) -> str: + """Convenience function to pretty-print a canonical sequence collection""" + return print(json.dumps(csc, indent=2)) + + +def validate_seqcol_bool(seqcol_obj: SeqCol, schema=None) -> bool: + """ + Validate a seqcol object against the seqcol schema. Returns True if valid, False if not. + + To enumerate the errors, use validate_seqcol instead. + """ + schema_path = os.path.join(os.path.dirname(__file__), "schemas", "seqcol.yaml") + schema = load_yaml(schema_path) + validator = Draft7Validator(schema) + return validator.is_valid(seqcol_obj) + + +def validate_seqcol(seqcol_obj: SeqCol, schema=None) -> Optional[dict]: + """Validate a seqcol object against the seqcol schema. + Returns True if valid, raises InvalidSeqColError if not, which enumerates the errors. + Retrieve individual errors with exception.errors + """ + schema_path = os.path.join(os.path.dirname(__file__), "schemas", "seqcol.yaml") + schema = load_yaml(schema_path) + validator = Draft7Validator(schema) + if not validator.is_valid(seqcol_obj): + errors = sorted(validator.iter_errors(seqcol_obj), key=lambda e: e.path) + raise InvalidSeqColError("Validation failed", errors) + return True + + +def format_itemwise(csc: SeqCol) -> list: + """ + Format a SeqCol object into a list of dicts, one per sequence. + """ + list_of_dicts = [] + # TODO: handle all properties, not just these 3 + # TODO: handle non-collated attributes, somehow + for i in range(len(csc["lengths"])): + list_of_dicts.append( + { + "name": csc["names"][i], + "length": csc["lengths"][i], + "sequence": csc["sequences"][i], + } + ) + return {"sequences": list_of_dicts} + + +def parse_fasta(fa_file) -> pyfaidx.Fasta: + """ + Read in a gzipped or not gzipped FASTA file + """ + try: + return pyfaidx.Fasta(fa_file) + except pyfaidx.UnsupportedCompressionFormat: + # pyfaidx can handle bgzip but not gzip; so we just hack it here and + # gunzip the file into a temporary one and read it in not to interfere + # with the original one. + from gzip import open as gzopen + from tempfile import NamedTemporaryFile + + with gzopen(fa_file, "rt") as f_in, NamedTemporaryFile(mode="w+t", suffix=".fa") as f_out: + f_out.writelines(f_in.read()) + f_out.seek(0) + return pyfaidx.Fasta(f_out.name) + + +def chrom_sizes_to_digest(chrom_sizes_file_path: str) -> str: + """Given a chrom.sizes file, return a digest""" + seqcol_obj = chrom_sizes_to_seqcol(chrom_sizes_file_path) + return seqcol_digest(seqcol_obj) + + +def chrom_sizes_to_seqcol( + chrom_sizes_file_path: str, + digest_function: Callable[[str], str] = sha512t24u_digest, +) -> dict: + """Given a chrom.sizes file, return a canonical seqcol object""" + with open(chrom_sizes_file_path, "r") as f: + lines = f.readlines() + CSC = {"lengths": [], "names": [], "sequences": [], "sorted_name_length_pairs": []} + for line in lines: + line = line.strip() + if line == "": + continue + seq_name, seq_length, ga4gh_digest, md5_digest = line.split("\t") + snlp = {"length": seq_length, "name": seq_name} # sorted_name_length_pairs + snlp_digest = digest_function(canonical_str(snlp)) + CSC["lengths"].append(int(seq_length)) + CSC["names"].append(seq_name) + CSC["sequences"].append(ga4gh_digest) + CSC["sorted_name_length_pairs"].append(snlp_digest) + CSC["sorted_name_length_pairs"].sort() + return CSC + + +def fasta_file_to_digest(fa_file_path: str) -> str: + """Given a fasta, return a digest""" + seqcol_obj = fasta_file_to_seqcol(fa_file_path) + return seqcol_digest(seqcol_obj) + + +def fasta_file_to_seqcol(fa_file_path: str) -> dict: + """Given a fasta, return a canonical seqcol object""" + fa_obj = parse_fasta(fa_file_path) + return fasta_obj_to_seqcol(fa_obj) + + +def fasta_obj_to_seqcol( + fa_object: pyfaidx.Fasta, + verbose: bool = True, + digest_function: Callable[[str], str] = sha512t24u_digest, +) -> dict: + """ + Given a fasta object, return a CSC (Canonical Sequence Collection object) + """ + # CSC = SeqColArraySet + # Or equivalently, a "Level 1 SeqCol" + + CSC = {"lengths": [], "names": [], "sequences": [], "sorted_name_length_pairs": []} + seqs = fa_object.keys() + nseqs = len(seqs) + print(f"Found {nseqs} chromosomes") + i = 1 + for k in fa_object.keys(): + if verbose: + print(f"Processing ({i} of {nseqs}) {k}...") + seq = str(fa_object[k]) + seq_length = len(seq) + seq_name = fa_object[k].name + seq_digest = "SQ." + digest_function(seq.upper()) + snlp = {"length": seq_length, "name": seq_name} # sorted_name_length_pairs + snlp_digest = digest_function(canonical_str(snlp)) + CSC["lengths"].append(seq_length) + CSC["names"].append(seq_name) + CSC["sorted_name_length_pairs"].append(snlp_digest) + CSC["sequences"].append(seq_digest) + i += 1 + CSC["sorted_name_length_pairs"].sort() + return CSC + + +def build_sorted_name_length_pairs(obj: dict, digest_function): + """Builds the sorted_name_length_pairs attribute, which corresponds to the coordinate system""" + sorted_name_length_pairs = [] + for i in range(len(obj["names"])): + sorted_name_length_pairs.append({"length": obj["lengths"][i], "name": obj["names"][i]}) + nl_digests = [] # name-length digests + for i in range(len(sorted_name_length_pairs)): + nl_digests.append(digest_function(canonical_str(sorted_name_length_pairs[i]))) + + nl_digests.sort() + return nl_digests + + +def compare_seqcols(A: SeqCol, B: SeqCol): + """ + Workhorse comparison function + + @param A Sequence collection A + @param B Sequence collection B + @return dict Following formal seqcol specification comparison function return value + """ + validate_seqcol(A) # First ensure these are the right structure + validate_seqcol(B) + + all_keys = list(A.keys()) + list(set(B.keys()) - set(list(A.keys()))) + result = {} + + # Compute lengths of each array; only do this for array attributes + a_lengths = {} + b_lengths = {} + for k in A.keys(): + a_lengths[k] = len(A[k]) + for k in B.keys(): + b_lengths[k] = len(B[k]) + + return_obj = { + "attributes": {"a_only": [], "b_only": [], "a_and_b": []}, + "array_elements": { + "a": a_lengths, + "b": b_lengths, + "a_and_b": {}, + "a_and_b_same_order": {}, + }, + } + + for k in all_keys: + _LOGGER.info(k) + if k not in A: + result[k] = {"flag": -1} + return_obj["attributes"]["b_only"].append(k) + # return_obj["array_elements"]["total"][k] = {"a": None, "b": len(B[k])} + elif k not in B: + return_obj["attributes"]["a_only"].append(k) + # return_obj["array_elements"]["total"][k] = {"a": len(A[k]), "b": None} + else: + return_obj["attributes"]["a_and_b"].append(k) + res = _compare_elements(A[k], B[k]) + # return_obj["array_elements"]["total"][k] = {"a": len(A[k]), "b": len(B[k])} + return_obj["array_elements"]["a_and_b"][k] = res["a_and_b"] + return_obj["array_elements"]["a_and_b_same_order"][k] = res["a_and_b_same_order"] + return return_obj + + +def _compare_elements(A: list, B: list): + """ + Compare elements between two arrays. Helper function for individual elements used by workhorse compare_seqcols function + """ + + A_filtered = list(filter(lambda x: x in B, A)) + B_filtered = list(filter(lambda x: x in A, B)) + A_count = len(A_filtered) + B_count = len(B_filtered) + overlap = min(len(A_filtered), len(B_filtered)) # counts duplicates + + if A_count + B_count < 1: + # order match requires at least 2 matching elements + order = None + elif not (A_count == B_count == overlap): + # duplicated matches means order match is undefined + order = None + else: + order = A_filtered == B_filtered + return {"a_and_b": overlap, "a_and_b_same_order": order} + + +def seqcol_digest(seqcol_obj: SeqCol, schema: dict = None) -> str: + """ + Given a canonical sequence collection, compute its digest. + + :param dict seqcol_obj: Dictionary representation of a canonical sequence collection object + :param dict schema: Schema defining the inherent attributes to digest + :return str: The sequence collection digest + """ + + validate_seqcol(seqcol_obj) + # Step 1a: Remove any non-inherent attributes, + # so that only the inherent attributes contribute to the digest. + seqcol_obj2 = {} + if schema: + for k in schema["inherent"]: + # Step 2: Apply RFC-8785 to canonicalize the value + # associated with each attribute individually. + seqcol_obj2[k] = canonical_str(seqcol_obj[k]) + else: # no schema provided, so assume all attributes are inherent + for k in seqcol_obj: + seqcol_obj2[k] = canonical_str(seqcol_obj[k]) + # Step 3: Digest each canonicalized attribute value + # using the GA4GH digest algorithm. + + seqcol_obj3 = {} + for attribute in seqcol_obj2: + seqcol_obj3[attribute] = sha512t24u_digest(seqcol_obj2[attribute]) + # print(json.dumps(seqcol_obj3, indent=2)) # visualize the result + + # Step 4: Apply RFC-8785 again to canonicalize the JSON + # of new seqcol object representation. + + seqcol_obj4 = canonical_str(seqcol_obj3) + + # Step 5: Digest the final canonical representation again. + seqcol_digest = sha512t24u_digest(seqcol_obj4) + return seqcol_digest + + +def explain_flag(flag): + """Explains a compare flag""" + print(f"Flag: {flag}\nBinary: {bin(flag)}\n") + for e in range(0, 13): + if flag & 2**e: + print(FLAGS[2**e]) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 0470f30..1afd5a4 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,5 +1,6 @@ -henge==0.1.1 +biopython +henge>=0.2.1 pyfaidx +pyyaml requests yacman>=0.6.7 -pyyaml diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt new file mode 100644 index 0000000..60c9958 --- /dev/null +++ b/requirements/requirements-dev.txt @@ -0,0 +1 @@ +-e git+git://github.com/databio/henge@master#egg=henge \ No newline at end of file diff --git a/setup.py b/setup.py index f45eb89..b7cf1d9 100644 --- a/setup.py +++ b/setup.py @@ -19,10 +19,10 @@ extra["install_requires"] = DEPENDENCIES -with open("{}/_version.py".format(PACKAGE), 'r') as versionfile: +with open("{}/_version.py".format(PACKAGE), "r") as versionfile: version = versionfile.readline().split()[-1].strip("\"'\n") -long_description = open('README.md').read() +long_description = open("README.md").read() setup( name=PACKAGE, @@ -30,28 +30,28 @@ version=version, description="Python client for refget", long_description=long_description, - long_description_content_type='text/markdown', + long_description_content_type="text/markdown", classifiers=[ "Development Status :: 4 - Beta", "License :: OSI Approved :: BSD License", - "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ], keywords="genome, assembly, bioinformatics, reference, sequence", url="https://github.com/refgenie/refget", - author=u"Nathan Sheffield, Michal Stolarczyk", - author_email=u"nathan@code.databio.org", + author="Nathan Sheffield, Michal Stolarczyk", + author_email="nathan@code.databio.org", license="BSD2", entry_points={ - "console_scripts": [ - 'refget = refget.refget:main' - ], - }, + "console_scripts": ["refget = refget.refget:main"], + }, # package_data={"refget": [os.path.join("refget", "*")]}, include_package_data=True, test_suite="tests", tests_require=(["mock", "pytest"]), setup_requires=(["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else []), - **extra + **extra, ) diff --git a/tests/conftest.py b/tests/conftest.py index 37d6922..129ff0b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,13 @@ """ Test suite shared objects and setup """ + import os import pytest import oyaml as yaml +from refget.const import _schema_path def ly(n, data_path): + """Load YAML""" with open(os.path.join(data_path, n), "r") as f: return yaml.safe_load(f) @@ -14,9 +17,19 @@ def schema_path(): return os.path.join(os.path.dirname(os.path.abspath(__file__)), "schemas") +@pytest.fixture +def fa_root(): + return os.path.join( + os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir), + "demo_fasta", + ) + + @pytest.fixture def fasta_path(): - return os.path.join(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir), "demo_fasta") + return os.path.join( + os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir), "demo_fasta" + ) @pytest.fixture diff --git a/tests/test_refget.py b/tests/test_refget.py index 5f3a2ae..e53f37a 100644 --- a/tests/test_refget.py +++ b/tests/test_refget.py @@ -10,11 +10,10 @@ class TestEmptyConstructor: def test_no_schemas_required(self): assert isinstance(RefGetClient(), RefGetClient) + class TestInsert: @pytest.mark.parametrize("fasta_name", DEMO_FILES) def test_insert_works(self, fasta_name, fasta_path): rgc = RefGetClient() d = rgc.load_seq("TCGA") - assert(rgc.refget(d) == "TCGA") - - + assert rgc.refget(d) == "TCGA" diff --git a/tests/test_seqcol.py b/tests/test_seqcol.py new file mode 100644 index 0000000..85555f9 --- /dev/null +++ b/tests/test_seqcol.py @@ -0,0 +1,124 @@ +import json +import os +import pytest +import refget + +# from seqcol import SeqColHenge, validate_seqcol, compare +# from seqcol.const import * + +DEMO_FILES = [ + "demo0.fa", + "demo1.fa.gz", + "demo2.fa", + "demo3.fa", + "demo4.fa", + "demo5.fa.gz", + "demo6.fa", +] + +# Pairs of files to compare, with the "correct" compare response +COMPARE_TESTS = [ + (DEMO_FILES[1], DEMO_FILES[1], "demo_fasta/compare-1vs1.json"), + (DEMO_FILES[0], DEMO_FILES[1], "demo_fasta/compare-0vs1.json"), +] +SNLP_TESTS = [ + (DEMO_FILES[5], DEMO_FILES[6], "demo_fasta/compare-5vs6.json"), +] # sorted_name_length_pairs + + +class TestGeneral: + def test_no_schemas_required(self): + """ + In contrast to the generic Henge object, SeqColHenge does not + require schemas as input, they are predefined in the constructor + """ + assert isinstance(refget.SeqColHenge(database={}), refget.SeqColHenge) + + +class TestFastaInserting: + @pytest.mark.parametrize("fasta_name", DEMO_FILES) + def test_fasta_loading_works(self, fasta_name, fa_root): + scc = refget.SeqColHenge(database={}) + f = os.path.join(fa_root, fasta_name) + print("Fasta file to be loaded: {}".format(f)) + res = scc.load_fasta(f) + assert len(res) == 2 # returns digest and list of AnnotatedSequencesList + + +class TestRetrieval: + @pytest.mark.parametrize("fasta_name", DEMO_FILES) + def test_retrieval_works(self, fasta_name, fa_root): + scc = refget.SeqColHenge(database={}) + f = os.path.join(fa_root, fasta_name) + print("Fasta file to be loaded: {}".format(f)) + d, asds = scc.load_fasta(f) + # convert integers in the dicts to strings + # {k: str(v) if isinstance(v, int) else v for k, v in asd.items()} + lst = [{k: v for k, v in asd.items()} for asd in asds] + assert scc.retrieve(d) == lst + + +def check_comparison(fasta1, fasta2, expected_comparison): + print(f"Comparison: Fasta1: {fasta1} vs Fasta2: {fasta2}. Expected: {expected_comparison}") + scc = refget.SeqColHenge(database={}) + d = scc.load_fasta_from_filepath(fasta1) + d2 = scc.load_fasta_from_filepath(fasta2) + with open(expected_comparison) as fp: + correct_compare_response = json.load(fp) + proposed_compare_response = refget.compare_seqcols(d["SCAS"], d2["SCAS"]) + print( + json.dumps( + proposed_compare_response, + separators=(",", ":"), + ensure_ascii=False, + allow_nan=False, + sort_keys=True, + indent=2, + ) + ) + assert proposed_compare_response == correct_compare_response + + +class TestCompare: + """ + Test the compare function, using demo fasta files, and pre-computed + compare function results stored as answer files. + """ + + @pytest.mark.parametrize(["fasta1", "fasta2", "answer_file"], COMPARE_TESTS) + def test_fasta_compare(self, fasta1, fasta2, answer_file, fa_root): + check_comparison(os.path.join(fa_root, fasta1), os.path.join(fa_root, fasta2), answer_file) + + @pytest.mark.parametrize(["fasta1", "fasta2", "answer_file"], SNLP_TESTS) + def test_names_lengths_order(self, fasta1, fasta2, answer_file, fa_root): + """Does the names_lengths array correctly identify order variants""" + check_comparison(os.path.join(fa_root, fasta1), os.path.join(fa_root, fasta2), answer_file) + + +seqcol_obj = { + "lengths": [248956422, 133797422, 135086622], + "names": ["chr1", "chr2", "chr3"], + "sequences": [ + "2648ae1bacce4ec4b6cf337dcae37816", + "907112d17fcb73bcab1ed1c72b97ce68", + "1511375dc2dd1b633af8cf439ae90cec", + ], +} + +bad_seqcol = {"bogus": True} + + +class TestValidate: + """ + Test validation + """ + + @pytest.mark.parametrize(["seqcol_obj"], [[seqcol_obj]]) + def test_validate(self, seqcol_obj): + is_valid = refget.validate_seqcol(seqcol_obj) + assert is_valid + + @pytest.mark.parametrize(["seqcol_obj"], [[bad_seqcol]]) + def test_failure(self, seqcol_obj): + with pytest.raises(Exception): + refget.validate_seqcol(seqcol_obj)