diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 0d94560..0000000 --- a/docs/README.md +++ /dev/null @@ -1,46 +0,0 @@ -# Refget - -## Introduction - -The refget package provides a Python interface to both remote and local use of the refget protocol. This package serves 4 functions: - -1. A lightweight python interface to a remote refget API. - -2. Local caching of retrieved results, improving performance for applications that require repeated lookups. - -3. A fully functioning local implementation of the refget protocol for local analysis backed by either memory, SQLite, or MongoDB. - -4. Convenience functions for computing refget checksums from python and handling FASTA files directly. - -## Install - -```console -pip install refget -``` - -## Basic use - -### Retrieve results from a RESTful API - -```python -import refget - -rgc = refget.RefGetClient("https://refget.herokuapp.com/sequence/") -rgc.refget("6681ac2f62509cfc220d78751b8dc524", start=0, end=10) - -``` - -### Compute digests locally - -```python -refget.trunc512_digest("TCGA") -``` - -### Insert and retrieve sequences with a local database - -```python -checksum = rgc.load_seq("GGAA") -rgc.refget(checksum) -``` - -For more details, see the [tutorial](tutorial.md). diff --git a/docs/autodoc_build/refget.md b/docs/autodoc_build/refget.md deleted file mode 100644 index 6f38ada..0000000 --- a/docs/autodoc_build/refget.md +++ /dev/null @@ -1,110 +0,0 @@ - - - - - -# Package `refget` Documentation - -## Class `RefGetClient` -```python -def __init__(self, api_url_base=None, database={}, schemas=None, henges=None, checksum_function=, suppress_connect=True) -``` - -A user interface to insert and retrieve decomposable recursive unique identifiers (DRUIDs). -#### Parameters: - -- `database` (`dict`): Dict-like lookup database with sequences and hashes. -- `schemas` (`dict`): One or more jsonschema schemas describing thedata types stored by this Henge -- `checksum_function` (`function(str) -> str`): Default function to handle the digest of theserialized items stored in this henge. - - - - -```python -def get_service_info(self) -``` - - - -```python -def item_types(self) -``` - -A list of item types handled by this Henge instance - - - -```python -def load_fasta(self, fa_file, lengths_only=False) -``` - -Calculates checksums and loads each sequence in a fasta file into the database. - - - -```python -def load_seq(self, seq) -``` - - - -```python -def load_sequence_dict(self, seqset) -``` - -Convert a 'seqset', which is a dict with names as sequence names and values as sequences, into the 'asdlist' required for henge insert. - - - -```python -def meta(self, digest) -``` - - - -```python -def refget(self, digest, start=None, end=None) -``` - - - -```python -def refget_remote(self, digest, start=None, end=None) -``` - - - -```python -def service_info(self) -``` - - - - - - -*Version Information: `refget` v0.0.1, generated by `lucidoc` v0.4.2* \ No newline at end of file diff --git a/docs/changelog.md b/docs/changelog.md deleted file mode 100644 index a067237..0000000 --- a/docs/changelog.md +++ /dev/null @@ -1,11 +0,0 @@ -# Changelog - -This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. - -## [0.1.0] - 2021-06-17 - -First public version, backed by henge version 0.1.1. - -## [0.0.1] - 2020-06-25 - -Beta version for testing diff --git a/docs_jupyter/build/.gitignore b/docs_jupyter/build/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/docs_jupyter/build/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/docs_jupyter/tutorial.ipynb b/docs_jupyter/tutorial.ipynb deleted file mode 100644 index acff965..0000000 --- a/docs_jupyter/tutorial.ipynb +++ /dev/null @@ -1,690 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Refget python package tutorial" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Record some versions:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'3.8.5'" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from platform import python_version \n", - "python_version()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'0.1.0'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import refget\n", - "refget.__version__" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Computing digests locally" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from refget import trunc512_digest" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Show some results for sequence digests:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36'" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trunc512_digest('ACGT')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce'" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trunc512_digest('TCGA')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36cf70'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trunc512_digest('ACGT', 26)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Connecting to a remote API\n", - "\n", - "The refget package provides a simple python wrapper around a remote hosted refget RESTful API. Provide the base url when construction a RefGetClient object and you can retrieve sequences from the remote server." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "rgc = refget.RefGetClient(\"https://refget.herokuapp.com/sequence/\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'CCACACCACA'" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=10)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACC'" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=50)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also hit the `{digest}/metadata` and `service_info` API endpoints described in the refget API specification:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'metadata': {'aliases': [{'alias': 'ga4gh:SQ.lZyxiD_ByprhOUzrR1o1bq0ezO_1gkrn',\n", - " 'naming_authority': 'ga4gh'},\n", - " {'alias': 'I', 'naming_authority': 'unknown'}],\n", - " 'length': 230218,\n", - " 'md5': '6681ac2f62509cfc220d78751b8dc524',\n", - " 'trunc512': '959cb1883fc1ca9ae1394ceb475a356ead1ecceff5824ae7'}}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.meta(\"6681ac2f62509cfc220d78751b8dc524\")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'service': {'algorithms': ['ga4gh', 'md5', 'trunc512'],\n", - " 'circular_supported': True,\n", - " 'subsequence_limit': None,\n", - " 'supported_api_versions': ['1.0.0']}}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.service_info" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When requesting a sequence that is not found, the service responds appropriately:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Not Found'" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(trunc512_digest('TCGATCGA'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Use a local database for caching\n", - "\n", - "By default, any full-sequences retrieved from an API are cached locally in memory (in a Python Dict). This data will not persist past a current session, but is useful if you have an application that requires repeated requests. here, we re-request the sequence requested above. It is much faster this time because it uses a local cache:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'CCACACCACA'" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also add new sequences into the database:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Not Found'" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(refget.md5('TCGATCGA')) # This sequence is not found in our database yet" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "checksum = rgc.load_seq(\"TCGATCGA\") # So, let's add it into database" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'TCGATCGA'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(checksum) # This time it returns" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Keep in mind that sequences added in this way are added to your *local* database, not to the remote API, so when we restart, they will be gone:" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "del rgc" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Not Found'" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc = refget.RefGetClient(\"https://refget.herokuapp.com/sequence/\")\n", - "rgc.refget(refget.md5('TCGA'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Making data persist\n", - "\n", - "If you want to retain your local cache, you can use a Dict that is backed by some persistent storage, such as a database on disk or another running process. There are many ways to do this, for example, you can use an sqlite database, a Redis database, or a MongoDB database. Here we'll show you how to use the `sqlitedict` package to back your local database.\n", - "\n", - "To start, you need to create a dict object and pass that to the RefGetClient constructor." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "import refget\n", - "from sqlitedict import SqliteDict\n", - "mydict = SqliteDict('./my_db.sqlite', autocommit=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "rgc = refget.RefGetClient(\"https://refget.herokuapp.com/sequence/\", mydict)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now when we retrieve a sequence it will be added to the local sqlite database automatically." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACC'" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=50)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Look, we can see that this object has been added to our sqlite database:" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'CACACCACACCCACACACCCACACACCACACCACACACCACACCACACC'" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mydict[\"6681ac2f62509cfc220d78751b8dc524\"][1:50]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So now if we kill this object and start it up again *without the API connection*, but with the mydict local backend, we can still retrieve it:" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "del rgc" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "rgc = refget.RefGetClient(database=mydict)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACC'" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(\"6681ac2f62509cfc220d78751b8dc524\", start=0, end=50)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Loading a fasta file\n", - "\n", - "The package also comes with a helper function for computing checksums for an entire fasta file." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "fa_file = \"../demo_fasta/demo.fa\"\n", - "content = rgc.load_fasta(fa_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'name': 'chr1',\n", - " 'length': 4,\n", - " 'sequence_digest': 'f1f8f4bf413b16ad135722aa4591043e'},\n", - " {'name': 'chr2',\n", - " 'length': 4,\n", - " 'sequence_digest': '45d0ff9f1a9504cf2039f89c1ffb4c32'}]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "content" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'ACGT'" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(content[0]['sequence_digest'])" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No remote URL connected\n" - ] - } - ], - "source": [ - "rgc.refget(\"blah\")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "rgc.api_url_base = \"https://refget.herokuapp.com/sequence/\"" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Not Found'" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rgc.refget(\"blah\")" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "# You can show the complete contents of the database like this:\n", - "# rgc.show()\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}