diff --git a/docs/tutorial/map-gene-ids.ipynb b/docs/tutorial/map-gene-ids.ipynb new file mode 100644 index 0000000..6c9dee7 --- /dev/null +++ b/docs/tutorial/map-gene-ids.ipynb @@ -0,0 +1,356 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Map Gene IDs\n", + "\n", + "This tutorial will show you how to directly call the GeneWeaver API to map Gene IDs within a species. For this example, we will map Gene Symbols to Ensemble Gene IDs.\n", + "\n", + "To get started, you can \n", + " \"Open\n", + "" + ], + "metadata": { + "id": "IxAq_kYX5xZB" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "9sNkibUK22so" + }, + "outputs": [], + "source": [ + "import requests" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Initialize Gene IDs\n", + "First, you will need to initalize a list of the Gene IDs you want to map. Here, we load identifiers from a file.\n", + "\n", + "After the end of this step you will need to have a list of gene IDs as shown below." + ], + "metadata": { + "id": "qqSZt-Ef6HSP" + } + }, + { + "cell_type": "code", + "source": [ + "with open('gene_names.txt', 'r') as file:\n", + " file_content = file.read().strip()\n", + " ids = [id.strip('\"') for id in file_content.split(',')]" + ], + "metadata": { + "id": "dNjGLcYT3H0r" + }, + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "len(ids)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "EQLB21_j5HG0", + "outputId": "65ef2d27-b0d7-447e-ab23-00dbc35a3172" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "32285" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ] + }, + { + "cell_type": "code", + "source": [ + "ids[:5]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "z7Hz1w8B5OO8", + "outputId": "943c118b-3e99-4290-9bbe-bf4367e46805" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['Xkr4', 'Gm1992', 'Gm19938', 'Gm37381', 'Rp1']" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Call The GeneWeaver ReST API\n", + "\n", + "First we'll construct our ReST call." + ], + "metadata": { + "id": "crIVfKOL7ESB" + } + }, + { + "cell_type": "code", + "source": [ + "payload = {\n", + " \"source_ids\": ids,\n", + " \"target_gene_id_type\": \"Ensemble Gene\",\n", + " \"species\": \"Mus Musculus\"\n", + "}" + ], + "metadata": { + "id": "qdMft1Un3K4-" + }, + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The GeneWeaver API uses JSON, so let's specufy that in our request headers." + ], + "metadata": { + "id": "cLlJEtDi7k33" + } + }, + { + "cell_type": "code", + "source": [ + "headers = {\n", + " 'accept': 'application/json',\n", + " 'Content-Type': 'application/json'\n", + "}" + ], + "metadata": { + "id": "f-Y4Wrm93LSc" + }, + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "response = requests.post('https://geneweaver.jax.org/api/genes/mapping', json=payload, headers=headers)" + ], + "metadata": { + "id": "wSFcduFY3Npg" + }, + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Process Results\n", + "\n", + "The mapping endpoint will return a dictionary with a list of results available on the `gene_ids_map` key. Let's process that into a dictionary with the original IDs as keys and te new IDs as values." + ], + "metadata": { + "id": "TuO5nDxx7wNX" + } + }, + { + "cell_type": "code", + "source": [ + "mapping = {\n", + " r[\"original_ref_id\"]: r[\"mapped_ref_id\"]\n", + " for r in response.json()['gene_ids_map']\n", + "}" + ], + "metadata": { + "id": "bvkt1cef3OiA" + }, + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "We can now easily access a list of our mapped IDs." + ], + "metadata": { + "id": "jwg4YvS07-EK" + } + }, + { + "cell_type": "code", + "source": [ + "mapped_ids = list(mapping.values())" + ], + "metadata": { + "id": "Rm4fdKMt5c8L" + }, + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "len(mapped_ids)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KW-WnSfg4xau", + "outputId": "e2987b55-735b-4d14-e6c4-863a5346239f" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "32050" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "mapped_ids[:5]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NeNORUE35mnZ", + "outputId": "bb2dc823-b39a-47cb-97b8-9652c42bc11f" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['ENSMUSG00000066586',\n", + " 'ENSMUSG00000027596',\n", + " 'ENSMUSG00000030359',\n", + " 'ENSMUSG00000027597',\n", + " 'ENSMUSG00000019986']" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "And we can see which IDs couldn't be mapped." + ], + "metadata": { + "id": "QgmtwpaQ8CL1" + } + }, + { + "cell_type": "code", + "source": [ + "unmapped_ids = [\n", + " _id for _id in ids if _id not in mapping\n", + "]\n" + ], + "metadata": { + "id": "qI-la0Fe3S6Y" + }, + "execution_count": 12, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "len(unmapped_ids)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "j_hgdLrT4SJW", + "outputId": "7ab04cb4-eb7a-41e4-c0b3-ec75689d7565" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "235" + ] + }, + "metadata": {}, + "execution_count": 13 + } + ] + }, + { + "cell_type": "code", + "source": [ + "unmapped_ids[:5]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ZXYJoV7G5hli", + "outputId": "8ce62cd9-afb5-4eeb-af77-c21e11e04c79" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['Gm28653', 'Ptp4a1.1', 'Arhgef4.1', 'Asdurf', 'AC169382.1']" + ] + }, + "metadata": {}, + "execution_count": 14 + } + ] + } + ] +} diff --git a/mkdocs.yml b/mkdocs.yml index f6df332..aae7440 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -99,6 +99,7 @@ nav: - Tutorial: - tutorial/index.md - Client Login: tutorial/geneweaver_client_login.ipynb + - Map Gene IDs: tutorial/map-gene-ids.ipynb - NCI-60 Example Workflow: tutorial/nci_60_example_01.ipynb # - Geneweaver Command Line Interface: # - tutorial/geneweaver_command_line.md