diff --git a/aaanalysis/data_handling/_read_fasta.py b/aaanalysis/data_handling/_read_fasta.py index e30036e6..58185ffc 100644 --- a/aaanalysis/data_handling/_read_fasta.py +++ b/aaanalysis/data_handling/_read_fasta.py @@ -92,9 +92,9 @@ def read_fasta(file_path: str, ----- Each ``FASTA`` file entry consists of two parts: - - 'FASTA header': Starting with '>', the header contains the main id and additional information, + - **FASTA header**: Starting with '>', the header contains the main id and additional information, all separated by a specified separator. - - 'Sequence': Sequence of specific entry, directly following the header + - **Sequence**: Sequence of specific entry, directly following the header ``df_seq`` includes at least these columns: @@ -108,6 +108,7 @@ def read_fasta(file_path: str, Examples -------- + .. include:: examples/read_fasta.rst """ # Check input diff --git a/aaanalysis/data_handling/_to_fasta.py b/aaanalysis/data_handling/_to_fasta.py index 7a798f6b..b3ed6b07 100644 --- a/aaanalysis/data_handling/_to_fasta.py +++ b/aaanalysis/data_handling/_to_fasta.py @@ -23,7 +23,7 @@ def to_fasta(df=None, col_db=None, cols_info=None, sep="|"): - """...""" + """""" # Check input ut.check_file(file_path=file_path) ut.check_str(name="col_id", val=col_id, accept_none=False) diff --git a/examples/data_handling/data/example_FASTA.fasta b/examples/data_handling/data/example_FASTA.fasta new file mode 100644 index 00000000..bf9902f0 --- /dev/null +++ b/examples/data_handling/data/example_FASTA.fasta @@ -0,0 +1,24 @@ +>SEMA4A,38.4 +LAAQQSYWPHFVTVTVLFALVLSGALIILVASPLRALRARG +>SEMA4B,47.0 +WGADRSYWKEFLVMCTLFVLAVLLPVLFLLYRHRNSMKVFL +>SEMA4C,86.6 +EARAPLENLGLVWLAVVALGAVCLVLLLLVLSLRRRLREEL +>SEMA4D,19.1 +TMYLKSSDNRLLMSLFLFFFVLFLCLFFYNCYKGYLPRQCL +>SEMA4F,88.5 +RDAPSRAHTVGAGLAGFFLGILAASLTLILIGRRQQRRRQR +>SEMA4G,49.9 +GAQLAPDVRLLYVLAIAALGGLCLILASSLLYVACLREGRR +>SEMA5A,28.1 +EEKRCGEFNMFHMIAVGLSSSILGCLLTLLVYTYCQRYQQQ +>SEMA5B,30.0 +TDCAGFNLIHLVATGISCFLGSGLLTLAVYLSCQHCQRQSQ +>SEMA6A,21.1 +KGHDQLVPVTLLAIAVILAFVMGAVFSGITVYCVCDHRRKD +>SEMA6B,80.0 +VSVNLLVTSSVAAFVVGAVVSGFSVGWFVGLRERRELARRK +>SEMA6C,46.8 +ASASRSVPIPLLLASVAAAFALGASVSGLLVSCACRRAHRR +>SEMA6D,37.7 +GESNQMVHMNVLITCVFAAFVLGAFIAGVAVYCYRDMFVRK diff --git a/examples/data_handling/load_dataset.ipynb b/examples/data_handling/load_dataset.ipynb index 6edcab6a..4583fa95 100644 --- a/examples/data_handling/load_dataset.ipynb +++ b/examples/data_handling/load_dataset.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "outputs": [ { "name": "stdout", @@ -24,7 +24,7 @@ { "data": { "text/plain": "", - "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 LevelDataset# Sequences# Amino acids# Positives# NegativesPredictorDescriptionReferenceLabel
1Amino acidAA_CASPASE3233185605705184900PROSPERousPrediction of c...3 cleavage siteSong et al., 20181 (adjacent to ... cleavage site)
2Amino acidAA_FURIN715900316358840PROSPERousPrediction of f...n cleavage siteSong et al., 20181 (adjacent to ... cleavage site)
3Amino acidAA_LDR3421182483546982779IDP-Seq2SeqPrediction of l...d regions (LDR)Tang et al., 20201 (disordered), 0 (ordered)
4Amino acidAA_MMP25733129762416310560PROSPERousPrediction of M...) cleavage siteSong et al., 20181 (adjacent to ... cleavage site)
5Amino acidAA_RNABIND22155001649248509GMKSVM-RUPrediction of R...(RBP60 dataset)Yang et al., 20211 (binding), 0 (non-binding)
6Amino acidAA_SA23318560510108284523PROSPERousPrediction of s...PASE3 data set)Song et al., 20181 (exposed/acce...non-accessible)
7SequenceSEQ_AMYLO14148484511903ReRF-PredPrediction of a...ognenic regionsTeng et al. 20211 (amyloidogeni...-amyloidogenic)
8SequenceSEQ_CAPSID7935336468038644071VIRALproPrediction of capdsid proteinsGaliez et al., 20161 (capsid prote...capsid protein)
9SequenceSEQ_DISULFIDE25476144708971650DiproPrediction of d...es in sequencesCheng et al., 20061 (sequence wit...ithout SS bond)
10SequenceSEQ_LOCATION18357323981045790nanPrediction of s...lasma membrane)Shen et al., 20191 (protein in c...asma membrane)
11SequenceSEQ_SOLUBLE17408443226987048704SOLproPrediction of s...oluble proteinsMagnan et al., 20091 (soluble), 0 (insoluble)
12SequenceSEQ_TAIL6668267169025744094VIRALproPrediction of tail proteinsGaliez et al., 20161 (tail protein...n-tail protein)
13DomainDOM_GSEC126929646363nanPrediction of g...tase substratesBreimann et al, 2024c1 (substrate), ...(non-substrate)
14DomainDOM_GSEC_PU694494524630nanPrediction of g...es (PU dataset)Breimann et al, 2024c1 (substrate), ...bstrate status)
\n" + "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 LevelDataset# Sequences# Amino acids# Positives# NegativesPredictorDescriptionReferenceLabel
1Amino acidAA_CASPASE3233185605705184900PROSPERousPrediction of c...3 cleavage siteSong et al., 20181 (adjacent to ... cleavage site)
2Amino acidAA_FURIN715900316358840PROSPERousPrediction of f...n cleavage siteSong et al., 20181 (adjacent to ... cleavage site)
3Amino acidAA_LDR3421182483546982779IDP-Seq2SeqPrediction of l...d regions (LDR)Tang et al., 20201 (disordered), 0 (ordered)
4Amino acidAA_MMP25733129762416310560PROSPERousPrediction of M...) cleavage siteSong et al., 20181 (adjacent to ... cleavage site)
5Amino acidAA_RNABIND22155001649248509GMKSVM-RUPrediction of R...(RBP60 dataset)Yang et al., 20211 (binding), 0 (non-binding)
6Amino acidAA_SA23318560510108284523PROSPERousPrediction of s...PASE3 data set)Song et al., 20181 (exposed/acce...non-accessible)
7SequenceSEQ_AMYLO14148484511903ReRF-PredPrediction of a...ognenic regionsTeng et al. 20211 (amyloidogeni...-amyloidogenic)
8SequenceSEQ_CAPSID7935336468038644071VIRALproPrediction of capdsid proteinsGaliez et al., 20161 (capsid prote...capsid protein)
9SequenceSEQ_DISULFIDE25476144708971650DiproPrediction of d...es in sequencesCheng et al., 20061 (sequence wit...ithout SS bond)
10SequenceSEQ_LOCATION18357323981045790nanPrediction of s...lasma membrane)Shen et al., 20191 (protein in c...asma membrane)
11SequenceSEQ_SOLUBLE17408443226987048704SOLproPrediction of s...oluble proteinsMagnan et al., 20091 (soluble), 0 (insoluble)
12SequenceSEQ_TAIL6668267169025744094VIRALproPrediction of tail proteinsGaliez et al., 20161 (tail protein...n-tail protein)
13DomainDOM_GSEC126929646363nanPrediction of g...tase substratesBreimann et al, 2024c1 (substrate), ...(non-substrate)
14DomainDOM_GSEC_PU694494524630nanPrediction of g...es (PU dataset)Breimann et al, 2024c1 (substrate), ...bstrate status)
\n" }, "metadata": {}, "output_type": "display_data" @@ -38,8 +38,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-01-09T16:04:09.197319963Z", - "start_time": "2024-01-09T16:04:09.174231205Z" + "end_time": "2024-05-02T21:22:21.499870925Z", + "start_time": "2024-05-02T21:22:20.213918848Z" } }, "id": "initial_id" diff --git a/examples/data_handling/read_fasta.ipynb b/examples/data_handling/read_fasta.ipynb new file mode 100644 index 00000000..1b9ec329 --- /dev/null +++ b/examples/data_handling/read_fasta.ipynb @@ -0,0 +1,145 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "You can read FASTA files using the ``read_fasta()`` function:" + ], + "metadata": { + "collapsed": false + }, + "id": "f204a08b42e761b5" + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [ + { + "data": { + "text/plain": "", + "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 entrysequence
1SEMA4A,38.4LAAQQSYWPHFVTVT...IILVASPLRALRARG
2SEMA4B,47.0WGADRSYWKEFLVMC...LFLLYRHRNSMKVFL
3SEMA4C,86.6EARAPLENLGLVWLA...LLLVLSLRRRLREEL
4SEMA4D,19.1TMYLKSSDNRLLMSL...FFYNCYKGYLPRQCL
5SEMA4F,88.5RDAPSRAHTVGAGLA...TLILIGRRQQRRRQR
6SEMA4G,49.9GAQLAPDVRLLYVLA...ASSLLYVACLREGRR
7SEMA5A,28.1EEKRCGEFNMFHMIA...LTLLVYTYCQRYQQQ
8SEMA5B,30.0TDCAGFNLIHLVATG...LAVYLSCQHCQRQSQ
9SEMA6A,21.1KGHDQLVPVTLLAIA...SGITVYCVCDHRRKD
10SEMA6B,80.0VSVNLLVTSSVAAFV...WFVGLRERRELARRK
11SEMA6C,46.8ASASRSVPIPLLLAS...SGLLVSCACRRAHRR
12SEMA6D,37.7GESNQMVHMNVLITC...AGVAVYCYRDMFVRK
\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import aaanalysis as aa\n", + "file_path = \"data/example_FASTA.fasta\"\n", + "df_seq = aa.read_fasta(file_path)\n", + "aa.display_df(df_seq)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-02T21:28:50.165467820Z", + "start_time": "2024-05-02T21:28:50.048853581Z" + } + }, + "id": "6221c1f3f94b587" + }, + { + "cell_type": "markdown", + "source": [ + "To adjust the names of the columns for the primary FASTA file information, use ``col_id`` and ``col_seq``:" + ], + "metadata": { + "collapsed": false + }, + "id": "6195ed555b5e7680" + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [ + { + "data": { + "text/plain": "", + "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 ENTRYSQUENCE
1SEMA4A,38.4LAAQQSYWPHFVTVT...IILVASPLRALRARG
2SEMA4B,47.0WGADRSYWKEFLVMC...LFLLYRHRNSMKVFL
3SEMA4C,86.6EARAPLENLGLVWLA...LLLVLSLRRRLREEL
4SEMA4D,19.1TMYLKSSDNRLLMSL...FFYNCYKGYLPRQCL
5SEMA4F,88.5RDAPSRAHTVGAGLA...TLILIGRRQQRRRQR
6SEMA4G,49.9GAQLAPDVRLLYVLA...ASSLLYVACLREGRR
7SEMA5A,28.1EEKRCGEFNMFHMIA...LTLLVYTYCQRYQQQ
8SEMA5B,30.0TDCAGFNLIHLVATG...LAVYLSCQHCQRQSQ
9SEMA6A,21.1KGHDQLVPVTLLAIA...SGITVYCVCDHRRKD
10SEMA6B,80.0VSVNLLVTSSVAAFV...WFVGLRERRELARRK
11SEMA6C,46.8ASASRSVPIPLLLAS...SGLLVSCACRRAHRR
12SEMA6D,37.7GESNQMVHMNVLITC...AGVAVYCYRDMFVRK
\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df_seq = aa.read_fasta(file_path, col_id=\"ENTRY\", col_seq=\"SQUENCE\")\n", + "aa.display_df(df_seq)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-02T21:28:50.965212794Z", + "start_time": "2024-05-02T21:28:50.944687925Z" + } + }, + "id": "8d4c33fedb070859" + }, + { + "cell_type": "markdown", + "source": [ + "The ``col_id`` column should only contain the unique identifier. If the FASTA file comprises additional information, use the ``sep`` (default='|') argument to save them in additional columns:" + ], + "metadata": { + "collapsed": false + }, + "id": "d0ed30c599bcde82" + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [ + { + "data": { + "text/plain": "", + "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 entrysequence
1SEMA4ALAAQQSYWPHFVTVT...IILVASPLRALRARG
2SEMA4BWGADRSYWKEFLVMC...LFLLYRHRNSMKVFL
3SEMA4CEARAPLENLGLVWLA...LLLVLSLRRRLREEL
4SEMA4DTMYLKSSDNRLLMSL...FFYNCYKGYLPRQCL
5SEMA4FRDAPSRAHTVGAGLA...TLILIGRRQQRRRQR
6SEMA4GGAQLAPDVRLLYVLA...ASSLLYVACLREGRR
7SEMA5AEEKRCGEFNMFHMIA...LTLLVYTYCQRYQQQ
8SEMA5BTDCAGFNLIHLVATG...LAVYLSCQHCQRQSQ
9SEMA6AKGHDQLVPVTLLAIA...SGITVYCVCDHRRKD
10SEMA6BVSVNLLVTSSVAAFV...WFVGLRERRELARRK
11SEMA6CASASRSVPIPLLLAS...SGLLVSCACRRAHRR
12SEMA6DGESNQMVHMNVLITC...AGVAVYCYRDMFVRK
\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df_seq = aa.read_fasta(file_path, sep=\",\")\n", + "aa.display_df(df_seq)\n" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-02T21:28:51.741497271Z", + "start_time": "2024-05-02T21:28:51.718363188Z" + } + }, + "id": "c63b0719186b520e" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + }, + "id": "12c40961e7f22ae4" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/plotting_prelude.ipynb b/tutorials/plotting_prelude.ipynb index 94812712..025745f6 100644 --- a/tutorials/plotting_prelude.ipynb +++ b/tutorials/plotting_prelude.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", @@ -25,10 +25,7 @@ ], "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-02-07T18:53:58.140699276Z", - "start_time": "2024-02-07T18:53:58.140097399Z" - } + "is_executing": true }, "id": "2673a6d600050969" },