Merge pull request #100 from sanjaynagi/enhance-snp-dataframe-nb-11-0…

…1-24 Improve snp-dataframe-excel functions
sanjaynagi · Jan 11, 2024 · 5c7e328 · 5c7e328
2 parents 4f3b6a1 + e57d4f6
commit 5c7e328
Showing 1 changed file with 97 additions and 31 deletions.
diff --git a/workflow/notebooks/snp-dataframe.ipynb b/workflow/notebooks/snp-dataframe.ipynb
@@ -11,29 +11,89 @@
    },
    "outputs": [],
    "source": [
+    "import allel\n",
     "import pandas as pd\n",
+    "import numpy as np\n",
     "\n",
-    "def read_vcf_to_excel(path, out_path):\n",
-    "    import io\n",
-    "    import os\n",
-    "    with open(path, 'r') as f:\n",
-    "        lines = [l for l in f if not l.startswith('##')]\n",
-    "        \n",
-    "    vcf_df = pd.read_csv(\n",
-    "        io.StringIO(''.join(lines)),\n",
-    "        dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,\n",
-    "               'QUAL': str, 'FILTER': str, 'INFO': str},\n",
-    "        sep='\\t').rename(columns={'#CHROM': 'CHROM'})\n",
+    "def vcf_to_excel(vcf_path, excel_path, convert_genotypes=False, split_multiallelic=False):\n",
+    "    # Read VCF and create a dictionary\n",
+    "    vcf_df = vcf_to_df(vcf_path)\n",
+    "    samples = vcf_df.columns[6:]\n",
+    "\n",
+    "    # Create a DataFrame for variants\n",
+    "    if split_multiallelic:\n",
+    "        vcf_df = split_rows_with_multiple_alleles(vcf_df, samples)\n",
+    "\n",
+    "    # Convert genotypes to 0/1/2\n",
+    "    if convert_genotypes:\n",
+    "        vcf_df = convert_genotype_to_alt_allele_count(vcf_df, samples)\n",
+    "\n",
+    "    # Write to Excel\n",
+    "    vcf_df.to_excel(excel_path, index=False)\n",
+    "\n",
+    "def vcf_to_df(vcf_path):\n",
     "    \n",
-    "    vcf_df = vcf_df.assign(ANNOTATION=vcf_df['INFO'].str.extract(r'ANN=(.*)'))\n",
-    "    # shift column 'Name' to first position\n",
-    "    first_column = vcf_df.pop('ANNOTATION')\n",
-    "    # insert column using insert(position,column_name,\n",
-    "    # first_column) function\n",
-    "    vcf_df.insert(8, 'ANNOTATION', first_column)\n",
-    "    vcf_df.to_excel(out_path)\n",
+    "    vcf_dict = allel.read_vcf(vcf_path, fields='*')\n",
+    "    contig = vcf_dict['variants/CHROM'] \n",
+    "    pos = vcf_dict['variants/POS']\n",
+    "    filter_pass = vcf_dict['variants/FILTER_PASS']\n",
+    "    ref = vcf_dict['variants/REF']\n",
+    "    alt = np.apply_along_axis(lambda x: ','.join(x[x != '']), 1, vcf_dict['variants/ALT'])\n",
+    "    ann = vcf_dict['variants/ANN']\n",
+    "\n",
+    "    geno = allel.GenotypeArray(vcf_dict['calldata/GT'])\n",
+    "\n",
+    "    ### make pd dataframe version of vcf\n",
+    "    vcf_df = pd.DataFrame({'CHROM': contig, 'POS': pos, 'FILTER_PASS': filter_pass, 'REF': ref, 'ALT': alt, 'ANN': ann})\n",
+    "    # make pd dataframe version of genotypes\n",
+    "    geno_df = pd.DataFrame(geno.to_gt().astype(str), columns=vcf_dict['samples'])\n",
+    "    vcf = pd.concat([vcf_df, geno_df], axis=1)\n",
+    "    return vcf\n",
+    "\n",
+    "def split_rows_with_multiple_alleles(df, samples):\n",
+    "    # Create an empty list to store the new rows\n",
+    "    new_rows = []\n",
+    "    # Iterate through each row\n",
+    "    for index, row in df.iterrows():\n",
+    "        alt_alleles = row['ALT'].split(',')\n",
+    "        # Check if there are multiple alleles in the ALT field\n",
+    "        if len(alt_alleles) > 1:\n",
+    "            for allele_num, allele in enumerate(alt_alleles):\n",
+    "                # Create a new row for each allele\n",
+    "                new_row = row.copy()\n",
+    "                new_row['ALT'] = allele\n",
+    "                # Update genotype fields\n",
+    "                for col in samples:\n",
+    "                    genotype = row[col]\n",
+    "                    # Split the genotype and process it\n",
+    "                    if genotype != './.':\n",
+    "                        gt_alleles = genotype.split('/')\n",
+    "                        new_gt = ['0' if (int(gt) != allele_num + 1 and gt != '0') else gt for gt in gt_alleles]\n",
+    "                        new_row[col] = '/'.join(new_gt)\n",
+    "                new_rows.append(new_row)\n",
+    "        else:\n",
+    "            new_rows.append(row)\n",
     "    \n",
-    "    return vcf_df"
+    "    # Create a new DataFrame from the new rows\n",
+    "    new_df = pd.DataFrame(new_rows).reset_index(drop=True)\n",
+    "    return new_df\n",
+    "\n",
+    "def convert_genotype_to_alt_allele_count(df, samples):\n",
+    "    nalt_df = df.copy()\n",
+    "    # Iterate through each row\n",
+    "    for index, row in df.iterrows():\n",
+    "        # Update genotype fields\n",
+    "        for col in samples:\n",
+    "                genotype = row[col]\n",
+    "                if genotype != './.':\n",
+    "                    # Split the genotype and count non-zero alleles\n",
+    "                    alleles = genotype.split('/')\n",
+    "                    alt_allele_count = sum([1 for allele in alleles if allele != '0'])\n",
+    "                    nalt_df.at[index, col] = alt_allele_count\n",
+    "                else:\n",
+    "                    nalt_df.at[index, col] = np.nan\n",
+    "\n",
+    "    return nalt_df"
    ]
   },
   {
@@ -48,7 +108,7 @@
    },
    "outputs": [],
    "source": [
-    "dataset = \"gaard-sanger\"\n",
+    "dataset = \"gaard-vigg-01\"\n",
     "wkdir = \"\""
    ]
   },
@@ -75,9 +135,11 @@
    },
    "outputs": [],
    "source": [
-    "vcf_df = read_vcf_to_excel(\n",
-    "    path=f\"results/vcfs/targets/{dataset}.annot.vcf\",\n",
-    "    out_path=f\"results/vcfs/targets/{dataset}-snps.xlsx\"\n",
+    "vcf_df = vcf_to_excel(\n",
+    "    vcf_path=f\"results/vcfs/targets/{dataset}.annot.vcf\",\n",
+    "    excel_path=f\"results/vcfs/targets/{dataset}-snps.xlsx\",\n",
+    "    convert_genotypes=True,\n",
+    "    split_multiallelic=True,\n",
     ")\n",
     "pd.set_option(\"display.max_rows\", 1000, \"display.max_columns\", 1000)\n",
     "vcf_df"
@@ -103,7 +165,8 @@
    "outputs": [],
    "source": [
     "from IPython.display import display, Markdown\n",
-    "display(Markdown(f'<a href=\"{wkdir}/results/vcfs/targets/{dataset}-snps.xlsx\">Target SNP data (.xlsx)</a>'))"
+    "display(Markdown(f'<a href=\"{wkdir}/results/vcfs/targets/{dataset}-snps.xlsx\">Target SNP data (.xlsx)</a>'))\n",
+    "display(Markdown(f'<a href=\"{wkdir}/results/vcfs/targets/{dataset}.annot.vcf\">Target SNP data (.vcf)</a>'))"
    ]
   },
   {
@@ -125,9 +188,11 @@
    },
    "outputs": [],
    "source": [
-    "vcf_df = read_vcf_to_excel(\n",
-    "    path=f\"results/vcfs/amplicons/{dataset}.annot.vcf\",\n",
-    "    out_path=f\"results/vcfs/amplicons/{dataset}-snps.xlsx\"\n",
+    "vcf_df = vcf_to_excel(\n",
+    "    vcf_path=f\"results/vcfs/amplicons/{dataset}.annot.vcf\",\n",
+    "    excel_path=f\"results/vcfs/amplicons/{dataset}-snps.xlsx\",\n",
+    "    convert_genotypes=True,\n",
+    "    split_multiallelic=True\n",
     ")\n",
     "vcf_df"
    ]
@@ -151,16 +216,17 @@
    },
    "outputs": [],
    "source": [
-    "display(Markdown(f'<a href=\"{wkdir}/results/vcfs/amplicons/{dataset}-snps.xlsx\">Whole amplicon SNP data (.xlsx)</a>'))"
+    "display(Markdown(f'<a href=\"{wkdir}/results/vcfs/amplicons/{dataset}-snps.xlsx\">Whole amplicon SNP data (.xlsx)</a>'))\n",
+    "display(Markdown(f'<a href=\"{wkdir}/results/vcfs/amplicons/{dataset}.annot.vcf\">Whole amplicon SNP data (.vcf)</a>'))"
    ]
   }
  ],
  "metadata": {
   "celltoolbar": "Tags",
   "kernelspec": {
-   "display_name": "pythonGenomics",
+   "display_name": "base",
    "language": "python",
-   "name": "pythongenomics"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -172,7 +238,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.15"
+   "version": "3.11.6"
   }
  },
  "nbformat": 4,