diff --git a/workflow/notebooks/snp-dataframe.ipynb b/workflow/notebooks/snp-dataframe.ipynb index 7140f4a..f3d7450 100644 --- a/workflow/notebooks/snp-dataframe.ipynb +++ b/workflow/notebooks/snp-dataframe.ipynb @@ -11,29 +11,89 @@ }, "outputs": [], "source": [ + "import allel\n", "import pandas as pd\n", + "import numpy as np\n", "\n", - "def read_vcf_to_excel(path, out_path):\n", - " import io\n", - " import os\n", - " with open(path, 'r') as f:\n", - " lines = [l for l in f if not l.startswith('##')]\n", - " \n", - " vcf_df = pd.read_csv(\n", - " io.StringIO(''.join(lines)),\n", - " dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,\n", - " 'QUAL': str, 'FILTER': str, 'INFO': str},\n", - " sep='\\t').rename(columns={'#CHROM': 'CHROM'})\n", + "def vcf_to_excel(vcf_path, excel_path, convert_genotypes=False, split_multiallelic=False):\n", + " # Read VCF and create a dictionary\n", + " vcf_df = vcf_to_df(vcf_path)\n", + " samples = vcf_df.columns[6:]\n", + "\n", + " # Create a DataFrame for variants\n", + " if split_multiallelic:\n", + " vcf_df = split_rows_with_multiple_alleles(vcf_df, samples)\n", + "\n", + " # Convert genotypes to 0/1/2\n", + " if convert_genotypes:\n", + " vcf_df = convert_genotype_to_alt_allele_count(vcf_df, samples)\n", + "\n", + " # Write to Excel\n", + " vcf_df.to_excel(excel_path, index=False)\n", + "\n", + "def vcf_to_df(vcf_path):\n", " \n", - " vcf_df = vcf_df.assign(ANNOTATION=vcf_df['INFO'].str.extract(r'ANN=(.*)'))\n", - " # shift column 'Name' to first position\n", - " first_column = vcf_df.pop('ANNOTATION')\n", - " # insert column using insert(position,column_name,\n", - " # first_column) function\n", - " vcf_df.insert(8, 'ANNOTATION', first_column)\n", - " vcf_df.to_excel(out_path)\n", + " vcf_dict = allel.read_vcf(vcf_path, fields='*')\n", + " contig = vcf_dict['variants/CHROM'] \n", + " pos = vcf_dict['variants/POS']\n", + " filter_pass = vcf_dict['variants/FILTER_PASS']\n", + " ref = vcf_dict['variants/REF']\n", + " alt = np.apply_along_axis(lambda x: ','.join(x[x != '']), 1, vcf_dict['variants/ALT'])\n", + " ann = vcf_dict['variants/ANN']\n", + "\n", + " geno = allel.GenotypeArray(vcf_dict['calldata/GT'])\n", + "\n", + " ### make pd dataframe version of vcf\n", + " vcf_df = pd.DataFrame({'CHROM': contig, 'POS': pos, 'FILTER_PASS': filter_pass, 'REF': ref, 'ALT': alt, 'ANN': ann})\n", + " # make pd dataframe version of genotypes\n", + " geno_df = pd.DataFrame(geno.to_gt().astype(str), columns=vcf_dict['samples'])\n", + " vcf = pd.concat([vcf_df, geno_df], axis=1)\n", + " return vcf\n", + "\n", + "def split_rows_with_multiple_alleles(df, samples):\n", + " # Create an empty list to store the new rows\n", + " new_rows = []\n", + " # Iterate through each row\n", + " for index, row in df.iterrows():\n", + " alt_alleles = row['ALT'].split(',')\n", + " # Check if there are multiple alleles in the ALT field\n", + " if len(alt_alleles) > 1:\n", + " for allele_num, allele in enumerate(alt_alleles):\n", + " # Create a new row for each allele\n", + " new_row = row.copy()\n", + " new_row['ALT'] = allele\n", + " # Update genotype fields\n", + " for col in samples:\n", + " genotype = row[col]\n", + " # Split the genotype and process it\n", + " if genotype != './.':\n", + " gt_alleles = genotype.split('/')\n", + " new_gt = ['0' if (int(gt) != allele_num + 1 and gt != '0') else gt for gt in gt_alleles]\n", + " new_row[col] = '/'.join(new_gt)\n", + " new_rows.append(new_row)\n", + " else:\n", + " new_rows.append(row)\n", " \n", - " return vcf_df" + " # Create a new DataFrame from the new rows\n", + " new_df = pd.DataFrame(new_rows).reset_index(drop=True)\n", + " return new_df\n", + "\n", + "def convert_genotype_to_alt_allele_count(df, samples):\n", + " nalt_df = df.copy()\n", + " # Iterate through each row\n", + " for index, row in df.iterrows():\n", + " # Update genotype fields\n", + " for col in samples:\n", + " genotype = row[col]\n", + " if genotype != './.':\n", + " # Split the genotype and count non-zero alleles\n", + " alleles = genotype.split('/')\n", + " alt_allele_count = sum([1 for allele in alleles if allele != '0'])\n", + " nalt_df.at[index, col] = alt_allele_count\n", + " else:\n", + " nalt_df.at[index, col] = np.nan\n", + "\n", + " return nalt_df" ] }, { @@ -48,7 +108,7 @@ }, "outputs": [], "source": [ - "dataset = \"gaard-sanger\"\n", + "dataset = \"gaard-vigg-01\"\n", "wkdir = \"\"" ] }, @@ -75,9 +135,11 @@ }, "outputs": [], "source": [ - "vcf_df = read_vcf_to_excel(\n", - " path=f\"results/vcfs/targets/{dataset}.annot.vcf\",\n", - " out_path=f\"results/vcfs/targets/{dataset}-snps.xlsx\"\n", + "vcf_df = vcf_to_excel(\n", + " vcf_path=f\"results/vcfs/targets/{dataset}.annot.vcf\",\n", + " excel_path=f\"results/vcfs/targets/{dataset}-snps.xlsx\",\n", + " convert_genotypes=True,\n", + " split_multiallelic=True,\n", ")\n", "pd.set_option(\"display.max_rows\", 1000, \"display.max_columns\", 1000)\n", "vcf_df" @@ -103,7 +165,8 @@ "outputs": [], "source": [ "from IPython.display import display, Markdown\n", - "display(Markdown(f'Target SNP data (.xlsx)'))" + "display(Markdown(f'Target SNP data (.xlsx)'))\n", + "display(Markdown(f'Target SNP data (.vcf)'))" ] }, { @@ -125,9 +188,11 @@ }, "outputs": [], "source": [ - "vcf_df = read_vcf_to_excel(\n", - " path=f\"results/vcfs/amplicons/{dataset}.annot.vcf\",\n", - " out_path=f\"results/vcfs/amplicons/{dataset}-snps.xlsx\"\n", + "vcf_df = vcf_to_excel(\n", + " vcf_path=f\"results/vcfs/amplicons/{dataset}.annot.vcf\",\n", + " excel_path=f\"results/vcfs/amplicons/{dataset}-snps.xlsx\",\n", + " convert_genotypes=True,\n", + " split_multiallelic=True\n", ")\n", "vcf_df" ] @@ -151,16 +216,17 @@ }, "outputs": [], "source": [ - "display(Markdown(f'Whole amplicon SNP data (.xlsx)'))" + "display(Markdown(f'Whole amplicon SNP data (.xlsx)'))\n", + "display(Markdown(f'Whole amplicon SNP data (.vcf)'))" ] } ], "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "pythonGenomics", + "display_name": "base", "language": "python", - "name": "pythongenomics" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -172,7 +238,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.15" + "version": "3.11.6" } }, "nbformat": 4,