Skip to content

Commit

Permalink
Merge pull request #100 from sanjaynagi/enhance-snp-dataframe-nb-11-0…
Browse files Browse the repository at this point in the history
…1-24

Improve snp-dataframe-excel functions
  • Loading branch information
sanjaynagi authored Jan 11, 2024
2 parents 4f3b6a1 + e57d4f6 commit 5c7e328
Showing 1 changed file with 97 additions and 31 deletions.
128 changes: 97 additions & 31 deletions workflow/notebooks/snp-dataframe.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -11,29 +11,89 @@
},
"outputs": [],
"source": [
"import allel\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"def read_vcf_to_excel(path, out_path):\n",
" import io\n",
" import os\n",
" with open(path, 'r') as f:\n",
" lines = [l for l in f if not l.startswith('##')]\n",
" \n",
" vcf_df = pd.read_csv(\n",
" io.StringIO(''.join(lines)),\n",
" dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,\n",
" 'QUAL': str, 'FILTER': str, 'INFO': str},\n",
" sep='\\t').rename(columns={'#CHROM': 'CHROM'})\n",
"def vcf_to_excel(vcf_path, excel_path, convert_genotypes=False, split_multiallelic=False):\n",
" # Read VCF and create a dictionary\n",
" vcf_df = vcf_to_df(vcf_path)\n",
" samples = vcf_df.columns[6:]\n",
"\n",
" # Create a DataFrame for variants\n",
" if split_multiallelic:\n",
" vcf_df = split_rows_with_multiple_alleles(vcf_df, samples)\n",
"\n",
" # Convert genotypes to 0/1/2\n",
" if convert_genotypes:\n",
" vcf_df = convert_genotype_to_alt_allele_count(vcf_df, samples)\n",
"\n",
" # Write to Excel\n",
" vcf_df.to_excel(excel_path, index=False)\n",
"\n",
"def vcf_to_df(vcf_path):\n",
" \n",
" vcf_df = vcf_df.assign(ANNOTATION=vcf_df['INFO'].str.extract(r'ANN=(.*)'))\n",
" # shift column 'Name' to first position\n",
" first_column = vcf_df.pop('ANNOTATION')\n",
" # insert column using insert(position,column_name,\n",
" # first_column) function\n",
" vcf_df.insert(8, 'ANNOTATION', first_column)\n",
" vcf_df.to_excel(out_path)\n",
" vcf_dict = allel.read_vcf(vcf_path, fields='*')\n",
" contig = vcf_dict['variants/CHROM'] \n",
" pos = vcf_dict['variants/POS']\n",
" filter_pass = vcf_dict['variants/FILTER_PASS']\n",
" ref = vcf_dict['variants/REF']\n",
" alt = np.apply_along_axis(lambda x: ','.join(x[x != '']), 1, vcf_dict['variants/ALT'])\n",
" ann = vcf_dict['variants/ANN']\n",
"\n",
" geno = allel.GenotypeArray(vcf_dict['calldata/GT'])\n",
"\n",
" ### make pd dataframe version of vcf\n",
" vcf_df = pd.DataFrame({'CHROM': contig, 'POS': pos, 'FILTER_PASS': filter_pass, 'REF': ref, 'ALT': alt, 'ANN': ann})\n",
" # make pd dataframe version of genotypes\n",
" geno_df = pd.DataFrame(geno.to_gt().astype(str), columns=vcf_dict['samples'])\n",
" vcf = pd.concat([vcf_df, geno_df], axis=1)\n",
" return vcf\n",
"\n",
"def split_rows_with_multiple_alleles(df, samples):\n",
" # Create an empty list to store the new rows\n",
" new_rows = []\n",
" # Iterate through each row\n",
" for index, row in df.iterrows():\n",
" alt_alleles = row['ALT'].split(',')\n",
" # Check if there are multiple alleles in the ALT field\n",
" if len(alt_alleles) > 1:\n",
" for allele_num, allele in enumerate(alt_alleles):\n",
" # Create a new row for each allele\n",
" new_row = row.copy()\n",
" new_row['ALT'] = allele\n",
" # Update genotype fields\n",
" for col in samples:\n",
" genotype = row[col]\n",
" # Split the genotype and process it\n",
" if genotype != './.':\n",
" gt_alleles = genotype.split('/')\n",
" new_gt = ['0' if (int(gt) != allele_num + 1 and gt != '0') else gt for gt in gt_alleles]\n",
" new_row[col] = '/'.join(new_gt)\n",
" new_rows.append(new_row)\n",
" else:\n",
" new_rows.append(row)\n",
" \n",
" return vcf_df"
" # Create a new DataFrame from the new rows\n",
" new_df = pd.DataFrame(new_rows).reset_index(drop=True)\n",
" return new_df\n",
"\n",
"def convert_genotype_to_alt_allele_count(df, samples):\n",
" nalt_df = df.copy()\n",
" # Iterate through each row\n",
" for index, row in df.iterrows():\n",
" # Update genotype fields\n",
" for col in samples:\n",
" genotype = row[col]\n",
" if genotype != './.':\n",
" # Split the genotype and count non-zero alleles\n",
" alleles = genotype.split('/')\n",
" alt_allele_count = sum([1 for allele in alleles if allele != '0'])\n",
" nalt_df.at[index, col] = alt_allele_count\n",
" else:\n",
" nalt_df.at[index, col] = np.nan\n",
"\n",
" return nalt_df"
]
},
{
Expand All @@ -48,7 +108,7 @@
},
"outputs": [],
"source": [
"dataset = \"gaard-sanger\"\n",
"dataset = \"gaard-vigg-01\"\n",
"wkdir = \"\""
]
},
Expand All @@ -75,9 +135,11 @@
},
"outputs": [],
"source": [
"vcf_df = read_vcf_to_excel(\n",
" path=f\"results/vcfs/targets/{dataset}.annot.vcf\",\n",
" out_path=f\"results/vcfs/targets/{dataset}-snps.xlsx\"\n",
"vcf_df = vcf_to_excel(\n",
" vcf_path=f\"results/vcfs/targets/{dataset}.annot.vcf\",\n",
" excel_path=f\"results/vcfs/targets/{dataset}-snps.xlsx\",\n",
" convert_genotypes=True,\n",
" split_multiallelic=True,\n",
")\n",
"pd.set_option(\"display.max_rows\", 1000, \"display.max_columns\", 1000)\n",
"vcf_df"
Expand All @@ -103,7 +165,8 @@
"outputs": [],
"source": [
"from IPython.display import display, Markdown\n",
"display(Markdown(f'<a href=\"{wkdir}/results/vcfs/targets/{dataset}-snps.xlsx\">Target SNP data (.xlsx)</a>'))"
"display(Markdown(f'<a href=\"{wkdir}/results/vcfs/targets/{dataset}-snps.xlsx\">Target SNP data (.xlsx)</a>'))\n",
"display(Markdown(f'<a href=\"{wkdir}/results/vcfs/targets/{dataset}.annot.vcf\">Target SNP data (.vcf)</a>'))"
]
},
{
Expand All @@ -125,9 +188,11 @@
},
"outputs": [],
"source": [
"vcf_df = read_vcf_to_excel(\n",
" path=f\"results/vcfs/amplicons/{dataset}.annot.vcf\",\n",
" out_path=f\"results/vcfs/amplicons/{dataset}-snps.xlsx\"\n",
"vcf_df = vcf_to_excel(\n",
" vcf_path=f\"results/vcfs/amplicons/{dataset}.annot.vcf\",\n",
" excel_path=f\"results/vcfs/amplicons/{dataset}-snps.xlsx\",\n",
" convert_genotypes=True,\n",
" split_multiallelic=True\n",
")\n",
"vcf_df"
]
Expand All @@ -151,16 +216,17 @@
},
"outputs": [],
"source": [
"display(Markdown(f'<a href=\"{wkdir}/results/vcfs/amplicons/{dataset}-snps.xlsx\">Whole amplicon SNP data (.xlsx)</a>'))"
"display(Markdown(f'<a href=\"{wkdir}/results/vcfs/amplicons/{dataset}-snps.xlsx\">Whole amplicon SNP data (.xlsx)</a>'))\n",
"display(Markdown(f'<a href=\"{wkdir}/results/vcfs/amplicons/{dataset}.annot.vcf\">Whole amplicon SNP data (.vcf)</a>'))"
]
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "pythonGenomics",
"display_name": "base",
"language": "python",
"name": "pythongenomics"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -172,7 +238,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
"version": "3.11.6"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 5c7e328

Please sign in to comment.