Skip to content

Commit

Permalink
Merge pull request #3 from NCI-GDC/feature/binf-687-modify-sample
Browse files Browse the repository at this point in the history
[BINF-687] Add modify vcf sample script
  • Loading branch information
czyszCTDS authored Jun 21, 2022
2 parents fda6ae8 + bef2ecc commit 440532d
Show file tree
Hide file tree
Showing 9 changed files with 164 additions and 9 deletions.
12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@ default_language_version:
python_venv: python3.8

repos:
- repo: git@github.com:Yelp/detect-secrets
rev: v1.0.3
- repo: https://github.com/Yelp/detect-secrets
rev: v1.2.0
hooks:
- id: detect-secrets
args: ['--baseline', '.secrets.baseline']
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.4.0
rev: v4.3.0
hooks:
- id: check-yaml
- id: check-toml
Expand All @@ -18,16 +18,16 @@ repos:
args: ["--allow-missing-credentials"]
- id: detect-private-key
- repo: https://github.com/pycqa/isort
rev: 5.8.0
rev: 5.10.1
hooks:
- id: isort
name: isort
- repo: https://github.com/psf/black
rev: 20.8b1
rev: 22.3.0
hooks:
- id: black
- repo: https://github.com/pycqa/flake8
rev: '3.9.2'
rev: '4.0.1'
hooks:
- id: flake8
additional_dependencies: [flake8-docstrings]
Expand Down
11 changes: 10 additions & 1 deletion .secrets.baseline
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "1.0.3",
"version": "1.2.0",
"plugins_used": [
{
"name": "ArtifactoryDetector"
Expand Down Expand Up @@ -97,6 +97,15 @@
"pattern": [
"^.secrets.baseline$"
]
},
{
"path": "detect_secrets.filters.heuristic.is_lock_file"
},
{
"path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string"
},
{
"path": "detect_secrets.filters.heuristic.is_swagger_file"
}
],
"results": {
Expand Down
6 changes: 6 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ COPY requirements.txt /opt/dist

WORKDIR /opt/dist

# Compression libraries (bz2, lzma, zlib); presumably required by pysam, which
# this commit adds to requirements.txt -- TODO confirm.
# apt-get is used instead of apt because apt's CLI is not stable for scripts,
# and the package index is removed in the same layer to keep the image small.
RUN apt-get update -y \
    && apt-get install -y \
        libbz2-dev \
        liblzma-dev \
        zlib1g \
    && rm -rf /var/lib/apt/lists/*

RUN pip install -r requirements.txt \
&& pip install *.tar.gz \
&& rm -f *.tar.gz requirements.txt
Expand Down
6 changes: 4 additions & 2 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@ cfgv==3.1.0
# via pre-commit
chardet==3.0.4
# via requests
click==8.0.1
# via black
click==8.1.3
# via
# -c requirements.txt
# black
coverage==5.2.1
# via pytest-cov
detect-secrets==1.0.3
Expand Down
93 changes: 93 additions & 0 deletions gatk4_mutect2_tool/modify_vcf_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#!/usr/bin/env python3
"""
Rename the sample columns in the header of a GATK4.1.2 Mutect2 VCF to "TUMOR" and "NORMAL".
@author: Shenglai Li
"""
import logging
from typing import Optional

import click
import pysam

# Module-level logger named after this module; basicConfig() is called at
# import time and requests INFO-level logging on the root logger.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Type aliases used in the signatures of modify_vcf_sample() and main().
# NOTE(review): main() receives BAM paths under the *_bam names, while
# modify_vcf_sample() is called with the sample names extracted from them.
tumor_bamT = str
normal_bamT = Optional[str]  # None when no normal BAM is supplied
vcfT = str
outputT = str


def get_sample_name(bam: str) -> str:
    '''
    Get sample name from BAM file

    The name is the SM tag of the first read group (@RG) in the BAM header.

    :param bam: path to the BAM file
    :return: value of the SM tag of the first read group
    :raises KeyError: if the header has no RG section or the first read
        group has no SM tag
    '''
    sample: str
    # Use a context manager so the file handle is released even when the
    # header lookup fails; the original left the AlignmentFile open.
    with pysam.AlignmentFile(bam, 'rb') as b:
        sample = b.header['RG'][0]['SM']  # type: ignore
    return sample


def _rename_sample_columns(
    header: str, tumor: str, normal: Optional[str]
) -> str:
    '''
    Return the #CHROM header line with the sample columns renamed.

    Only the sample columns (everything after the nine fixed VCF columns
    #CHROM..FORMAT) are considered, and a column is renamed only when it
    equals the sample name exactly, so a name that is a substring of another
    column can never corrupt the header.

    :param header: the #CHROM line, without trailing newline
    :param tumor: tumor sample name, renamed to "TUMOR"
    :param normal: normal sample name, renamed to "NORMAL"; skipped if falsy
    :return: the rewritten header line, without trailing newline
    :raises ValueError: if a requested sample name is not a sample column,
        or if tumor and normal names are identical
    '''
    columns = header.split('\t')
    fixed, samples = columns[:9], columns[9:]
    renames = {}
    if normal:
        if normal not in samples:
            raise ValueError(
                f'Unable to find normal sample tag in the vcf file. {normal}'
            )
        renames[normal] = 'NORMAL'
    if tumor not in samples or tumor in renames:
        # "tumor in renames" means both BAMs carry the same sample name, in
        # which case the tumor column cannot be told apart from the normal.
        raise ValueError(
            f'Unable to find tumor sample tag in the vcf file. {tumor}'
        )
    renames[tumor] = 'TUMOR'
    # Map against the original names in one pass so that renaming one sample
    # cannot influence the lookup of the other.
    return '\t'.join(fixed + [renames.get(name, name) for name in samples])


def modify_vcf_sample(
    tumor_bam: tumor_bamT, normal_bam: normal_bamT, vcf: vcfT, output: outputT
) -> None:
    '''
    Modify VCF sample in the header

    Copies the bgzipped VCF to ``output``, renaming the tumor/normal sample
    columns of the #CHROM line to "TUMOR"/"NORMAL", then tabix-indexes the
    result.

    :param tumor_bam: tumor sample name as it appears in the #CHROM line
    :param normal_bam: normal sample name, or None to rename only the tumor
    :param vcf: path to the bgzipped input VCF
    :param output: path of the bgzipped VCF to write
    :raises ValueError: if a sample name is not found in the #CHROM line.
        The original logged the failure and still indexed the truncated
        output, so callers could not detect the error; now no index is built.
    '''
    out_vcf = output
    reader = pysam.BGZFile(vcf, mode='rb')  # type: ignore
    try:
        writer = pysam.BGZFile(out_vcf, mode='wb')  # type: ignore
        try:
            # Iteration yields lines without their trailing newline, so it
            # is re-added on write. Only the #CHROM line needs decoding;
            # all other lines are copied through as raw bytes.
            for line in reader:
                if line.startswith(b'#CHROM'):
                    line = _rename_sample_columns(
                        line.decode('utf-8'), tumor_bam, normal_bam
                    ).encode('utf-8')
                writer.write(line + b'\n')
        finally:
            writer.close()
    finally:
        reader.close()
    # Index only after a complete, successful copy.
    pysam.tabix_index(out_vcf, preset='vcf', force=True)


@click.command()
@click.option(
    '--tumor_bam',
    required=True,
    help='Tumor BAM; the SM tag of its first read group is the tumor sample.',
)
@click.option('--vcf', required=True, help='Bgzipped input VCF.')
@click.option(
    '--output',
    required=True,
    help='Bgzipped output VCF to write; it is tabix-indexed afterwards.',
)
@click.option(
    '--normal_bam',
    required=False,
    help='Normal BAM; the SM tag of its first read group is the normal sample.',
)
def main(
    tumor_bam: tumor_bamT,
    vcf: vcfT,
    output: outputT,
    normal_bam: normal_bamT = None,
) -> None:
    '''
    Rename the sample columns of a Mutect2 VCF to "TUMOR" and "NORMAL".

    Sample names are read from the BAM headers and replaced in the #CHROM
    line of the VCF. Without --normal_bam only the tumor column is renamed.
    '''
    # click shows the docstring above as the command description in --help;
    # it previously read just "main".
    tumor_sample = get_sample_name(tumor_bam)
    logger.info(f'{tumor_sample=}')
    normal_sample = None
    if normal_bam:
        normal_sample = get_sample_name(normal_bam)
        logger.info(f'{normal_sample=}')
    modify_vcf_sample(tumor_sample, normal_sample, vcf, output)


# Invoke the click command when this module is executed as a script.
if __name__ == "__main__":
    main()

# __END__
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,9 @@ python_version = 3.8
disallow_untyped_defs = true
warn_return_any = true
warn_unused_configs = true

[[tool.mypy.overrides]]
module = [
"pysam"
]
ignore_missing_imports = true
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@
#
# pip-compile --output-file=requirements.txt
#
click==8.1.3
# via gatk4-mutect2-tool (setup.py)
pysam==0.19.1
# via gatk4-mutect2-tool (setup.py)
3 changes: 3 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ classifiers =

[options]
install_requires =
click
pysam
zip_safe = False
include_package_data = True
packages = find:
Expand All @@ -25,6 +27,7 @@ setup_requires =
[options.entry_points]
console_scripts =
gatk4_mutect2_tool = gatk4_mutect2_tool.__main__:main
modify_vcf_sample = gatk4_mutect2_tool.modify_vcf_sample:main

[coverage:run]
branch = true
Expand Down
32 changes: 32 additions & 0 deletions tests/test_modify_vcf_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env python3

import unittest

from click.testing import CliRunner

from gatk4_mutect2_tool import modify_vcf_sample as MOD


class ThisTestCase(unittest.TestCase):
    """Command-line invocation check for the modify_vcf_sample entry point."""

    def setUp(self) -> None:
        """Build the option mapping and its flattened argv-style list."""
        super().setUp()
        self.cli_args_dict = {
            "--normal_bam": "normal.bam",
            "--tumor_bam": "tumor.bam",
            "--vcf": "test.vcf",
            "--output": "output.vcf",
        }
        # Flatten {option: value} into [option, value, option, value, ...].
        self.cli_args_list = [
            token for pair in self.cli_args_dict.items() for token in pair
        ]

    def cli_args(self):
        """Invoke the command with the prepared arguments; expect exit code 0."""
        # NOTE(review): this method name lacks the "test" prefix, so the
        # default unittest loader does not collect it. The BAM/VCF paths in
        # setUp are relative names with no fixtures visible here -- confirm
        # before renaming it to a collected test.
        outcome = CliRunner().invoke(MOD.main, self.cli_args_list)
        self.assertEqual(outcome.exit_code, 0)


if __name__ == "__main__":
unittest.main()

# __END__

0 comments on commit 440532d

Please sign in to comment.