Skip to content
This repository has been archived by the owner on Dec 18, 2023. It is now read-only.

Commit

Permalink
Merge pull request #67 from thehyve/age_at_diagnosis
Browse files Browse the repository at this point in the history
Age at diagnosis
  • Loading branch information
Spayralbe authored Jun 7, 2019
2 parents f0a19ce + 269f112 commit 0d9e3a6
Show file tree
Hide file tree
Showing 11 changed files with 101 additions and 10 deletions.
8 changes: 8 additions & 0 deletions definitions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from pathlib import Path

# Project root: the resolved directory that contains this file.
ROOT_DIR = Path(__file__).parent.resolve()

# Locations derived from the project root.
TEST_DATA_DIR = ROOT_DIR / 'test_data'
TESTS_DIR = ROOT_DIR / 'tests'
TEST_EXPECTED_OUT_DIR = TESTS_DIR / 'expected_out'
8 changes: 7 additions & 1 deletion scripts/csr_transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,12 +496,18 @@ def calculate_age_at_diagnosis(csr, colname, date_format='%Y-%m-%d'):

for individual in dt_csr[ind].unique():
subset = dt_csr[dt_csr[ind]==individual]

# Skip if individual does not have diagnosis data
if subset.empty:
continue

subset = subset.sort_values('DIAGNOSIS_DATE')
birth_date = subset.loc[(subset['BIRTH_DATE'].notnull()) & (subset[dia].isnull()),'BIRTH_DATE']
first_diagnosis_date = subset.loc[subset.first_valid_index(),'DIAGNOSIS_DATE']
if birth_date.empty:
if birth_date.empty or pd.isnull(first_diagnosis_date):
logger.warning('Assigning NaN age at diagnosis for {}. Diagnosis date: {} - Birth date: {}'. \
format(individual, first_diagnosis_date, birth_date.values[0]))
csr.loc[(csr[ind] == individual) & (csr[dia].isnull()), colname] = pd.np.nan
continue
try:
#days = (first_diagnosis_date - birth_date).dt.days.values[0]
Expand Down
4 changes: 4 additions & 0 deletions test_data/missing_diagnosis_date/biomaterial.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
biomaterial_id src_biosource_id src_biomaterial_id type biomaterial_date
BM1 BS1 RNA 12-10-2017
BM2 BS2 DNA 22-11-2017
BM3 BS3 RNA 12-12-2017
4 changes: 4 additions & 0 deletions test_data/missing_diagnosis_date/biosource.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
biosource_id biosource_dedicated individual_id diagnosis_id src_biosource_id tissue biosource_date disease_status tumor_percentage
BS1 Yes P1 D1 BS1 medula 12-03-2017 ST1 5
BS2 P2 D2 BS2 cortex 01-04-2017 ST2 3
BS3 P3 BS3 cortex 14-05-2017 ST1 2
1 change: 1 addition & 0 deletions test_data/missing_diagnosis_date/codebook.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1 GENDER m male f female u unknown
Expand Down
3 changes: 3 additions & 0 deletions test_data/missing_diagnosis_date/diagnosis.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
diagnosis_id individual_id topography treatment_protocol tumor_type tumor_stage diagnosis_date center_treatment
D1 P1 liver chemo neuroblastoma IV 01-05-2016 Center 1
D2 P2 kidney surgery nephroblastoma III Center 2
4 changes: 4 additions & 0 deletions test_data/missing_diagnosis_date/individual.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
individual_id taxonomy birth_date gender death_date IC_type IC_given_date IC_withdrawn_date IC_material IC_data IC_linking_ext report_her_susc report_inc_finding
P1 Human 01-02-1993 f yes 01-03-2017 yes yes NA yes yes
P2 Human 02-03-1992 m yes 10-04-2017 yes yes NA yes yes
P3 Human 03-04-1994 f yes 11-05-2017 yes NA not applicable yes yes
11 changes: 11 additions & 0 deletions test_data/missing_diagnosis_date/individual_study.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
study_id individual_id individual_study_id
STUDY1 P1 1
STUDY1 P2 2
STUDY1 P3 3
STUDY1 P4 4
STUDY1 P5 5
STUDY2 P5 6
STUDY2 P6 7
STUDY2 P7 8
STUDY2 P8 9
STUDY2 P9 10
3 changes: 3 additions & 0 deletions test_data/missing_diagnosis_date/study.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
study_id acronym title datadictionary
STUDY1 STD1 Study 1 http://www.example.com
STUDY2 STD2 Study 2 http://www.example.com
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
INDIVIDUAL_ID BIOMATERIAL_ID DIAGNOSIS_ID BIOSOURCE_ID SRC_BIOSOURCE_ID BIOMATERIAL_DATE BIOSOURCE_DATE BIOSOURCE_DEDICATED BIRTH_DATE CENTER_TREATMENT DEATH_DATE DIAGNOSIS_DATE DISEASE_STATUS GENDER IC_DATA IC_GIVEN_DATE IC_LINKING_EXT IC_MATERIAL IC_TYPE IC_WITHDRAWN_DATE REPORT_HER_SUSC REPORT_INC_FINDING SRC_BIOMATERIAL_ID TAXONOMY TISSUE TOPOGRAPHY TREATMENT_PROTOCOL TUMOR_PERCENTAGE TUMOR_STAGE TUMOR_TYPE TYPE
P1 1993-02-01 f yes 2017-03-01 yes yes yes yes Human
P2 1992-03-02 m yes 2017-04-10 yes yes yes yes Human
P3 1994-04-03 f 2017-05-11 not applicable yes yes yes yes Human
P1 D1 Center 1 2016-05-01 liver chemo IV neuroblastoma
P2 D2 Center 2 kidney surgery III nephroblastoma
P1 D1 BS1 BS1 2017-03-12 Yes ST1 medula 5
P2 D2 BS2 BS2 2017-04-01 ST2 cortex 3
P3 BS3 BS3 2017-05-14 ST1 cortex 2
P1 BM1 D1 BS1 BS1 2017-10-12 RNA
P2 BM2 D2 BS2 BS2 2017-11-22 DNA
P3 BM3 BS3 BS3 2017-12-12 RNA
53 changes: 44 additions & 9 deletions tests/test_csr_transformations.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
import unittest
import os
import csr_transformations as ct

import tempfile
import unittest

import pandas as pd

import scripts.csr_transformations as ct
from definitions import ROOT_DIR, TEST_DATA_DIR, TEST_EXPECTED_OUT_DIR


# TODO: Refactor test cases to not rely on production config

class CsrTransformationTests(unittest.TestCase):

def setUp(self):
    """Point the test case at its fixture and config directories.

    Every location is built from the ``TEST_DATA_DIR`` / ``ROOT_DIR``
    constants, so the paths do not depend on the current working
    directory.
    """
    # The earlier './test_data/...' and './config' string assignments were
    # overwritten immediately below and have been dropped as dead stores.
    self.default_data = TEST_DATA_DIR.joinpath('default_data')
    self.dummy_test_data = TEST_DATA_DIR.joinpath('dummy_data')
    self.missing_diag_data = TEST_DATA_DIR.joinpath('missing_diagnosis_date')
    self.test_config = TEST_DATA_DIR.joinpath('test_config')
    self.config = ROOT_DIR.joinpath('config')

def tearDown(self):
    """Do nothing after each test."""
    return None
Expand Down Expand Up @@ -45,6 +49,39 @@ def test_csr_transformation(self):
output_study_filename_path = os.path.join(output_dir, output_study_filename)
self.assertTrue(os.path.exists(output_study_filename_path))

def test_calculate_age_at_diagnosis(self):
    """Check that the CSR pipeline handles a missing first diagnosis date.

    Runs ``ct.csr_transformation`` on the ``missing_diagnosis_date`` fixtures
    and compares the written ``csr_transformation_data.tsv`` with the
    expected output file, after ordering the columns of both frames by name.
    """
    # given
    output_filename = 'csr_transformation_data.tsv'
    output_study_filename = 'study_registry.tsv'

    # TemporaryDirectory deletes the scratch directory when the block exits,
    # including when the transformation raises; mkdtemp() left it behind.
    with tempfile.TemporaryDirectory() as output_dir:
        # when
        ct.csr_transformation(
            input_dir=self.missing_diag_data,
            output_dir=output_dir,

            config_dir=self.config,
            data_model='data_model.json',
            column_priority='column_priority.json',
            file_headers='file_headers.json',
            columns_to_csr='columns_to_csr.json',

            output_filename=output_filename,
            output_study_filename=output_study_filename
        )

        # Load the result before the directory is removed.
        csr_output_path = os.path.join(output_dir, output_filename)
        csr_df = pd.read_csv(csr_output_path, sep='\t')

    # then
    reference_df_path = TEST_EXPECTED_OUT_DIR.joinpath('missing_diagnosis_date', 'csr_transformation_data.tsv')
    reference_df = pd.read_csv(reference_df_path, sep='\t')

    # Column order is not part of the comparison, so sort both by name.
    reference_df = reference_df.reindex(sorted(reference_df.columns), axis=1)
    csr_df = csr_df.reindex(sorted(csr_df.columns), axis=1)

    self.assertTrue(
        reference_df.equals(csr_df),
        'CSR output differs from expected file {}'.format(reference_df_path)
    )

def test_read_dict_from_file(self):
ref_dict = {'patient': ['age', 'date', 'gender'],
'single_item': 'item',
Expand All @@ -65,8 +102,6 @@ def test_validate_empty_source_file(self):
value = ct.validate_source_file(file_prop_dict, source_file, 'file_headers.json')
self.assertTrue(value)



def test_get_overlapping_columns(self):
file_prop_dict = ct.read_dict_from_file('file_headers.json', self.config)
header_map = ct.read_dict_from_file('columns_to_csr.json', self.config)
Expand Down

0 comments on commit 0d9e3a6

Please sign in to comment.