Skip to content

Commit

Permalink
Format Correction
Browse files Browse the repository at this point in the history
  • Loading branch information
saanikaaa committed Jun 19, 2024
1 parent 3036fef commit aeb24a8
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 18 deletions.
22 changes: 14 additions & 8 deletions scripts/biomedical/NCBI_Assembly/Summary_Report/scripts/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def generate_column(df):
df['gc_percent_name'] = 'Percent ' + df['gc_percent'].astype(str)
return df


def refseq_category(df):
"""Replacing columns values to required property value.
Args:
Expand All @@ -46,6 +47,7 @@ def refseq_category(df):
conversion_to_refseq_category).fillna(df['refseq_category'])
return df


def tax_id(df):
"""Replacing taxid column value as required.
Args:
Expand All @@ -56,9 +58,10 @@ def tax_id(df):
"""
df['taxid'] = df.apply(lambda row: '' if str(row['taxid']) == str(row[
'species_taxid']) else TAX_ID_DCID_MAPPING.get(row['taxid'], ''),
axis=1)
axis=1)
return df


def species_tax_id(df):
"""Replacing species_taxid column value with tax_id_dcid_mapping.txt.
Args:
Expand All @@ -71,6 +74,7 @@ def species_tax_id(df):
df['species_taxid'])
return df


def infraspecific_name(df):
"""Replacing unwanted char from infraspecific_name and organism_name column value.
Args:
Expand All @@ -84,6 +88,7 @@ def infraspecific_name(df):
']', '')
return df


def format_correction(df):
"""Replacing unwanted char from infraspecific_name and isolate column value.
Args:
Expand All @@ -97,6 +102,7 @@ def format_correction(df):
return df



def assembly_level(df):
"""Replacing columns values to required property value.
Args:
Expand Down Expand Up @@ -161,6 +167,7 @@ def formatdate(df):
df['annotation_date'] = df['annotation_date'].dt.strftime('%Y-%m-%d')
return df


def paired_asm_comp(df):
"""Replacing columns values to required property value.
Args:
Expand All @@ -169,11 +176,11 @@ def paired_asm_comp(df):
df: dataframe with new columns values.
"""
df['paired_asm_comp'] = df['paired_asm_comp'].apply(lambda x: 'True'
if 'identical' in str(x)
else 'False')
df['paired_asm_comp'] = df['paired_asm_comp'].apply(
lambda x: 'True' if 'identical' in str(x) else 'False')
return df


def relation_to_type_material(df):
"""Replacing columns values to required property value.
Args:
Expand Down Expand Up @@ -205,6 +212,7 @@ def relation_to_type_material(df):
return df



def assembly_type(df):
"""Replacing columns values to required property value.
Args:
Expand Down Expand Up @@ -272,7 +280,7 @@ def set_flags():
'Output directory for generated files.')
flags.DEFINE_string('input_dir',
'scripts/input/assembly_summary_genbank.txt',
'Input directory where .txt files downloaded.')
'Input directory where .txt files downloaded.')
flags.DEFINE_string('input_dir1',
'scripts/input/assembly_summary_refseq.txt',
'Output directory for generated files.')
Expand Down Expand Up @@ -317,11 +325,10 @@ def main(_FLAGS):
df['gbrs_paired_asm'] = df['gbrs_paired_asm'].fillna('')

# Perform operations after replacing NaN
df.loc[~df['gbrs_paired_asm'].str.startswith('GC') &
df.loc[~df['gbrs_paired_asm'].str.startswith('GC') &
df['#assembly_accession'].isin(ref_gbrs_paired_asm),
'gbrs_paired_asm'] = df['#assembly_accession']


with open(tax_id_dcid_mapping, 'r') as file:
csv_reader = csv.DictReader(file)
for row in csv_reader:
Expand All @@ -332,7 +339,6 @@ def main(_FLAGS):
df.to_csv(file_output, index=False)



if __name__ == "__main__":
try:
set_flags() # Parse the flags
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,16 @@ def set_flag():
global _FLAGS
_FLAGS = flags.FLAGS
flags.DEFINE_string('output_dir', 'test_data/output_file/',
'Output directory for generated files.')
'Output directory for generated files.')
flags.DEFINE_string('input_dir',
'scripts/test_data/input/assembly_summary_genbank.txt',
'Input directory where .txt files downloaded.')
'scripts/test_data/input/assembly_summary_genbank.txt',
'Input directory where .txt files downloaded.')
flags.DEFINE_string('input_dir1',
'scripts/test_data/input/assembly_summary_refseq.txt',
'Output directory for generated files.')
'scripts/test_data/input/assembly_summary_refseq.txt',
'Output directory for generated files.')
flags.DEFINE_string('tax_id_dcid_mapping',
'scripts/test_data/input/tax_id_dcid_mapping.txt',
'Input directory where .txt files downloaded.')
'scripts/test_data/input/tax_id_dcid_mapping.txt',
'Input directory where .txt files downloaded.')
_FLAGS(sys.argv)

class TestSummaryReport(unittest.TestCase):
Expand All @@ -49,10 +49,10 @@ def setUpClass(self):

def test_csv_check(self):
main(_FLAGS)
test_output= os.path.join(MODULE_DIR, _FLAGS.output_dir)
test_output = os.path.join(MODULE_DIR, _FLAGS.output_dir)
same = filecmp.cmp(
os.path.join(test_output,'ncbi_assembly_summary.csv'), MODULE_DIR +
'/test_data/output_file/expected_ncbi_assembly_summary.csv')
os.path.join(test_output,'ncbi_assembly_summary.csv'), MODULE_DIR +
'/test_data/output_file/expected_ncbi_assembly_summary.csv')
self.assertTrue(same)


Expand Down

0 comments on commit aeb24a8

Please sign in to comment.