Format Correction

datacommonsorg · Jun 19, 2024 · aeb24a8
1 parent 3036fef
commit aeb24a8
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 18 deletions.
diff --git a/scripts/biomedical/NCBI_Assembly/Summary_Report/scripts/process.py b/scripts/biomedical/NCBI_Assembly/Summary_Report/scripts/process.py
@@ -30,6 +30,7 @@ def generate_column(df):
     df['gc_percent_name'] = 'Percent ' + df['gc_percent'].astype(str)
     return df
 
+
 def refseq_category(df):
     """Replacing columns values to required property value.
 	Args:
@@ -46,6 +47,7 @@ def refseq_category(df):
         conversion_to_refseq_category).fillna(df['refseq_category'])
     return df
 
+
 def tax_id(df):
     """Replacing taxid column value as required.
 	Args:
@@ -56,9 +58,10 @@ def tax_id(df):
 	"""
     df['taxid'] = df.apply(lambda row: '' if str(row['taxid']) == str(row[
         'species_taxid']) else TAX_ID_DCID_MAPPING.get(row['taxid'], ''),
-        axis=1)
+                            axis=1)
     return df
 
+
 def species_tax_id(df):
     """Replacing species_taxid column value with tax_id_dcid_mapping.txt.
 	Args:
@@ -71,6 +74,7 @@ def species_tax_id(df):
         df['species_taxid'])
     return df
 
+
 def infraspecific_name(df):
     """Replacing unwanted char from infraspecific_name and organism_name column value.
 	Args:
@@ -84,6 +88,7 @@ def infraspecific_name(df):
         ']', '')
     return df
 
+
 def format_correction(df):
     """Replacing unwanted char from infraspecific_name and isolate column value.
 	Args:
@@ -97,6 +102,7 @@ def format_correction(df):
     return df
 
 
+
 def assembly_level(df):
     """Replacing columns values to required property value.
 	Args:
@@ -161,6 +167,7 @@ def formatdate(df):
     df['annotation_date'] = df['annotation_date'].dt.strftime('%Y-%m-%d')
     return df
 
+
 def paired_asm_comp(df):
     """Replacing columns values to required property value.
 	Args:
@@ -169,11 +176,11 @@ def paired_asm_comp(df):
 		df: dataframe with new columns values.
 	
 	"""
-    df['paired_asm_comp'] = df['paired_asm_comp'].apply(lambda x: 'True'
-                                                        if 'identical' in str(x)
-                                                        else 'False')
+    df['paired_asm_comp'] = df['paired_asm_comp'].apply(
+        lambda x: 'True' if 'identical' in str(x) else 'False')
     return df
 
+
 def relation_to_type_material(df):
     """Replacing columns values to required property value.
 	Args:
@@ -205,6 +212,7 @@ def relation_to_type_material(df):
     return df
 
 
+
 def assembly_type(df):
     """Replacing columns values to required property value.
 	Args:
@@ -272,7 +280,7 @@ def set_flags():
                         'Output directory for generated files.')
     flags.DEFINE_string('input_dir',
                         'scripts/input/assembly_summary_genbank.txt',
-                        'Input directory where .txt files downloaded.')    
+                        'Input directory where .txt files downloaded.')
     flags.DEFINE_string('input_dir1',
                         'scripts/input/assembly_summary_refseq.txt',
                         'Output directory for generated files.')
@@ -317,11 +325,10 @@ def main(_FLAGS):
     df['gbrs_paired_asm'] = df['gbrs_paired_asm'].fillna('')
 
     # Perform operations after replacing NaN
-    df.loc[~df['gbrs_paired_asm'].str.startswith('GC') & 
+    df.loc[~df['gbrs_paired_asm'].str.startswith('GC') &
            df['#assembly_accession'].isin(ref_gbrs_paired_asm),
             'gbrs_paired_asm'] = df['#assembly_accession']
 
-
     with open(tax_id_dcid_mapping, 'r') as file:
         csv_reader = csv.DictReader(file)
         for row in csv_reader:
@@ -332,7 +339,6 @@ def main(_FLAGS):
     df.to_csv(file_output, index=False)
 
 
-
 if __name__ == "__main__":
     try:
         set_flags()  # Parse the flags

diff --git a/scripts/biomedical/NCBI_Assembly/Summary_Report/scripts/process_test.py b/scripts/biomedical/NCBI_Assembly/Summary_Report/scripts/process_test.py
@@ -29,16 +29,16 @@ def set_flag():
 	global _FLAGS
 	_FLAGS = flags.FLAGS
 	flags.DEFINE_string('output_dir', 'test_data/output_file/',
-                     'Output directory for generated files.')
+                        'Output directory for generated files.')
 	flags.DEFINE_string('input_dir',
-                     'scripts/test_data/input/assembly_summary_genbank.txt',
-                     'Input directory where .txt files downloaded.')	
+                        'scripts/test_data/input/assembly_summary_genbank.txt',
+                        'Input directory where .txt files downloaded.')	
 	flags.DEFINE_string('input_dir1',
-                     'scripts/test_data/input/assembly_summary_refseq.txt',
-                     'Output directory for generated files.')
+                        'scripts/test_data/input/assembly_summary_refseq.txt',
+                        'Output directory for generated files.')
 	flags.DEFINE_string('tax_id_dcid_mapping',
-                     'scripts/test_data/input/tax_id_dcid_mapping.txt',
-                     'Input directory where .txt files downloaded.')
+                        'scripts/test_data/input/tax_id_dcid_mapping.txt',
+                        'Input directory where .txt files downloaded.')
 	_FLAGS(sys.argv)
 
 class TestSummaryReport(unittest.TestCase):
@@ -49,10 +49,10 @@ def setUpClass(self):
 
     def test_csv_check(self):
         main(_FLAGS)
-        test_output= os.path.join(MODULE_DIR, _FLAGS.output_dir)
+        test_output = os.path.join(MODULE_DIR, _FLAGS.output_dir)
         same = filecmp.cmp(
-             os.path.join(test_output,'ncbi_assembly_summary.csv'), MODULE_DIR +
-             '/test_data/output_file/expected_ncbi_assembly_summary.csv')
+            os.path.join(test_output,'ncbi_assembly_summary.csv'), MODULE_DIR +
+            '/test_data/output_file/expected_ncbi_assembly_summary.csv')
         self.assertTrue(same)