Skip to content

Commit

Permalink
Added a Taxon node to the gene info output file
Browse files Browse the repository at this point in the history
  • Loading branch information
krishnaswamypradeep committed Oct 7, 2024
1 parent a33a8de commit 3c7b2e7
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 55 deletions.
57 changes: 3 additions & 54 deletions scripts/biomedical/NCBI_Gene/scripts/format_ncbi_gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@

GENE_INFO_DICT = {
'taxID': '',
'taxID_dcid': '',
'GeneID': '',
'dcid': '',
'Symbol': '',
Expand Down Expand Up @@ -556,9 +557,6 @@ def process_gene_info(self, file_to_process: str, return_dict) -> None:
counters.add_counter('total',
file_util.file_estimate_num_rows(input_file))
with open(input_file, 'r') as source_file:
log_cnt = 0

# reader = csv.reader(source_file)
for line in source_file:
line = line.replace('\\\\', ' ').replace('\\', ' ')
# skip row
Expand All @@ -576,12 +574,6 @@ def process_gene_info(self, file_to_process: str, return_dict) -> None:
unique_dbXrefs_list, unique_dbXrefs, input_row)
if row:
writer_gene.writerow(row)
if ',' in line:
if log_cnt < 5:
logging.info(
f'Row {input_row} \n {row} \n {TAX_ID_DCID_MAPPING[input_row[0]]} '
)
log_cnt += 1
counters.add_counter('processed', 1)

logging.info(
Expand Down Expand Up @@ -611,7 +603,8 @@ def parse_gene_info_row(self, taxID, gene_id_dcid_mapping,
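dict: the gene-info row, built from a copy of GENE_INFO_DICT and filled in from input_row; the caller only writes it out when it is truthy.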
"""
row = deepcopy(GENE_INFO_DICT)
row['taxID'] = taxID
row['taxID'] = input_row[0]
row['taxID_dcid'] = taxID
row['GeneID'] = input_row[1]
dcid = f"bio/{input_row[1]}" if input_row[
0] != '9606' else f"bio/ncbi_{input_row[1]}"
Expand Down Expand Up @@ -859,7 +852,6 @@ def process_csv_file(self, file_to_process: str) -> None:
file_util.file_estimate_num_rows(input_file))

with open(input_file, 'r') as source_file:
log_cnt = 0
for line in source_file:
# skip row
if line[0] == '#':
Expand All @@ -873,12 +865,6 @@ def process_csv_file(self, file_to_process: str) -> None:

if row:
writer_gene.writerow(row)
if ',' in line:
if log_cnt < 5:
logging.info(
f'Row {input_row} \n {row} \n {GENE_ID_DCID_MAPPING[input_row[1]]} '
)
log_cnt += 1
else:
logging.info(
f"Missing values to form dcid {input_row[2]} {input_row[4]} {input_row[5]} in file` {input_file}"
Expand Down Expand Up @@ -1093,7 +1079,6 @@ def process_csv_file(self) -> None:
file_util.file_estimate_num_rows(input_file))

with open(input_file, 'r') as source_file:
log_cnt = 0
for line in source_file:
# skip row
if line[0] == '#':
Expand All @@ -1106,12 +1091,6 @@ def process_csv_file(self) -> None:
GENE_ID_DCID_MAPPING[input_row[1]], input_row)
if row:
writer_gene.writerow(row)
if ',' in line:
if log_cnt < 5:
logging.info(
f'Row {input_row} \n {row} \n {GENE_ID_DCID_MAPPING[input_row[1]]} '
)
log_cnt += 1
counters.add_counter('processed', 1)

def parse_gene_mim2gene_row(self, dcid, input_row):
Expand Down Expand Up @@ -1183,8 +1162,6 @@ def process_csv_file(self, file_to_process: str) -> None:
file_util.file_estimate_num_rows(input_file))

with open(input_file, 'r') as source_file:
log_cnt = 0

for line in source_file:
# skip row
if line[0] == '#':
Expand All @@ -1197,12 +1174,6 @@ def process_csv_file(self, file_to_process: str) -> None:
GENE_ID_DCID_MAPPING[input_row[1]], input_row)
if row:
writer_gene.writerow(row)
if ',' in line:
if log_cnt < 5:
logging.info(
f'Row {input_row} \n {row} \n {GENE_ID_DCID_MAPPING[input_row[1]]} '
)
log_cnt += 1
counters.add_counter('processed', 1)

def parse_gene_gene2go_row(self, dcid, input_row):
Expand Down Expand Up @@ -1271,8 +1242,6 @@ def process_csv_file(self, file_to_process: str) -> None:
file_util.file_estimate_num_rows(input_file))

with open(input_file, 'r') as source_file:
log_cnt = 0

for line in source_file:
# skip row
if line[0] == '#':
Expand All @@ -1292,12 +1261,6 @@ def process_csv_file(self, file_to_process: str) -> None:

if row:
writer_gene.writerow(row)
if ',' in line:
if log_cnt < 5:
logging.info(
f'Row {input_row} \n {row} \n {GENE_ID_DCID_MAPPING[input_row[1]]} '
)
log_cnt += 1

if len(input_row[3]) > 1:
row_rna = deepcopy(GENE_ACCESSION_RNA_DICT)
Expand Down Expand Up @@ -1408,12 +1371,6 @@ def process_csv_file(self) -> None:
GENE_ID_DCID_MAPPING[input_row[1]], input_row)
if row:
writer_gene.writerow(row)
if ',' in line:
if log_cnt < 5:
logging.info(
f'Row {input_row} \n {row} \n {GENE_ID_DCID_MAPPING[input_row[1]]} '
)
log_cnt += 1
counters.add_counter('processed', 1)

def parse_gene_gene2ensembl_row(self, dcid, input_row):
Expand Down Expand Up @@ -1470,8 +1427,6 @@ def process_csv_file(self) -> None:
file_util.file_estimate_num_rows(input_file))

with open(input_file, 'r') as source_file:
log_cnt = 0

for line in source_file:
# skip row
if line[0] == '#':
Expand All @@ -1485,12 +1440,6 @@ def process_csv_file(self) -> None:
GENE_ID_DCID_MAPPING[input_row[1]], input_row)
if row:
writer_gene.writerow(row)
if ',' in line:
if log_cnt < 5:
logging.info(
f'Row {input_row} \n {row} \n {GENE_ID_DCID_MAPPING[input_row[1]]} '
)
log_cnt += 1
counters.add_counter('processed', 1)

def parse_gene_generifs_row(self, dcid, input_row):
Expand Down
2 changes: 2 additions & 0 deletions scripts/biomedical/NCBI_Gene/scripts/format_ncbi_gene_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ def test_check_gene_info_parser(self):
]
expected_result = {
'taxID':
'2010893',
'taxID_dcid':
'dcid:bio/LobeliaMildbraedii',
'GeneID':
'33370007',
Expand Down
8 changes: 7 additions & 1 deletion scripts/biomedical/NCBI_Gene/tMCFs/ncbi_gene_gene.tmcf
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
Node: E:ncbi_gene_gene->E1
typeOf: dcs:Taxon
dcid: C:ncbi_gene_gene->taxID_dcid
ncbiTaxId: C:ncbi_gene_gene->taxID


Node: E:ncbi_gene_gene->E2
typeOf: dcs:Gene
dcid: C:ncbi_gene_gene->dcid
ofSpecies: C:ncbi_gene_gene->taxID
ofSpecies: E:ncbi_gene_gene->E1
geneID: C:ncbi_gene_gene->GeneID
name: C:ncbi_gene_gene->Symbol
synonym: C:ncbi_gene_gene->synonym
Expand Down

0 comments on commit 3c7b2e7

Please sign in to comment.