Skip to content

Commit

Permalink
introduces QC for table tabs; adds another manual exception
Browse files Browse the repository at this point in the history
  • Loading branch information
rgiessmann committed Jan 29, 2024
1 parent 8b3d1e1 commit 2c40b6c
Showing 1 changed file with 11 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
(2, 558, 2),
(2, 560, 2),
(2, 566, 2),
(3, 1041, 1),
]
if which in MANUALLY_EXCLUDED_COLUMNS:
continue
Expand Down Expand Up @@ -181,12 +182,17 @@

## add manually extracted table codes
manual_table_codes = pandas.read_csv("openTECR recuration - table codes.csv")
# QC: each table position (part, page, column side, table index from top) may be listed
# only once in the manually curated sheet, because the sheet is merged on these columns.
_duplicated_manual_table_codes = manual_table_codes[
    manual_table_codes.duplicated(["part", "page", "col l/r", "table from top"])
]
if not _duplicated_manual_table_codes.empty:
    # Raise explicitly rather than `assert cond, print(...)`: print() returns None, so the
    # AssertionError carried no message, and assert statements are removed under `python -O`.
    raise AssertionError(
        "Duplicated table positions in 'openTECR recuration - table codes.csv':\n"
        f"{_duplicated_manual_table_codes}"
    )
# Partition the tables: those that already carry a table code (from Noor) and those
# that still need one from the manually annotated sheet.
_table_key = ["part", "page", "col l/r", "table from top"]
manual_table_codes = manual_table_codes.drop(columns=["reference", "description"])
_has_table_code = tmp.table_code.notna()
tmp_with_table_codes = tmp[_has_table_code]
# The empty table_code column is dropped so the merge can supply it from the manual sheet.
tmp_without_table_codes = tmp[~_has_table_code].drop(columns="table_code")
tmp_without_table_codes_try_to_add_manual_ones = tmp_without_table_codes.merge(
    manual_table_codes, how="left", on=_table_key
)

# Stack both partitions back into a single frame with a fresh index.
new = pandas.concat(
    [tmp_with_table_codes, tmp_without_table_codes_try_to_add_manual_ones],
    ignore_index=True,
)
## keep only the first row per table position (part, page, column side, table from top)
new = new.drop_duplicates(subset=_table_key)
Expand Down Expand Up @@ -219,6 +225,10 @@
## export tables which need to have their comment extracted
selector = []
tables_with_comments = pandas.read_csv("openTECR recuration - table metadata.csv")
## QC: each table position (part, page, column side, table index from top) may be listed
## only once in the metadata sheet.
_duplicated_tables_with_comments = tables_with_comments[
    tables_with_comments.duplicated(["part", "page", "col l/r", "table from top"])
]
if not _duplicated_tables_with_comments.empty:
    # Raise explicitly rather than `assert cond, print(...)`: print() returns None, so the
    # AssertionError carried no message, and assert statements are removed under `python -O`.
    raise AssertionError(
        "Duplicated table positions in 'openTECR recuration - table metadata.csv':\n"
        f"{_duplicated_tables_with_comments}"
    )
## extract only the tables which are not mentioned in the spreadsheet yet
for i, s in new.iterrows():
if len(tables_with_comments[(tables_with_comments.part == s.part) &
(tables_with_comments.page == s.page) &
Expand Down

0 comments on commit 2c40b6c

Please sign in to comment.