Skip to content

Commit

Permalink
fix: add script to find missing encodings
Browse files Browse the repository at this point in the history
  • Loading branch information
cindyli committed Sep 22, 2023
1 parent 1208ffa commit d9f0557
Show file tree
Hide file tree
Showing 3 changed files with 331 additions and 0 deletions.
162 changes: 162 additions & 0 deletions data/intermediate_BMW_conversion_data/missing_encodings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
{
"back (adv.)": {
"encoding": [
"RETURN",
"ADVERB"
],
"bci-av-id": 12613
},
"love": {
"encoding": [
"LOVE",
"VERB"
],
"bci-av-id": 15400
},
"third (part)": {
"encoding": [
"NUMBER",
"PART",
"EYE"
],
"bci-av-id": null
},
"quarter (part)": {
"encoding": [
"NUMBER",
"PART",
"REPEAT"
],
"bci-av-id": 24932
},
"fifth (part)": {
"encoding": [
"NUMBER",
"PART",
"WATER"
],
"bci-av-id": null
},
"hand (give)": {
"encoding": [
"GIVE",
"TOOL",
"VERB"
],
"bci-av-id": 14667
},
"put (verb)": {
"encoding": [
"PUT",
"VERB"
],
"bci-av-id": 16440
},
"there (adv.)": {
"encoding": [
"ADVERB"
],
"bci-av-id": 17707
},
"present (noun)": {
"encoding": [
"PRESENT",
"NOUN"
],
"bci-av-id": 14437
},
"falls (noun)": {
"encoding": [
"YEAR",
"EYE",
"MANY"
],
"bci-av-id": [24894, ";", 9011]
},
"bit (noun)": {
"encoding": [
"PART",
"BUT",
"NOUN"
],
"bci-av-id": 12847
},
"so (adv.)": {
"encoding": [
"CAUSE",
"ADVERB"
],
"bci-av-id": 17708
},
"drink (noun)": {
"encoding": [
"WATER",
"FOOD",
"NOUN"
],
"bci-av-id": 13881
},
"type (noun)": {
"encoding": [
"THINK",
"PART",
"NOUN"
],
"bci-av-id": 17961
},
"hurt (present verb)": {
"encoding": [
"WANT",
"CAUSE",
"VERB"
],
"bci-av-id": 14913
},
"opening (noun)": {
"encoding": [
"OPEN",
"NOUN"
],
"bci-av-id": 15923
},
"reading (noun)": {
"encoding": [
"EYE",
"POEM",
"NOUN"
],
"bci-av-id": 24933
},
"read (present verb)": {
"encoding": [
"EYE",
"POEM",
"VERB"
],
"bci-av-id": 16465
},
"let (present verb)": {
"encoding": [
"FORGIVE",
"LOVE",
"VERB"
],
"bci-av-id": 15196
},
"work (noun)": {
"encoding": [
"FINISH",
"WANT",
"NOUN"
],
"bci-av-id": 18273
},
"answer (noun)": {
"encoding": [
"FINISH",
"WHAT",
"NOUN"
],
"bci-av-id": 12393
}
}
6 changes: 6 additions & 0 deletions docs/ConvertBMWToJSON.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,9 @@ python find_null_ids.py ../data/bmw.json
```
python populate_encoding_symbols.py ../data/bmw.json ../data/bliss_symbol_explanations.json ../data/bmw-new.json
```

### Find missing encodings from BMW text file directory

```
python find_missing_encodings.py ../data/bmw.json ../data/intermediate_BMW_conversion_data/bmw_texts/ ../data/bliss_symbol_explanations.json missing_encodings.json error.txt
```
163 changes: 163 additions & 0 deletions utils/find_missing_encodings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
'''
python find_missing_encodings.py ../data/bmw.json ../data/intermediate_BMW_conversion_data/bmw_texts/ ../data/bliss_symbol_explanations.json missing_encodings.json error.txt
'''

import sys
import os
import json


# Joins elements starting at the position `start_pos` to `end_pos` in the `in_array`
# by space. If the joined text matches any value in multiple_words_icons, replace
# the element at the `start_pos` with the joined text. All elements after the `end_pos`
# which be moved forward to be at the `start_pos+1`
# Example:
# Running process_multiple_words_icon(["a", "ABS", "TIME", "b", "c"], 1, 2, ["ABS TIME"])
# outputs ["a", "ABS TIME", "b", "c"]
def process_multiple_words_icon(in_array, start_pos, end_pos, multiple_words_icons):
joined_text = " ".join(in_array[start_pos:end_pos+1])
if joined_text in multiple_words_icons:
return in_array[0:start_pos] + [joined_text] + in_array[end_pos+1:]
else:
return in_array


# The first a few columns have texts in upper case. The next
# one or more columns have texts in lower case. These lower
# case texts should be joined by space to compose the target
# word meaning of the encoding of Bliss symbos in the first
# a few columns.
def process_raw_encoding(line, multiple_words_icons):
columns_in = line.split()
columns_out = []
for index, text in enumerate(columns_in):
if text.isupper():
columns_out.append(text)
elif text in ["*", "|"]: # skip special characters
continue
else:
columns_out.append(' '.join(columns_in[index:]))
break

# process icons composed by multiple words
columns_out = process_multiple_words_icon(columns_out, 0, 1, multiple_words_icons)
columns_out = process_multiple_words_icon(columns_out, 1, 2, multiple_words_icons)
columns_out = process_multiple_words_icon(columns_out, 2, 3, multiple_words_icons)
return columns_out


# Find the Bliss id for the given text:
# 1. Search through the given map first. If found, return id. Otherwise, continue the next step;
# 2. Search through the explanation JSON file. If found, return id. Otherwise, return None.
def get_bliss_id(text, explanation_json_data):
text = text.lower()

for item in explanation_json_data:
defs = item["description"].lower().split(',')

# Skip old descriptions
if defs[-1].endswith("_(OLD)"):
continue
# The suffix "-(to)" indicates a verb. It's not part of the definition
if defs[-1].endswith("-(to)"):
defs[-1] = defs[-1][:-6]

if text in defs:
return item["id"]

return None


# Provide the directory path as a parameter when running the script
source_bmw_path = sys.argv[1]
source_txt_path = sys.argv[2]
bliss_explanation_json_location = sys.argv[3]
output_json_location = sys.argv[4]
output_error_location = sys.argv[5]

missing_encodings = {}
error_rows = []

# load bliss translation json file
with open(bliss_explanation_json_location, 'r') as file:
bliss_explanation_json = json.load(file)

# load bmw.json
with open(source_bmw_path, 'r') as file:
bmw_json = json.load(file)
existing_encodings = bmw_json["encodings"]

missing_bliss_id_texts = set()

multiple_words_icons = ["NOUN PL.", "ABS TIME", "N PERSON", "N.PER PL"]

# Iterate over all text files in the directory
for filename in os.listdir(source_txt_path):
if filename.endswith('.txt'):
txt_path = os.path.join(source_txt_path, filename)

with open(txt_path, 'r') as file:
for line_number, line in enumerate(file, start=1):
line = line.strip()
if not line:
# skip empty lines
continue

columns = process_raw_encoding(line, multiple_words_icons)
if len(columns) < 2 or len(columns) > 4:
print(f"Error: line {line_number} in {filename} with content '{line}' - Row should have 2 to 4 columns.")
error_rows.append(line)
continue

one_encoding = {}
encoding_ids = []
message = ""
has_error = False
message_found = False
for text in columns:
if text.isupper():
# the encoding of bliss symbol texts should be in upper case
encoding_ids.append(text)
bliss_id = get_bliss_id(text, bliss_explanation_json)
if bliss_id is None:
# report the error when a BCI-AV-ID is not found
print(f"Error: line {line_number} in {filename} with content '{line}' - The Bliss id for '{text}' is not found.")
has_error = True
missing_bliss_id_texts.add(text)
else:
# the lower case text is the target message delivered by the Bliss symbol encoding
message = text
message_found = True

if not message_found:
# error occurs when a line only has upper case letters. The expected format should have the target message
# in lower case. When it's not found, report an error.
print(f"Error: line {line_number} in {filename} with content '{line}' - The target message is not found in this line.")
has_error = True

if has_error:
error_rows.append(line)
else:
if message in existing_encodings and existing_encodings[message]["encoding"] != encoding_ids or message not in existing_encodings:
missing_encodings[message] = {}
missing_encodings[message]["encoding"] = encoding_ids
bliss_id = get_bliss_id(message, bliss_explanation_json)
missing_encodings[message]["bci-av-id"] = int(bliss_id) if isinstance(bliss_id, str) else bliss_id

# Write the JSON into a file
with open(output_json_location, "w") as json_file:
json_file.write(json.dumps(missing_encodings, indent=4))
print(f"The final JSON is written into {output_json_location}")

# Write rows with errors into the given error file
if len(error_rows) > 0:
with open(output_error_location, "w") as error_file:
for error_row in error_rows:
error_file.write(error_row + "\n")
print(f"The error rows are written into {output_error_location}")
else:
print("No error detected.")

# Report icons whose corresponding Bliss IDs are not found
if len(missing_bliss_id_texts) > 0:
print(f"Missing Bliss IDs for these texts: {missing_bliss_id_texts}")

0 comments on commit d9f0557

Please sign in to comment.