[Refactor:Plagiarism] Refactor data.json (#76)
* progress

* Make the changes backwards-compatible
williamjallen authored Feb 28, 2022
1 parent ed20eb9 · commit 47a4e2d
Showing 2 changed files with 85 additions and 49 deletions.
37 changes: 24 additions & 13 deletions bin/tokenize_all.py
@@ -6,6 +6,7 @@
 import argparse
 import os
 import json
+import subprocess
 import humanize
 import datetime
 
@@ -19,28 +20,38 @@ def parse_args():
 def tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file):
     language = lichen_config_data["language"]
 
+    cli_args = list()
+    language_token_data = dict()
+
     data_json_path = "./data.json"  # data.json is in the Lichen/bin directory after install
     with open(data_json_path, 'r') as token_data_file:
-        token_data = json.load(token_data_file)
-        language_token_data = token_data[language]
+        data_file = json.load(token_data_file)
+        language_token_data = data_file[language]
+        if "arguments" in lichen_config_data.keys():  # For backwards compatibility - TODO: Remove
+            for argument in lichen_config_data["arguments"]:
+                if argument in language_token_data["command_args"]:
+                    cli_args.append(language_token_data["command_args"][argument]["argument"])
+                else:
+                    print(f"Error: Unknown tokenization argument {argument}")
+        else:  # Use the default arguments
+            for argument in language_token_data["command_args"]:
+                if "default" in language_token_data["command_args"][argument].keys() and\
+                        language_token_data["command_args"][argument]["default"]:
+                    cli_args.append(language_token_data["command_args"][argument]["argument"])
 
     tokenizer = f"./{language_token_data['tokenizer']}"
 
-    if language_token_data.get('input_as_argument') is not None and \
-       language_token_data['input_as_argument'] is not False:
-        my_concatenated_file = f'< {my_concatenated_file}'
+    result = subprocess.run([language_token_data['command_executable'],
+                             tokenizer, my_concatenated_file] + cli_args,
+                            stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE)
 
-    if "command_args" in language_token_data:
-        cli_args = " ".join(language_token_data["command_args"])
-    else:
-        cli_args = ""
+    stderr = result.stderr.decode('utf-8')
+    if stderr is not None and stderr != '' and not stderr.isspace():
+        print(stderr)
 
-    command = f"{language_token_data['command_executable']} {tokenizer} "\
-              f"{cli_args} {my_concatenated_file} > {my_tokenized_file}".strip()
-
-    os.system(command)
+    with open(my_tokenized_file, 'w') as file:
+        file.write(result.stdout.decode('utf-8'))
 
 
 def main():
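Taken together, this hunk replaces the old shell pipeline (os.system with < input redirection and > output redirection) with a direct subprocess.run call: tokenizer flags are looked up in the structured command_args objects of data.json, either from the legacy "arguments" list in the Lichen config or, failing that, from each argument's "default" flag. A minimal sketch, not part of the commit, of how the default path resolves flags under the new schema (the lichen_config_data value is illustrative):

    import json

    lichen_config_data = {"language": "plaintext"}  # illustrative: no legacy "arguments" key

    with open("./data.json", 'r') as token_data_file:
        language_token_data = json.load(token_data_file)[lichen_config_data["language"]]

    # Mirror the else-branch of tokenize(): keep every command_args entry
    # whose "default" flag is present and true.
    cli_args = [
        spec["argument"]
        for spec in language_token_data.get("command_args", {}).values()
        if spec.get("default", False)
    ]

    print(cli_args)  # for plaintext this prints ['--ignore_newlines']

Because only ignore_newlines carries "default": true, the resolved flags match the old hard-coded "command_args": ["--ignore_newlines"] list, which is what keeps configs without an "arguments" key backwards-compatible.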
97 changes: 61 additions & 36 deletions tokenizer/data.json
@@ -1,38 +1,63 @@
 {
-    "plaintext": {
-        "tokenizer": "plaintext_tokenizer.py",
-        "command_executable": "python3",
-        "input_as_argument": false,
-        "command_args": [
-            "--ignore_newlines"
-        ],
-        "token_value": "value"
-    },
-    "python": {
-        "tokenizer": "python_tokenizer.py",
-        "command_executable": "python3",
-        "input_as_argument": true,
-        "token_value": "type"
-    },
-    "cpp": {
-        "tokenizer": "c_tokenizer.py",
-        "command_executable": "python3",
-        "input_as_argument": false,
-        "command_args": [
-            "--ignore_comments"
-        ],
-        "token_value": "type"
-    },
-    "java": {
-        "tokenizer": "java_tokenizer.py",
-        "command_executable": "python3",
-        "input_as_argument": true,
-        "token_value": "type"
-    },
-    "mips": {
-        "tokenizer": "mips_tokenizer.py",
-        "command_executable": "python3",
-        "input_as_argument": true,
-        "token_value": "type"
-    }
+    "plaintext": {
+        "name": "Plain Text",
+        "tokenizer": "plaintext_tokenizer.py",
+        "command_executable": "python3",
+        "command_args": {
+            "ignore_punctuation": {
+                "name": "Ignore punctuation",
+                "argument": "--ignore_punctuation"
+            },
+            "to_lower": {
+                "name": "Convert to lower case",
+                "argument": "--to_lower"
+            },
+            "ignore_numbers": {
+                "name": "Ignore numbers",
+                "argument": "--ignore_numbers"
+            },
+            "ignore_newlines": {
+                "name": "Ignore newlines",
+                "argument": "--ignore_newlines",
+                "default": true
+            }
+        },
+        "token_value": "value",
+        "default_hash_size": 14
+    },
+    "python": {
+        "name": "Python",
+        "tokenizer": "python_tokenizer.py",
+        "command_executable": "python3",
+        "token_value": "type",
+        "default_hash_size": 14
+    },
+    "cpp": {
+        "name": "C/C++",
+        "tokenizer": "c_tokenizer.py",
+        "command_executable": "python3",
+        "command_args": {
+            "ignore_comments": {
+                "name": "Ignore comments",
+                "argument": "--ignore_comments",
+                "default": true
+            }
+        },
+        "token_value": "type",
+        "default_hash_size": 14
+    },
+    "java": {
+        "name": "Java",
+        "tokenizer": "java_tokenizer.py",
+        "command_executable": "python3",
+        "token_value": "type",
+        "default_hash_size": 14
+    },
+    "mips": {
+        "name": "MIPS Assembly",
+        "tokenizer": "mips_tokenizer.py",
+        "command_executable": "python3",
+        "token_value": "type",
+        "default_hash_size": 5
+    }
 }
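Each language entry in the refactored schema now carries a display "name", an optional "command_args" map (argument id mapped to a display name, the literal CLI flag, and an optional "default"), the existing "token_value", and a per-language "default_hash_size". A hedged sketch, not part of the commit, of how a caller might enumerate the options the new schema exposes (the output format is illustrative):

    import json

    with open("tokenizer/data.json", 'r') as f:
        data = json.load(f)

    for lang_id, lang in data.items():
        # "name" and "default_hash_size" exist for every language in the new schema.
        print(f"{lang['name']} ({lang_id}): default hash size {lang['default_hash_size']}")
        for spec in lang.get("command_args", {}).values():
            default = " [default]" if spec.get("default", False) else ""
            print(f"  {spec['name']}: {spec['argument']}{default}")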
