From 7ba0d8ddcfe17c6b47e8d3c91176dc87173faf0d Mon Sep 17 00:00:00 2001
From: William Allen <16820599+williamjallen@users.noreply.github.com>
Date: Mon, 22 Nov 2021 21:38:13 -0500
Subject: [PATCH] [Feature:Plagiarism] Add flag to ignore C++ comments (#69)

---
 bin/tokenize_all.py                           |   3 +-
 .../tokenizer/c/expected_output/output.json   | 102 +++--
 .../output_ignore_comments.json               | 404 ++++++++++++++++++
 tests/data/tokenizer/c/input.cpp              |  15 +-
 tests/unittest/tests.py                       |  26 ++
 tokenizer/c/c_tokenizer.py                    |  71 +--
 tokenizer/data.json                           |   5 +-
 7 files changed, 559 insertions(+), 67 deletions(-)
 create mode 100644 tests/data/tokenizer/c/expected_output/output_ignore_comments.json

diff --git a/bin/tokenize_all.py b/bin/tokenize_all.py
index 1a88445..b4304c8 100644
--- a/bin/tokenize_all.py
+++ b/bin/tokenize_all.py
@@ -27,7 +27,8 @@ def tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file):
 
     tokenizer = f"./{language_token_data['tokenizer']}"
 
-    if not language_token_data.get("input_as_argument"):
+    if language_token_data.get('input_as_argument') is None or \
+            language_token_data['input_as_argument'] is False:
         my_concatenated_file = f'< {my_concatenated_file}'
 
     if "command_args" in language_token_data:
diff --git a/tests/data/tokenizer/c/expected_output/output.json b/tests/data/tokenizer/c/expected_output/output.json
index 38f5f36..a0a5020 100644
--- a/tests/data/tokenizer/c/expected_output/output.json
+++ b/tests/data/tokenizer/c/expected_output/output.json
@@ -107,6 +107,12 @@
         "type": "PUNCTUATION-;",
         "value": ";"
     },
+    {
+        "char": 21,
+        "line": 6,
+        "type": "COMMENT",
+        "value": "// define a variable"
+    },
     {
         "char": 5,
         "line": 7,
@@ -149,6 +155,12 @@
         "type": "PUNCTUATION-;",
         "value": ";"
     },
+    {
+        "char": 39,
+        "line": 7,
+        "type": "COMMENT",
+        "value": "// define a variable and set it equal to 1"
+    },
     {
         "char": 5,
         "line": 9,
@@ -173,6 +185,12 @@
         "type": "PUNCTUATION-;",
         "value": ";"
     },
+    {
+        "char": 43,
+        "line": 9,
+        "type": "COMMENT",
+        "value": "// print something"
+    },
     {
         "char": 5,
         "line": 10,
@@ -200,204 +218,222 @@
     {
         "char": 5,
         "line": 12,
+        "type": "COMMENT",
+        "value": "// loop from 1 to n and multiply the previous result by i"
+    },
+    {
+        "char": 5,
+        "line": 13,
         "type": "KEYWORD",
         "value": "for"
     },
     {
         "char": 8,
-        "line": 12,
+        "line": 13,
         "type": "PUNCTUATION-(",
         "value": "("
     },
     {
         "char": 9,
-        "line": 12,
+        "line": 13,
         "type": "KEYWORD",
         "value": "int"
     },
     {
         "char": 13,
-        "line": 12,
+        "line": 13,
         "type": "IDENTIFIER",
         "value": "i"
     },
     {
         "char": 15,
-        "line": 12,
+        "line": 13,
         "type": "PUNCTUATION-=",
         "value": "="
     },
     {
         "char": 17,
-        "line": 12,
+        "line": 13,
         "type": "LITERAL",
         "value": "1"
     },
     {
         "char": 18,
-        "line": 12,
+        "line": 13,
         "type": "PUNCTUATION-;",
         "value": ";"
     },
     {
         "char": 20,
-        "line": 12,
+        "line": 13,
         "type": "IDENTIFIER",
         "value": "i"
     },
     {
         "char": 22,
-        "line": 12,
+        "line": 13,
         "type": "PUNCTUATION-<=",
         "value": "<="
     },
     {
         "char": 24,
-        "line": 12,
+        "line": 13,
         "type": "IDENTIFIER",
         "value": "n"
     },
     {
         "char": 25,
-        "line": 12,
+        "line": 13,
         "type": "PUNCTUATION-;",
         "value": ";"
     },
     {
         "char": 27,
-        "line": 12,
+        "line": 13,
         "type": "PUNCTUATION-++",
         "value": "++"
     },
     {
         "char": 29,
-        "line": 12,
+        "line": 13,
         "type": "IDENTIFIER",
         "value": "i"
     },
     {
         "char": 30,
-        "line": 12,
+        "line": 13,
         "type": "PUNCTUATION-)",
         "value": ")"
     },
     {
         "char": 5,
-        "line": 13,
+        "line": 14,
         "type": "PUNCTUATION-{",
         "value": "{"
     },
     {
         "char": 9,
-        "line": 14,
+        "line": 15,
         "type": "IDENTIFIER",
         "value": "factorial"
     },
     {
         "char": 19,
-        "line": 14,
+        "line": 15,
         "type": "PUNCTUATION-*=",
"value": "*=" }, { "char": 22, - "line": 14, + "line": 15, "type": "IDENTIFIER", "value": "i" }, { "char": 23, - "line": 14, + "line": 15, "type": "PUNCTUATION-;", "value": ";" }, + { + "char": 9, + "line": 16, + "type": "COMMENT", + "value": "/*\n factorial += i; // this doesn't work\n factorial -= i; // this doesn't work either\n */" + }, { "char": 5, - "line": 15, + "line": 20, "type": "PUNCTUATION-}", "value": "}" }, { "char": 5, - "line": 17, + "line": 22, "type": "IDENTIFIER", "value": "cout" }, { "char": 10, - "line": 17, + "line": 22, "type": "PUNCTUATION-<<", "value": "<<" }, { "char": 13, - "line": 17, + "line": 22, "type": "LITERAL", "value": "\"Factorial of \"" }, { "char": 29, - "line": 17, + "line": 22, "type": "PUNCTUATION-<<", "value": "<<" }, { "char": 32, - "line": 17, + "line": 22, "type": "IDENTIFIER", "value": "n" }, { "char": 34, - "line": 17, + "line": 22, "type": "PUNCTUATION-<<", "value": "<<" }, { "char": 37, - "line": 17, + "line": 22, "type": "LITERAL", "value": "\" = \"" }, { "char": 43, - "line": 17, + "line": 22, "type": "PUNCTUATION-<<", "value": "<<" }, { "char": 46, - "line": 17, + "line": 22, "type": "IDENTIFIER", "value": "factorial" }, { "char": 55, - "line": 17, + "line": 22, "type": "PUNCTUATION-;", "value": ";" }, + { + "char": 57, + "line": 22, + "type": "COMMENT", + "value": "// print the result" + }, { "char": 5, - "line": 18, + "line": 23, "type": "KEYWORD", "value": "return" }, { "char": 12, - "line": 18, + "line": 23, "type": "LITERAL", "value": "0" }, { "char": 13, - "line": 18, + "line": 23, "type": "PUNCTUATION-;", "value": ";" }, { "char": 1, - "line": 19, + "line": 24, "type": "PUNCTUATION-}", "value": "}" } diff --git a/tests/data/tokenizer/c/expected_output/output_ignore_comments.json b/tests/data/tokenizer/c/expected_output/output_ignore_comments.json new file mode 100644 index 0000000..f8e86a8 --- /dev/null +++ b/tests/data/tokenizer/c/expected_output/output_ignore_comments.json @@ -0,0 +1,404 @@ +[ + { + "char": 1, + "line": 1, + "type": "PUNCTUATION-#", + "value": "#" + }, + { + "char": 2, + "line": 1, + "type": "IDENTIFIER", + "value": "include" + }, + { + "char": 10, + "line": 1, + "type": "PUNCTUATION-<", + "value": "<" + }, + { + "char": 11, + "line": 1, + "type": "IDENTIFIER", + "value": "iostream" + }, + { + "char": 19, + "line": 1, + "type": "PUNCTUATION->", + "value": ">" + }, + { + "char": 1, + "line": 2, + "type": "KEYWORD", + "value": "using" + }, + { + "char": 7, + "line": 2, + "type": "KEYWORD", + "value": "namespace" + }, + { + "char": 17, + "line": 2, + "type": "IDENTIFIER", + "value": "std" + }, + { + "char": 20, + "line": 2, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 1, + "line": 4, + "type": "KEYWORD", + "value": "int" + }, + { + "char": 5, + "line": 4, + "type": "IDENTIFIER", + "value": "main" + }, + { + "char": 9, + "line": 4, + "type": "PUNCTUATION-(", + "value": "(" + }, + { + "char": 10, + "line": 4, + "type": "PUNCTUATION-)", + "value": ")" + }, + { + "char": 1, + "line": 5, + "type": "PUNCTUATION-{", + "value": "{" + }, + { + "char": 5, + "line": 6, + "type": "KEYWORD", + "value": "unsigned" + }, + { + "char": 14, + "line": 6, + "type": "KEYWORD", + "value": "int" + }, + { + "char": 18, + "line": 6, + "type": "IDENTIFIER", + "value": "n" + }, + { + "char": 19, + "line": 6, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 5, + "line": 7, + "type": "KEYWORD", + "value": "unsigned" + }, + { + "char": 14, + "line": 7, + "type": "KEYWORD", + "value": "long" + }, + { + "char": 19, + 
"line": 7, + "type": "KEYWORD", + "value": "long" + }, + { + "char": 24, + "line": 7, + "type": "IDENTIFIER", + "value": "factorial" + }, + { + "char": 34, + "line": 7, + "type": "PUNCTUATION-=", + "value": "=" + }, + { + "char": 36, + "line": 7, + "type": "LITERAL", + "value": "1" + }, + { + "char": 37, + "line": 7, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 5, + "line": 9, + "type": "IDENTIFIER", + "value": "cout" + }, + { + "char": 10, + "line": 9, + "type": "PUNCTUATION-<<", + "value": "<<" + }, + { + "char": 13, + "line": 9, + "type": "LITERAL", + "value": "\"Enter a positive integer: \"" + }, + { + "char": 41, + "line": 9, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 5, + "line": 10, + "type": "IDENTIFIER", + "value": "cin" + }, + { + "char": 9, + "line": 10, + "type": "PUNCTUATION->>", + "value": ">>" + }, + { + "char": 12, + "line": 10, + "type": "IDENTIFIER", + "value": "n" + }, + { + "char": 13, + "line": 10, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 5, + "line": 13, + "type": "KEYWORD", + "value": "for" + }, + { + "char": 8, + "line": 13, + "type": "PUNCTUATION-(", + "value": "(" + }, + { + "char": 9, + "line": 13, + "type": "KEYWORD", + "value": "int" + }, + { + "char": 13, + "line": 13, + "type": "IDENTIFIER", + "value": "i" + }, + { + "char": 15, + "line": 13, + "type": "PUNCTUATION-=", + "value": "=" + }, + { + "char": 17, + "line": 13, + "type": "LITERAL", + "value": "1" + }, + { + "char": 18, + "line": 13, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 20, + "line": 13, + "type": "IDENTIFIER", + "value": "i" + }, + { + "char": 22, + "line": 13, + "type": "PUNCTUATION-<=", + "value": "<=" + }, + { + "char": 24, + "line": 13, + "type": "IDENTIFIER", + "value": "n" + }, + { + "char": 25, + "line": 13, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 27, + "line": 13, + "type": "PUNCTUATION-++", + "value": "++" + }, + { + "char": 29, + "line": 13, + "type": "IDENTIFIER", + "value": "i" + }, + { + "char": 30, + "line": 13, + "type": "PUNCTUATION-)", + "value": ")" + }, + { + "char": 5, + "line": 14, + "type": "PUNCTUATION-{", + "value": "{" + }, + { + "char": 9, + "line": 15, + "type": "IDENTIFIER", + "value": "factorial" + }, + { + "char": 19, + "line": 15, + "type": "PUNCTUATION-*=", + "value": "*=" + }, + { + "char": 22, + "line": 15, + "type": "IDENTIFIER", + "value": "i" + }, + { + "char": 23, + "line": 15, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 5, + "line": 20, + "type": "PUNCTUATION-}", + "value": "}" + }, + { + "char": 5, + "line": 22, + "type": "IDENTIFIER", + "value": "cout" + }, + { + "char": 10, + "line": 22, + "type": "PUNCTUATION-<<", + "value": "<<" + }, + { + "char": 13, + "line": 22, + "type": "LITERAL", + "value": "\"Factorial of \"" + }, + { + "char": 29, + "line": 22, + "type": "PUNCTUATION-<<", + "value": "<<" + }, + { + "char": 32, + "line": 22, + "type": "IDENTIFIER", + "value": "n" + }, + { + "char": 34, + "line": 22, + "type": "PUNCTUATION-<<", + "value": "<<" + }, + { + "char": 37, + "line": 22, + "type": "LITERAL", + "value": "\" = \"" + }, + { + "char": 43, + "line": 22, + "type": "PUNCTUATION-<<", + "value": "<<" + }, + { + "char": 46, + "line": 22, + "type": "IDENTIFIER", + "value": "factorial" + }, + { + "char": 55, + "line": 22, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 5, + "line": 23, + "type": "KEYWORD", + "value": "return" + }, + { + "char": 12, + "line": 23, + "type": "LITERAL", + "value": "0" + }, + { + "char": 13, + 
"line": 23, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 1, + "line": 24, + "type": "PUNCTUATION-}", + "value": "}" + } +] diff --git a/tests/data/tokenizer/c/input.cpp b/tests/data/tokenizer/c/input.cpp index f606f25..c520596 100644 --- a/tests/data/tokenizer/c/input.cpp +++ b/tests/data/tokenizer/c/input.cpp @@ -3,17 +3,22 @@ using namespace std; int main() { - unsigned int n; - unsigned long long factorial = 1; + unsigned int n; // define a variable + unsigned long long factorial = 1; // define a variable and set it equal to 1 - cout << "Enter a positive integer: "; + cout << "Enter a positive integer: "; // print something cin >> n; + // loop from 1 to n and multiply the previous result by i for(int i = 1; i <=n; ++i) { factorial *= i; + /* + factorial += i; // this doesn't work + factorial -= i; // this doesn't work either + */ } - cout << "Factorial of " << n << " = " << factorial; + cout << "Factorial of " << n << " = " << factorial; // print the result return 0; -} \ No newline at end of file +} diff --git a/tests/unittest/tests.py b/tests/unittest/tests.py index 98cbd86..08d3c89 100644 --- a/tests/unittest/tests.py +++ b/tests/unittest/tests.py @@ -165,6 +165,32 @@ def testCTokenizer(self): self.assertEqual(actual_output, expected_output) + def testCTokenizerIgnoreComments(self): + self.maxDiff = None + + with TemporaryDirectory() as temp_dir: + input_file = Path(test_data_dir, "tokenizer", "c", "input.cpp") + output_file_with_comments = Path(temp_dir, "output_with_comments.json") + output_file_ignore_comments = Path(temp_dir, "output_ignore_comments.json") + expected_output_file_ignore_comments = Path(test_data_dir, "tokenizer", "c", "expected_output", "output_ignore_comments.json") + expected_output_file_with_comments = Path(test_data_dir, "tokenizer", "c", "expected_output", "output.json") + + subprocess.check_call(f"python3 {str(Path(lichen_installation_dir, 'bin', 'c_tokenizer.py'))} {str(input_file)} --ignore_comments > {str(output_file_ignore_comments)}", shell=True) + subprocess.check_call(f"python3 {str(Path(lichen_installation_dir, 'bin', 'c_tokenizer.py'))} {str(input_file)} > {str(output_file_with_comments)}", shell=True) + + with open(output_file_with_comments) as file: + actual_output_with_comments = json.load(file) + with open(output_file_ignore_comments) as file: + actual_output_ignore_comments = json.load(file) + + with open(expected_output_file_with_comments) as file: + expected_output_with_comments = json.load(file) + with open(expected_output_file_ignore_comments) as file: + expected_output_ignore_comments = json.load(file) + + self.assertEqual(actual_output_with_comments, expected_output_with_comments) + self.assertEqual(actual_output_ignore_comments, expected_output_ignore_comments) + class TestPythonTokenizer(unittest.TestCase): def testPythonTokenizer(self): diff --git a/tokenizer/c/c_tokenizer.py b/tokenizer/c/c_tokenizer.py index d71bcb8..37ffba6 100644 --- a/tokenizer/c/c_tokenizer.py +++ b/tokenizer/c/c_tokenizer.py @@ -1,42 +1,59 @@ import clang.cindex import json -import sys import shutil import tempfile import os +import argparse -# apparently, the file name must end in .cpp (or some standard -# c/c++ suffix to be successfully tokenized) +def parse_args(): + parser = argparse.ArgumentParser(description='C Tokenizer') + parser.add_argument('input_file') + parser.add_argument('--ignore_comments', action='store_true') + return parser.parse_args() -# make a temprary filename -tmp_cpp_file_handle, tmp_cpp_file_name = 
tempfile.mkstemp(suffix=".cpp") -# copy the concatenated file to the temporary file location -shutil.copy(sys.argv[1], tmp_cpp_file_name) -if (os.path.isfile("/usr/lib/llvm-6.0/lib/libclang.so.1")): - clang.cindex.Config.set_library_file("/usr/lib/llvm-6.0/lib/libclang.so.1") -elif (os.path.isfile("/usr/lib/llvm-3.8/lib/libclang-3.8.so.1")): - clang.cindex.Config.set_library_file("/usr/lib/llvm-3.8/lib/libclang-3.8.so.1") -idx = clang.cindex.Index.create() +def main(): + args = parse_args() -# parse the input file -parsed_data = idx.parse(tmp_cpp_file_name) + # apparently, the file name must end in .cpp (or some standard + # c/c++ suffix to be successfully tokenized) -# remove the temporary file -os.remove(tmp_cpp_file_name) + # make a temprary filename + tmp_cpp_file_handle, tmp_cpp_file_name = tempfile.mkstemp(suffix='.cpp') + # copy the concatenated file to the temporary file location + shutil.copy(args.input_file, tmp_cpp_file_name) -tokens = [] + if (os.path.isfile('/usr/lib/llvm-6.0/lib/libclang.so.1')): + clang.cindex.Config.set_library_file('/usr/lib/llvm-6.0/lib/libclang.so.1') + elif (os.path.isfile('/usr/lib/llvm-3.8/lib/libclang-3.8.so.1')): + clang.cindex.Config.set_library_file('/usr/lib/llvm-3.8/lib/libclang-3.8.so.1') + idx = clang.cindex.Index.create() -for token in parsed_data.get_tokens(extent=parsed_data.cursor.extent): - tmp = dict() - tmp["line"] = int(token.location.line) - tmp["char"] = int(token.location.column) - tmp["type"] = (str(token.kind))[10:] - if tmp["type"] == "PUNCTUATION": - tmp["type"] += "-" + str(token.spelling) - tmp["value"] = str(token.spelling) - tokens.append(tmp) + # parse the input file + parsed_data = idx.parse(tmp_cpp_file_name) + # remove the temporary file + os.remove(tmp_cpp_file_name) -print(json.dumps(tokens, indent=4, sort_keys=True)) + tokens = [] + + for token in parsed_data.get_tokens(extent=parsed_data.cursor.extent): + tmp = dict() + tmp['line'] = int(token.location.line) + tmp['char'] = int(token.location.column) + tmp['type'] = (str(token.kind))[10:] + if tmp['type'] == 'PUNCTUATION': + tmp['type'] += '-' + str(token.spelling) + tmp['value'] = str(token.spelling) + + if args.ignore_comments and tmp['type'] == 'COMMENT': + continue + + tokens.append(tmp) + + print(json.dumps(tokens, indent=4, sort_keys=True)) + + +if __name__ == '__main__': + main() diff --git a/tokenizer/data.json b/tokenizer/data.json index 8b9bf85..a60e04f 100644 --- a/tokenizer/data.json +++ b/tokenizer/data.json @@ -2,7 +2,7 @@ "plaintext": { "tokenizer": "plaintext_tokenizer.py", "command_executable": "python3", - "input_as_argument": true, + "input_as_argument": false, "command_args": [ "--ignore_newlines" ], @@ -17,7 +17,10 @@ "cpp": { "tokenizer": "c_tokenizer.py", "command_executable": "python3", - "input_as_argument": true, + "input_as_argument": false, + "command_args": [ + "--ignore_comments" + ], "token_value": "type" }, "java": {