From 7ba0d8ddcfe17c6b47e8d3c91176dc87173faf0d Mon Sep 17 00:00:00 2001
From: William Allen <16820599+williamjallen@users.noreply.github.com>
Date: Mon, 22 Nov 2021 21:38:13 -0500
Subject: [PATCH] [Feature:Plagiarism] Add flag to ignore C++ comments (#69)

---
 bin/tokenize_all.py                           |   3 +-
 .../tokenizer/c/expected_output/output.json   | 102 +++--
 .../output_ignore_comments.json               | 404 ++++++++++++++++++
 tests/data/tokenizer/c/input.cpp              |  15 +-
 tests/unittest/tests.py                       |  26 ++
 tokenizer/c/c_tokenizer.py                    |  71 +--
 tokenizer/data.json                           |   5 +-
 7 files changed, 559 insertions(+), 67 deletions(-)
 create mode 100644 tests/data/tokenizer/c/expected_output/output_ignore_comments.json

diff --git a/bin/tokenize_all.py b/bin/tokenize_all.py
index 1a88445..b4304c8 100644
--- a/bin/tokenize_all.py
+++ b/bin/tokenize_all.py
@@ -27,7 +27,8 @@ def tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file):
 
     tokenizer = f"./{language_token_data['tokenizer']}"
 
-    if not language_token_data.get("input_as_argument"):
+    if language_token_data.get('input_as_argument') is None or \
+            language_token_data['input_as_argument'] is False:
         my_concatenated_file = f'< {my_concatenated_file}'
 
     if "command_args" in language_token_data:
diff --git a/tests/data/tokenizer/c/expected_output/output.json b/tests/data/tokenizer/c/expected_output/output.json
index 38f5f36..a0a5020 100644
--- a/tests/data/tokenizer/c/expected_output/output.json
+++ b/tests/data/tokenizer/c/expected_output/output.json
@@ -107,6 +107,12 @@
         "type": "PUNCTUATION-;",
         "value": ";"
     },
+    {
+        "char": 21,
+        "line": 6,
+        "type": "COMMENT",
+        "value": "// define a variable"
+    },
     {
         "char": 5,
         "line": 7,
@@ -149,6 +155,12 @@
         "type": "PUNCTUATION-;",
         "value": ";"
     },
+    {
+        "char": 39,
+        "line": 7,
+        "type": "COMMENT",
+        "value": "// define a variable and set it equal to 1"
+    },
     {
         "char": 5,
         "line": 9,
@@ -173,6 +185,12 @@
         "type": "PUNCTUATION-;",
         "value": ";"
     },
+    {
+        "char": 43,
+        "line": 9,
+        "type": "COMMENT",
+        "value": "// print something"
+    },
     {
         "char": 5,
         "line": 10,
@@ -200,204 +218,222 @@
     {
         "char": 5,
         "line": 12,
+        "type": "COMMENT",
+        "value": "// loop from 1 to n and multiply the previous result by i"
+    },
+    {
+        "char": 5,
+        "line": 13,
         "type": "KEYWORD",
         "value": "for"
     },
     {
         "char": 8,
-        "line": 12,
+        "line": 13,
         "type": "PUNCTUATION-(",
         "value": "("
     },
     {
         "char": 9,
-        "line": 12,
+        "line": 13,
         "type": "KEYWORD",
         "value": "int"
     },
     {
         "char": 13,
-        "line": 12,
+        "line": 13,
         "type": "IDENTIFIER",
         "value": "i"
     },
     {
         "char": 15,
-        "line": 12,
+        "line": 13,
         "type": "PUNCTUATION-=",
         "value": "="
     },
     {
         "char": 17,
-        "line": 12,
+        "line": 13,
         "type": "LITERAL",
         "value": "1"
     },
     {
         "char": 18,
-        "line": 12,
+        "line": 13,
         "type": "PUNCTUATION-;",
         "value": ";"
     },
     {
         "char": 20,
-        "line": 12,
+        "line": 13,
         "type": "IDENTIFIER",
         "value": "i"
     },
     {
         "char": 22,
-        "line": 12,
+        "line": 13,
         "type": "PUNCTUATION-<=",
         "value": "<="
     },
     {
         "char": 24,
-        "line": 12,
+        "line": 13,
         "type": "IDENTIFIER",
         "value": "n"
     },
     {
         "char": 25,
-        "line": 12,
+        "line": 13,
         "type": "PUNCTUATION-;",
         "value": ";"
     },
     {
         "char": 27,
-        "line": 12,
+        "line": 13,
         "type": "PUNCTUATION-++",
         "value": "++"
     },
     {
         "char": 29,
-        "line": 12,
+        "line": 13,
         "type": "IDENTIFIER",
         "value": "i"
     },
     {
         "char": 30,
-        "line": 12,
+        "line": 13,
         "type": "PUNCTUATION-)",
         "value": ")"
     },
     {
         "char": 5,
-        "line": 13,
+        "line": 14,
         "type": "PUNCTUATION-{",
         "value": "{"
     },
     {
         "char": 9,
-        "line": 14,
+        "line": 15,
         "type": "IDENTIFIER",
         "value": "factorial"
     },
     {
         "char": 19,
-        "line": 14,
+        "line": 15,
         "type": "PUNCTUATION-*=",
"value": "*=" }, { "char": 22, - "line": 14, + "line": 15, "type": "IDENTIFIER", "value": "i" }, { "char": 23, - "line": 14, + "line": 15, "type": "PUNCTUATION-;", "value": ";" }, + { + "char": 9, + "line": 16, + "type": "COMMENT", + "value": "/*\n factorial += i; // this doesn't work\n factorial -= i; // this doesn't work either\n */" + }, { "char": 5, - "line": 15, + "line": 20, "type": "PUNCTUATION-}", "value": "}" }, { "char": 5, - "line": 17, + "line": 22, "type": "IDENTIFIER", "value": "cout" }, { "char": 10, - "line": 17, + "line": 22, "type": "PUNCTUATION-<<", "value": "<<" }, { "char": 13, - "line": 17, + "line": 22, "type": "LITERAL", "value": "\"Factorial of \"" }, { "char": 29, - "line": 17, + "line": 22, "type": "PUNCTUATION-<<", "value": "<<" }, { "char": 32, - "line": 17, + "line": 22, "type": "IDENTIFIER", "value": "n" }, { "char": 34, - "line": 17, + "line": 22, "type": "PUNCTUATION-<<", "value": "<<" }, { "char": 37, - "line": 17, + "line": 22, "type": "LITERAL", "value": "\" = \"" }, { "char": 43, - "line": 17, + "line": 22, "type": "PUNCTUATION-<<", "value": "<<" }, { "char": 46, - "line": 17, + "line": 22, "type": "IDENTIFIER", "value": "factorial" }, { "char": 55, - "line": 17, + "line": 22, "type": "PUNCTUATION-;", "value": ";" }, + { + "char": 57, + "line": 22, + "type": "COMMENT", + "value": "// print the result" + }, { "char": 5, - "line": 18, + "line": 23, "type": "KEYWORD", "value": "return" }, { "char": 12, - "line": 18, + "line": 23, "type": "LITERAL", "value": "0" }, { "char": 13, - "line": 18, + "line": 23, "type": "PUNCTUATION-;", "value": ";" }, { "char": 1, - "line": 19, + "line": 24, "type": "PUNCTUATION-}", "value": "}" } diff --git a/tests/data/tokenizer/c/expected_output/output_ignore_comments.json b/tests/data/tokenizer/c/expected_output/output_ignore_comments.json new file mode 100644 index 0000000..f8e86a8 --- /dev/null +++ b/tests/data/tokenizer/c/expected_output/output_ignore_comments.json @@ -0,0 +1,404 @@ +[ + { + "char": 1, + "line": 1, + "type": "PUNCTUATION-#", + "value": "#" + }, + { + "char": 2, + "line": 1, + "type": "IDENTIFIER", + "value": "include" + }, + { + "char": 10, + "line": 1, + "type": "PUNCTUATION-<", + "value": "<" + }, + { + "char": 11, + "line": 1, + "type": "IDENTIFIER", + "value": "iostream" + }, + { + "char": 19, + "line": 1, + "type": "PUNCTUATION->", + "value": ">" + }, + { + "char": 1, + "line": 2, + "type": "KEYWORD", + "value": "using" + }, + { + "char": 7, + "line": 2, + "type": "KEYWORD", + "value": "namespace" + }, + { + "char": 17, + "line": 2, + "type": "IDENTIFIER", + "value": "std" + }, + { + "char": 20, + "line": 2, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 1, + "line": 4, + "type": "KEYWORD", + "value": "int" + }, + { + "char": 5, + "line": 4, + "type": "IDENTIFIER", + "value": "main" + }, + { + "char": 9, + "line": 4, + "type": "PUNCTUATION-(", + "value": "(" + }, + { + "char": 10, + "line": 4, + "type": "PUNCTUATION-)", + "value": ")" + }, + { + "char": 1, + "line": 5, + "type": "PUNCTUATION-{", + "value": "{" + }, + { + "char": 5, + "line": 6, + "type": "KEYWORD", + "value": "unsigned" + }, + { + "char": 14, + "line": 6, + "type": "KEYWORD", + "value": "int" + }, + { + "char": 18, + "line": 6, + "type": "IDENTIFIER", + "value": "n" + }, + { + "char": 19, + "line": 6, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 5, + "line": 7, + "type": "KEYWORD", + "value": "unsigned" + }, + { + "char": 14, + "line": 7, + "type": "KEYWORD", + "value": "long" + }, + { + "char": 19, + 
"line": 7, + "type": "KEYWORD", + "value": "long" + }, + { + "char": 24, + "line": 7, + "type": "IDENTIFIER", + "value": "factorial" + }, + { + "char": 34, + "line": 7, + "type": "PUNCTUATION-=", + "value": "=" + }, + { + "char": 36, + "line": 7, + "type": "LITERAL", + "value": "1" + }, + { + "char": 37, + "line": 7, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 5, + "line": 9, + "type": "IDENTIFIER", + "value": "cout" + }, + { + "char": 10, + "line": 9, + "type": "PUNCTUATION-<<", + "value": "<<" + }, + { + "char": 13, + "line": 9, + "type": "LITERAL", + "value": "\"Enter a positive integer: \"" + }, + { + "char": 41, + "line": 9, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 5, + "line": 10, + "type": "IDENTIFIER", + "value": "cin" + }, + { + "char": 9, + "line": 10, + "type": "PUNCTUATION->>", + "value": ">>" + }, + { + "char": 12, + "line": 10, + "type": "IDENTIFIER", + "value": "n" + }, + { + "char": 13, + "line": 10, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 5, + "line": 13, + "type": "KEYWORD", + "value": "for" + }, + { + "char": 8, + "line": 13, + "type": "PUNCTUATION-(", + "value": "(" + }, + { + "char": 9, + "line": 13, + "type": "KEYWORD", + "value": "int" + }, + { + "char": 13, + "line": 13, + "type": "IDENTIFIER", + "value": "i" + }, + { + "char": 15, + "line": 13, + "type": "PUNCTUATION-=", + "value": "=" + }, + { + "char": 17, + "line": 13, + "type": "LITERAL", + "value": "1" + }, + { + "char": 18, + "line": 13, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 20, + "line": 13, + "type": "IDENTIFIER", + "value": "i" + }, + { + "char": 22, + "line": 13, + "type": "PUNCTUATION-<=", + "value": "<=" + }, + { + "char": 24, + "line": 13, + "type": "IDENTIFIER", + "value": "n" + }, + { + "char": 25, + "line": 13, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 27, + "line": 13, + "type": "PUNCTUATION-++", + "value": "++" + }, + { + "char": 29, + "line": 13, + "type": "IDENTIFIER", + "value": "i" + }, + { + "char": 30, + "line": 13, + "type": "PUNCTUATION-)", + "value": ")" + }, + { + "char": 5, + "line": 14, + "type": "PUNCTUATION-{", + "value": "{" + }, + { + "char": 9, + "line": 15, + "type": "IDENTIFIER", + "value": "factorial" + }, + { + "char": 19, + "line": 15, + "type": "PUNCTUATION-*=", + "value": "*=" + }, + { + "char": 22, + "line": 15, + "type": "IDENTIFIER", + "value": "i" + }, + { + "char": 23, + "line": 15, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 5, + "line": 20, + "type": "PUNCTUATION-}", + "value": "}" + }, + { + "char": 5, + "line": 22, + "type": "IDENTIFIER", + "value": "cout" + }, + { + "char": 10, + "line": 22, + "type": "PUNCTUATION-<<", + "value": "<<" + }, + { + "char": 13, + "line": 22, + "type": "LITERAL", + "value": "\"Factorial of \"" + }, + { + "char": 29, + "line": 22, + "type": "PUNCTUATION-<<", + "value": "<<" + }, + { + "char": 32, + "line": 22, + "type": "IDENTIFIER", + "value": "n" + }, + { + "char": 34, + "line": 22, + "type": "PUNCTUATION-<<", + "value": "<<" + }, + { + "char": 37, + "line": 22, + "type": "LITERAL", + "value": "\" = \"" + }, + { + "char": 43, + "line": 22, + "type": "PUNCTUATION-<<", + "value": "<<" + }, + { + "char": 46, + "line": 22, + "type": "IDENTIFIER", + "value": "factorial" + }, + { + "char": 55, + "line": 22, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 5, + "line": 23, + "type": "KEYWORD", + "value": "return" + }, + { + "char": 12, + "line": 23, + "type": "LITERAL", + "value": "0" + }, + { + "char": 13, + 
"line": 23, + "type": "PUNCTUATION-;", + "value": ";" + }, + { + "char": 1, + "line": 24, + "type": "PUNCTUATION-}", + "value": "}" + } +] diff --git a/tests/data/tokenizer/c/input.cpp b/tests/data/tokenizer/c/input.cpp index f606f25..c520596 100644 --- a/tests/data/tokenizer/c/input.cpp +++ b/tests/data/tokenizer/c/input.cpp @@ -3,17 +3,22 @@ using namespace std; int main() { - unsigned int n; - unsigned long long factorial = 1; + unsigned int n; // define a variable + unsigned long long factorial = 1; // define a variable and set it equal to 1 - cout << "Enter a positive integer: "; + cout << "Enter a positive integer: "; // print something cin >> n; + // loop from 1 to n and multiply the previous result by i for(int i = 1; i <=n; ++i) { factorial *= i; + /* + factorial += i; // this doesn't work + factorial -= i; // this doesn't work either + */ } - cout << "Factorial of " << n << " = " << factorial; + cout << "Factorial of " << n << " = " << factorial; // print the result return 0; -} \ No newline at end of file +} diff --git a/tests/unittest/tests.py b/tests/unittest/tests.py index 98cbd86..08d3c89 100644 --- a/tests/unittest/tests.py +++ b/tests/unittest/tests.py @@ -165,6 +165,32 @@ def testCTokenizer(self): self.assertEqual(actual_output, expected_output) + def testCTokenizerIgnoreComments(self): + self.maxDiff = None + + with TemporaryDirectory() as temp_dir: + input_file = Path(test_data_dir, "tokenizer", "c", "input.cpp") + output_file_with_comments = Path(temp_dir, "output_with_comments.json") + output_file_ignore_comments = Path(temp_dir, "output_ignore_comments.json") + expected_output_file_ignore_comments = Path(test_data_dir, "tokenizer", "c", "expected_output", "output_ignore_comments.json") + expected_output_file_with_comments = Path(test_data_dir, "tokenizer", "c", "expected_output", "output.json") + + subprocess.check_call(f"python3 {str(Path(lichen_installation_dir, 'bin', 'c_tokenizer.py'))} {str(input_file)} --ignore_comments > {str(output_file_ignore_comments)}", shell=True) + subprocess.check_call(f"python3 {str(Path(lichen_installation_dir, 'bin', 'c_tokenizer.py'))} {str(input_file)} > {str(output_file_with_comments)}", shell=True) + + with open(output_file_with_comments) as file: + actual_output_with_comments = json.load(file) + with open(output_file_ignore_comments) as file: + actual_output_ignore_comments = json.load(file) + + with open(expected_output_file_with_comments) as file: + expected_output_with_comments = json.load(file) + with open(expected_output_file_ignore_comments) as file: + expected_output_ignore_comments = json.load(file) + + self.assertEqual(actual_output_with_comments, expected_output_with_comments) + self.assertEqual(actual_output_ignore_comments, expected_output_ignore_comments) + class TestPythonTokenizer(unittest.TestCase): def testPythonTokenizer(self): diff --git a/tokenizer/c/c_tokenizer.py b/tokenizer/c/c_tokenizer.py index d71bcb8..37ffba6 100644 --- a/tokenizer/c/c_tokenizer.py +++ b/tokenizer/c/c_tokenizer.py @@ -1,42 +1,59 @@ import clang.cindex import json -import sys import shutil import tempfile import os +import argparse -# apparently, the file name must end in .cpp (or some standard -# c/c++ suffix to be successfully tokenized) +def parse_args(): + parser = argparse.ArgumentParser(description='C Tokenizer') + parser.add_argument('input_file') + parser.add_argument('--ignore_comments', action='store_true') + return parser.parse_args() -# make a temprary filename -tmp_cpp_file_handle, tmp_cpp_file_name = 
tempfile.mkstemp(suffix=".cpp") -# copy the concatenated file to the temporary file location -shutil.copy(sys.argv[1], tmp_cpp_file_name) -if (os.path.isfile("/usr/lib/llvm-6.0/lib/libclang.so.1")): - clang.cindex.Config.set_library_file("/usr/lib/llvm-6.0/lib/libclang.so.1") -elif (os.path.isfile("/usr/lib/llvm-3.8/lib/libclang-3.8.so.1")): - clang.cindex.Config.set_library_file("/usr/lib/llvm-3.8/lib/libclang-3.8.so.1") -idx = clang.cindex.Index.create() +def main(): + args = parse_args() -# parse the input file -parsed_data = idx.parse(tmp_cpp_file_name) + # apparently, the file name must end in .cpp (or some standard + # c/c++ suffix to be successfully tokenized) -# remove the temporary file -os.remove(tmp_cpp_file_name) + # make a temprary filename + tmp_cpp_file_handle, tmp_cpp_file_name = tempfile.mkstemp(suffix='.cpp') + # copy the concatenated file to the temporary file location + shutil.copy(args.input_file, tmp_cpp_file_name) -tokens = [] + if (os.path.isfile('/usr/lib/llvm-6.0/lib/libclang.so.1')): + clang.cindex.Config.set_library_file('/usr/lib/llvm-6.0/lib/libclang.so.1') + elif (os.path.isfile('/usr/lib/llvm-3.8/lib/libclang-3.8.so.1')): + clang.cindex.Config.set_library_file('/usr/lib/llvm-3.8/lib/libclang-3.8.so.1') + idx = clang.cindex.Index.create() -for token in parsed_data.get_tokens(extent=parsed_data.cursor.extent): - tmp = dict() - tmp["line"] = int(token.location.line) - tmp["char"] = int(token.location.column) - tmp["type"] = (str(token.kind))[10:] - if tmp["type"] == "PUNCTUATION": - tmp["type"] += "-" + str(token.spelling) - tmp["value"] = str(token.spelling) - tokens.append(tmp) + # parse the input file + parsed_data = idx.parse(tmp_cpp_file_name) + # remove the temporary file + os.remove(tmp_cpp_file_name) -print(json.dumps(tokens, indent=4, sort_keys=True)) + tokens = [] + + for token in parsed_data.get_tokens(extent=parsed_data.cursor.extent): + tmp = dict() + tmp['line'] = int(token.location.line) + tmp['char'] = int(token.location.column) + tmp['type'] = (str(token.kind))[10:] + if tmp['type'] == 'PUNCTUATION': + tmp['type'] += '-' + str(token.spelling) + tmp['value'] = str(token.spelling) + + if args.ignore_comments and tmp['type'] == 'COMMENT': + continue + + tokens.append(tmp) + + print(json.dumps(tokens, indent=4, sort_keys=True)) + + +if __name__ == '__main__': + main() diff --git a/tokenizer/data.json b/tokenizer/data.json index 8b9bf85..a60e04f 100644 --- a/tokenizer/data.json +++ b/tokenizer/data.json @@ -2,7 +2,7 @@ "plaintext": { "tokenizer": "plaintext_tokenizer.py", "command_executable": "python3", - "input_as_argument": true, + "input_as_argument": false, "command_args": [ "--ignore_newlines" ], @@ -17,7 +17,10 @@ "cpp": { "tokenizer": "c_tokenizer.py", "command_executable": "python3", - "input_as_argument": true, + "input_as_argument": false, + "command_args": [ + "--ignore_comments" + ], "token_value": "type" }, "java": {