-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Rough draft of core plagiarism detection process (#5)
* initial tokenize_all script * wip * first draft of hash comparison * wip * wip * fix nested directories in concatenate * improve comments for initial PR * tweak
- Loading branch information
Showing
7 changed files
with
614 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
#!/usr/bin/env python3
"""
Walks the submission directory and creates a parallel directory of
the concatenated files.

For every user/version submission it writes a single
``submission.concatenated`` file containing a short header plus the
contents of every submitted file.
"""

import argparse
import os
import json
import sys

# Locate the Submitty config directory relative to this script (../../config)
# and pull the top-level data/install paths out of submitty.json.
# NOTE: this runs at import time and will raise if submitty.json is absent.
CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config')
with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file:
    OPEN_JSON = json.load(open_file)
SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']
|
||
|
||
def parse_args():
    """Parse the semester/course/gradeable positional command-line arguments.

    Returns:
        argparse.Namespace with ``semester``, ``course``, ``gradeable``.
    """
    # Original passed description="" — give --help something useful to print.
    parser = argparse.ArgumentParser(
        description="Concatenate each submission into a single file for plagiarism detection.")
    parser.add_argument("semester")
    parser.add_argument("course")
    parser.add_argument("gradeable")
    return parser.parse_args()
|
||
|
||
def main():
    """Concatenate every user/version submission for one gradeable.

    Reads from  <data>/courses/<sem>/<course>/submissions/<gradeable>/ and
    writes to   <data>/courses/<sem>/<course>/lichen/concatenated/<gradeable>/.
    Exits with status 1 on an invalid course or gradeable directory.
    """
    args = parse_args()

    sys.stdout.write("CONCATENATE ALL...")
    sys.stdout.flush()

    # ===========================================================================
    # error checking
    course_dir = os.path.join(SUBMITTY_DATA_DIR, "courses", args.semester, args.course)
    if not os.path.isdir(course_dir):
        print("ERROR! ", course_dir, " is not a valid course directory")
        # sys.exit instead of the builtin exit(): the builtin is intended for
        # interactive sessions and may be absent when run under some launchers.
        sys.exit(1)
    submission_dir = os.path.join(course_dir, "submissions", args.gradeable)
    if not os.path.isdir(submission_dir):
        print("ERROR! ", submission_dir, " is not a valid gradeable submissions directory")
        sys.exit(1)

    # ===========================================================================
    # create the output directory (idempotent)
    concatenated_dir = os.path.join(course_dir, "lichen", "concatenated", args.gradeable)
    os.makedirs(concatenated_dir, exist_ok=True)

    # ===========================================================================
    # walk the subdirectories; sorted() makes the processing order (and thus
    # log output) deterministic across runs
    for user in sorted(os.listdir(submission_dir)):
        if not os.path.isdir(os.path.join(submission_dir, user)):
            continue
        for version in sorted(os.listdir(os.path.join(submission_dir, user))):
            if not os.path.isdir(os.path.join(submission_dir, user, version)):
                continue

            # ---------------------------------------------------------------------
            # concatenate all files for this gradeable/user/version into a single file
            my_concatenated_dir = os.path.join(concatenated_dir, user, version)
            os.makedirs(my_concatenated_dir, exist_ok=True)
            my_concatenated_file = os.path.join(my_concatenated_dir, "submission.concatenated")
            with open(my_concatenated_file, 'w') as my_cf:
                # print a brief header of information
                my_cf.write("SEMESTER: "+args.semester+"\n")
                my_cf.write("COURSE: "+args.course+"\n")
                my_cf.write("GRADEABLE: "+args.gradeable+"\n")
                my_cf.write("USER: "+user+"\n")
                my_cf.write("VERSION: "+version+"\n")
                # loop over all files in all subdirectories
                base_path = os.path.join(submission_dir, user, version)
                for my_dir, dirs, my_files in os.walk(base_path):
                    for my_file in sorted(my_files):
                        # skip the submission timestamp bookkeeping file
                        if my_file == ".submit.timestamp":
                            continue
                        # TODO: skip files that should be ignored
                        absolute_path = os.path.join(my_dir, my_file)
                        relative_path = absolute_path[len(base_path):]
                        # print a separator & filename
                        my_cf.write("----------------------------------------------------\n")
                        my_cf.write("FILE: "+relative_path+"\n\n")
                        # errors="replace" so a stray binary/non-UTF-8 submission
                        # file cannot abort the whole concatenation run
                        with open(absolute_path, errors="replace") as tmp:
                            # append the contents of the file
                            my_cf.write(tmp.read()+"\n")

    print("done")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
#!/usr/bin/env python3
"""
Walks the submission directory and creates a parallel directory of
the tokenized files.

(Despite the docstring above, this script hashes already-tokenized
files: it writes a hashes.txt per user/version under lichen/hashes/.)
"""

import argparse
import os
import json
import subprocess
import sys
import json
# NOTE(review): `json` is imported twice (here and above) and `subprocess`
# appears unused in this file — both should be cleaned up.
import hashlib

# Locate the Submitty config directory relative to this script (../../config)
# and pull the top-level data/install paths out of submitty.json.
# NOTE: this runs at import time and will raise if submitty.json is absent.
CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config')
with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file:
    OPEN_JSON = json.load(open_file)
SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']
|
||
|
||
def parse_args():
    """Parse command-line arguments for the hashing stage.

    Positional: semester, course, gradeable.
    Options: --window (sliding-window size in tokens, must be >= 1),
    --hash_size, and exactly one required language flag
    (--plaintext / --python / --cpp).

    Exits with status 1 when --window is less than 1.
    """
    # Original passed description="" — give --help something useful to print.
    parser = argparse.ArgumentParser(
        description="Hash the tokenized submissions for plagiarism detection.")
    parser.add_argument("semester")
    parser.add_argument("course")
    parser.add_argument("gradeable")
    parser.add_argument("--window", type=int, default=10)
    parser.add_argument("--hash_size", type=int, default=100000)
    language = parser.add_mutually_exclusive_group(required=True)
    language.add_argument("--plaintext", action='store_true')
    language.add_argument("--python", action='store_true')
    language.add_argument("--cpp", action='store_true')

    args = parser.parse_args()

    if args.window < 1:
        print("ERROR! window must be >= 1")
        # sys.exit instead of the interactive-only builtin exit()
        sys.exit(1)

    return args
|
||
|
||
def hasher(args, my_tokenized_file, my_hashes_file):
    """Hash a sliding window of tokens and write one hash per line.

    Args:
        args: parsed namespace with ``window`` and a language flag.
        my_tokenized_file: path to a JSON list of token dicts (each with a
            "value" key for the plaintext tokenizer).
        my_hashes_file: output path; receives one truncated MD5 hex digest
            per window, newline-separated.
    """
    with open(my_tokenized_file, 'r') as my_tf, open(my_hashes_file, 'w') as my_hf:
        tokens = json.load(my_tf)
        num = len(tokens)
        # n tokens yield n - window + 1 windows; the original used
        # range(0, num - window) and silently dropped the final window.
        for i in range(0, num - args.window + 1):
            if args.plaintext:
                # join the window's token values into one string to hash
                foo = "".join(str(tokens[i + j].get("value"))
                              for j in range(args.window))
            elif args.python:
                print("NEED A PYTHON HASHER")
                foo = ""
            elif args.cpp:
                print("NEED A C++ HASHER")
                foo = ""
            else:
                print("UNKNOWN HASHER")
                foo = ""
            hash_object = hashlib.md5(foo.encode())
            hash_object_string = hash_object.hexdigest()
            # FIXME: this truncation should be adjusted after more full-scale testing
            hash_object_string_truncated = hash_object_string[0:4]
            my_hf.write(hash_object_string_truncated + "\n")
|
||
|
||
def main():
    """Hash every tokenized user/version submission for one gradeable.

    Reads from  <data>/courses/<sem>/<course>/lichen/tokenized/<gradeable>/ and
    writes to   <data>/courses/<sem>/<course>/lichen/hashes/<gradeable>/.
    Exits with status 1 on an invalid course or tokenized directory.
    """
    args = parse_args()

    sys.stdout.write("HASH ALL...")
    sys.stdout.flush()

    # ===========================================================================
    # error checking
    course_dir = os.path.join(SUBMITTY_DATA_DIR, "courses", args.semester, args.course)
    if not os.path.isdir(course_dir):
        print("ERROR! ", course_dir, " is not a valid course directory")
        # sys.exit instead of the interactive-only builtin exit()
        sys.exit(1)
    tokenized_dir = os.path.join(course_dir, "lichen", "tokenized", args.gradeable)
    if not os.path.isdir(tokenized_dir):
        print("ERROR! ", tokenized_dir, " is not a valid gradeable tokenized directory")
        sys.exit(1)

    hashes_dir = os.path.join(course_dir, "lichen", "hashes", args.gradeable)

    # ===========================================================================
    # walk the subdirectories; skip stray non-directory entries (matching the
    # guards in concatenate_all) and sort for deterministic processing order
    for user in sorted(os.listdir(tokenized_dir)):
        if not os.path.isdir(os.path.join(tokenized_dir, user)):
            continue
        for version in sorted(os.listdir(os.path.join(tokenized_dir, user))):
            if not os.path.isdir(os.path.join(tokenized_dir, user, version)):
                continue
            my_tokenized_file = os.path.join(tokenized_dir, user, version, "tokens.json")

            # ===========================================================================
            # create the output directory (idempotent)
            my_hashes_dir = os.path.join(hashes_dir, user, version)
            os.makedirs(my_hashes_dir, exist_ok=True)

            my_hashes_file = os.path.join(my_hashes_dir, "hashes.txt")
            hasher(args, my_tokenized_file, my_hashes_file)

    print("done")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#!/bin/bash
# Run the full Lichen plagiarism-detection pipeline for one gradeable:
# concatenate -> tokenize -> hash -> compare.
#
# Usage: <script> <semester> <course> <gradeable>

# Abort immediately if any stage fails; the original kept running and the
# later stages would operate on stale or missing intermediate output.
set -e

semester=$1
course=$2
gradeable=$3

# Quote all expansions so arguments containing spaces survive word splitting.
/usr/local/submitty/Lichen/bin/concatenate_all.py "$semester" "$course" "$gradeable"
/usr/local/submitty/Lichen/bin/tokenize_all.py "$semester" "$course" "$gradeable" --plaintext
/usr/local/submitty/Lichen/bin/hash_all.py "$semester" "$course" "$gradeable" --window 5 --plaintext

/usr/local/submitty/Lichen/bin/compare_hashes.out "$semester" "$course" "$gradeable"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
#!/usr/bin/env python3
"""
Tokenizes the concatenated files.

For each user/version submission.concatenated file this writes a
tokens.json under lichen/tokenized/ by invoking the language-specific
tokenizer binary.
"""

import argparse
import os
import json
import subprocess
import sys

# Locate the Submitty config directory relative to this script (../../config)
# and pull the top-level data/install paths out of submitty.json.
# NOTE: this runs at import time and will raise if submitty.json is absent.
CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config')
with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file:
    OPEN_JSON = json.load(open_file)
SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']
|
||
|
||
def parse_args():
    """Parse the semester/course/gradeable positionals plus exactly one
    required language flag (--plaintext / --python / --cpp).

    Returns:
        argparse.Namespace with the parsed arguments.
    """
    # Original passed description="" — give --help something useful to print.
    parser = argparse.ArgumentParser(
        description="Tokenize the concatenated submissions for plagiarism detection.")
    parser.add_argument("semester")
    parser.add_argument("course")
    parser.add_argument("gradeable")
    language = parser.add_mutually_exclusive_group(required=True)
    language.add_argument("--plaintext", action='store_true')
    language.add_argument("--python", action='store_true')
    language.add_argument("--cpp", action='store_true')
    return parser.parse_args()
|
||
|
||
def tokenize(args, my_concatenated_file, my_tokenized_file):
    """Run the language-specific tokenizer over one concatenated submission.

    Args:
        args: parsed namespace with the language flags.
        my_concatenated_file: input path (submission.concatenated).
        my_tokenized_file: output path (tokens.json), overwritten.

    Only the plaintext tokenizer exists so far; the other branches just
    print a placeholder message.
    """
    if args.plaintext:
        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR, "Lichen", "bin", "plaintext_tokenizer.out")
        with open(my_concatenated_file, 'r') as infile, open(my_tokenized_file, 'w') as outfile:
            rc = subprocess.call([tokenizer, "--ignore_newlines"], stdin=infile, stdout=outfile)
            # the original silently ignored the exit status, leaving a
            # truncated/empty tokens.json with no diagnostic on failure
            if rc != 0:
                print("WARNING: tokenizer exited with status", rc, "for", my_concatenated_file)
    elif args.python:
        print("NEED A PYTHON TOKENIZER")
    elif args.cpp:
        print("NEED A C++ TOKENIZER")
    else:
        print("UNKNOWN TOKENIZER")
|
||
def main():
    """Tokenize every concatenated user/version submission for one gradeable.

    Reads from  <data>/courses/<sem>/<course>/lichen/concatenated/<gradeable>/
    writes to   <data>/courses/<sem>/<course>/lichen/tokenized/<gradeable>/.
    Exits with status 1 on an invalid course or concatenated directory.
    """
    args = parse_args()

    sys.stdout.write("TOKENIZE ALL...")
    sys.stdout.flush()

    # ===========================================================================
    # error checking
    course_dir = os.path.join(SUBMITTY_DATA_DIR, "courses", args.semester, args.course)
    if not os.path.isdir(course_dir):
        print("ERROR! ", course_dir, " is not a valid course directory")
        # sys.exit instead of the interactive-only builtin exit()
        sys.exit(1)
    concatenated_dir = os.path.join(course_dir, "lichen", "concatenated", args.gradeable)
    if not os.path.isdir(concatenated_dir):
        print("ERROR! ", concatenated_dir, " is not a valid gradeable concatenated directory")
        sys.exit(1)

    tokenized_dir = os.path.join(course_dir, "lichen", "tokenized", args.gradeable)

    # ===========================================================================
    # walk the subdirectories; skip stray non-directory entries (matching the
    # guards in concatenate_all) and sort for deterministic processing order
    for user in sorted(os.listdir(concatenated_dir)):
        if not os.path.isdir(os.path.join(concatenated_dir, user)):
            continue
        for version in sorted(os.listdir(os.path.join(concatenated_dir, user))):
            if not os.path.isdir(os.path.join(concatenated_dir, user, version)):
                continue
            my_concatenated_file = os.path.join(concatenated_dir, user, version, "submission.concatenated")

            # ===========================================================================
            # create the output directory (idempotent)
            my_tokenized_dir = os.path.join(tokenized_dir, user, version)
            os.makedirs(my_tokenized_dir, exist_ok=True)

            my_tokenized_file = os.path.join(my_tokenized_dir, "tokens.json")
            tokenize(args, my_concatenated_file, my_tokenized_file)

    print("done")


if __name__ == "__main__":
    main()
Oops, something went wrong.