Rough draft of core plagiarism detection process (#5)
* initial tokenize_all script

* wip

* first draft of hash comparison

* wip

* wip

* fix nested directories in concatenate

* improve comments for initial PR

* tweak
bmcutler authored Jun 13, 2018
1 parent fbc44d5 commit ed02455
Showing 7 changed files with 614 additions and 25 deletions.
92 changes: 92 additions & 0 deletions bin/concatenate_all.py
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""
Walks the submission directory and creates a parallel directory of
the concatenated files.
"""

import argparse
import os
import json
import sys

CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config')
with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file:
    OPEN_JSON = json.load(open_file)
SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']


def parse_args():
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("semester")
    parser.add_argument("course")
    parser.add_argument("gradeable")
    return parser.parse_args()


def main():
    args = parse_args()

    sys.stdout.write("CONCATENATE ALL...")
    sys.stdout.flush()

    # ===========================================================================
    # error checking
    course_dir = os.path.join(SUBMITTY_DATA_DIR, "courses", args.semester, args.course)
    if not os.path.isdir(course_dir):
        print("ERROR!", course_dir, "is not a valid course directory")
        sys.exit(1)
    submission_dir = os.path.join(course_dir, "submissions", args.gradeable)
    if not os.path.isdir(submission_dir):
        print("ERROR!", submission_dir, "is not a valid gradeable submissions directory")
        sys.exit(1)

    # ===========================================================================
    # create the directory
    concatenated_dir = os.path.join(course_dir, "lichen", "concatenated", args.gradeable)
    if not os.path.isdir(concatenated_dir):
        os.makedirs(concatenated_dir)

    # ===========================================================================
    # walk the subdirectories
    for user in os.listdir(submission_dir):
        if not os.path.isdir(os.path.join(submission_dir, user)):
            continue
        for version in os.listdir(os.path.join(submission_dir, user)):
            if not os.path.isdir(os.path.join(submission_dir, user, version)):
                continue

            # ---------------------------------------------------------------------
            # concatenate all files for this gradeable/user/version into a single file
            my_concatenated_dir = os.path.join(concatenated_dir, user, version)
            if not os.path.isdir(my_concatenated_dir):
                os.makedirs(my_concatenated_dir)
            my_concatenated_file = os.path.join(my_concatenated_dir, "submission.concatenated")
            with open(my_concatenated_file, 'w') as my_cf:
                # print a brief header of information
                my_cf.write("SEMESTER: " + args.semester + "\n")
                my_cf.write("COURSE: " + args.course + "\n")
                my_cf.write("GRADEABLE: " + args.gradeable + "\n")
                my_cf.write("USER: " + user + "\n")
                my_cf.write("VERSION: " + version + "\n")
                # loop over all files in all subdirectories
                base_path = os.path.join(submission_dir, user, version)
                for my_dir, dirs, my_files in os.walk(base_path):
                    for my_file in sorted(my_files):
                        # skip the timestamp file created at submission time
                        if my_file == ".submit.timestamp":
                            continue
                        # TODO: skip files that should be ignored
                        absolute_path = os.path.join(my_dir, my_file)
                        # path relative to this user/version directory (keeps a leading separator)
                        relative_path = absolute_path[len(base_path):]
                        # print a separator & filename
                        my_cf.write("----------------------------------------------------\n")
                        my_cf.write("FILE: " + relative_path + "\n\n")
                        # append the contents of the file (assumed to be text)
                        with open(absolute_path) as tmp:
                            my_cf.write(tmp.read() + "\n")

    print("done")


if __name__ == "__main__":
    main()
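For illustration, a hypothetical output file for user alice, version 1 (the semester, course, and gradeable names here are made up) would look like this: the five header lines, then a separator and relative path before each file's contents.

SEMESTER: s18
COURSE: csci1100
GRADEABLE: hw01
USER: alice
VERSION: 1
----------------------------------------------------
FILE: /part1/main.py

<contents of main.py>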
107 changes: 107 additions & 0 deletions bin/hash_all.py
@@ -0,0 +1,107 @@
#!/usr/bin/env python3
"""
Walks the tokenized directory and creates a parallel directory of
the hashed files.
"""

import argparse
import os
import json
import sys
import hashlib


CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config')
with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file:
    OPEN_JSON = json.load(open_file)
SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']


def parse_args():
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("semester")
    parser.add_argument("course")
    parser.add_argument("gradeable")
    parser.add_argument("--window", type=int, default=10)
    # TODO: --hash_size is accepted but not used yet in this draft
    parser.add_argument("--hash_size", type=int, default=100000)
    language = parser.add_mutually_exclusive_group(required=True)
    language.add_argument("--plaintext", action='store_true')
    language.add_argument("--python", action='store_true')
    language.add_argument("--cpp", action='store_true')

    args = parser.parse_args()

    if args.window < 1:
        print("ERROR! window must be >= 1")
        sys.exit(1)

    return args


def hasher(args, my_tokenized_file, my_hashes_file):
    with open(my_tokenized_file, 'r') as my_tf:
        with open(my_hashes_file, 'w') as my_hf:
            tokens = json.load(my_tf)
            num = len(tokens)
            # slide a window across the token sequence (num-window+1 windows)
            for i in range(0, num - args.window + 1):
                window_string = ""
                if args.plaintext:
                    for j in range(0, args.window):
                        window_string += str(tokens[i + j].get("value"))
                elif args.python:
                    print("NEED A PYTHON HASHER")
                elif args.cpp:
                    print("NEED A C++ HASHER")
                else:
                    print("UNKNOWN HASHER")
                hash_object = hashlib.md5(window_string.encode())
                hash_object_string = hash_object.hexdigest()
                # FIXME: this truncation should be adjusted after more full-scale testing
                hash_object_string_truncated = hash_object_string[0:4]
                # my_hf.write(hash_object_string+"\n")
                my_hf.write(hash_object_string_truncated + "\n")


def main():
    args = parse_args()

    sys.stdout.write("HASH ALL...")
    sys.stdout.flush()

    # ===========================================================================
    # error checking
    course_dir = os.path.join(SUBMITTY_DATA_DIR, "courses", args.semester, args.course)
    if not os.path.isdir(course_dir):
        print("ERROR!", course_dir, "is not a valid course directory")
        sys.exit(1)
    tokenized_dir = os.path.join(course_dir, "lichen", "tokenized", args.gradeable)
    if not os.path.isdir(tokenized_dir):
        print("ERROR!", tokenized_dir, "is not a valid gradeable tokenized directory")
        sys.exit(1)

    hashes_dir = os.path.join(course_dir, "lichen", "hashes", args.gradeable)

    # ===========================================================================
    # walk the subdirectories
    for user in os.listdir(tokenized_dir):
        for version in os.listdir(os.path.join(tokenized_dir, user)):
            my_tokenized_file = os.path.join(tokenized_dir, user, version, "tokens.json")

            # ===========================================================================
            # create the directory
            my_hashes_dir = os.path.join(hashes_dir, user, version)
            if not os.path.isdir(my_hashes_dir):
                os.makedirs(my_hashes_dir)

            my_hashes_file = os.path.join(my_hashes_dir, "hashes.txt")
            hasher(args, my_tokenized_file, my_hashes_file)

    print("done")


if __name__ == "__main__":
    main()
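The final step of the pipeline is the compiled program compare_hashes.out (added in this commit but not rendered in this view). Its core idea, per the commit message, is hash comparison: flag token windows whose hashes collide across different users. A minimal Python sketch of that idea follows; it reuses the hashes.txt layout produced above, but the matching logic is an assumption for illustration, not the actual implementation.

#!/usr/bin/env python3
# Hypothetical sketch of the hash comparison step, NOT the actual
# compare_hashes.out program from this commit.
import os
from collections import defaultdict

def find_matches(hashes_dir):
    # map each truncated hash to every (user, version, window index) where it occurs
    locations = defaultdict(list)
    for user in os.listdir(hashes_dir):
        for version in os.listdir(os.path.join(hashes_dir, user)):
            hashes_file = os.path.join(hashes_dir, user, version, "hashes.txt")
            with open(hashes_file) as hf:
                for index, line in enumerate(hf):
                    locations[line.strip()].append((user, version, index))
    # report hashes shared by two or more distinct users
    # (4 hex chars = 65536 buckets, so the truncation in hash_all.py
    # will also produce accidental collisions; see the FIXME above)
    for h, occurrences in locations.items():
        users = {user for user, _, _ in occurrences}
        if len(users) > 1:
            print(h, occurrences)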
12 changes: 12 additions & 0 deletions bin/process_all.sh
@@ -0,0 +1,12 @@
#!/bin/bash

semester=$1
course=$2
gradeable=$3

/usr/local/submitty/Lichen/bin/concatenate_all.py "$semester" "$course" "$gradeable"
/usr/local/submitty/Lichen/bin/tokenize_all.py "$semester" "$course" "$gradeable" --plaintext
/usr/local/submitty/Lichen/bin/hash_all.py "$semester" "$course" "$gradeable" --window 5 --plaintext

/usr/local/submitty/Lichen/bin/compare_hashes.out "$semester" "$course" "$gradeable"
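A hypothetical invocation, running the whole pipeline for one gradeable (the semester, course, and gradeable names are made up):

/usr/local/submitty/Lichen/bin/process_all.sh s18 csci1100 hw01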

84 changes: 84 additions & 0 deletions bin/tokenize_all.py
@@ -0,0 +1,84 @@
#!/usr/bin/env python3
"""
Tokenizes the concatenated files.
"""

import argparse
import os
import json
import subprocess
import sys


CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config')
with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file:
    OPEN_JSON = json.load(open_file)
SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']


def parse_args():
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("semester")
    parser.add_argument("course")
    parser.add_argument("gradeable")
    language = parser.add_mutually_exclusive_group(required=True)
    language.add_argument("--plaintext", action='store_true')
    language.add_argument("--python", action='store_true')
    language.add_argument("--cpp", action='store_true')
    return parser.parse_args()


def tokenize(args, my_concatenated_file, my_tokenized_file):
    if args.plaintext:
        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR, "Lichen", "bin", "plaintext_tokenizer.out")
        with open(my_concatenated_file, 'r') as infile:
            with open(my_tokenized_file, 'w') as outfile:
                subprocess.call([tokenizer, "--ignore_newlines"], stdin=infile, stdout=outfile)
    elif args.python:
        print("NEED A PYTHON TOKENIZER")
    elif args.cpp:
        print("NEED A C++ TOKENIZER")
    else:
        print("UNKNOWN TOKENIZER")


def main():
    args = parse_args()

    sys.stdout.write("TOKENIZE ALL...")
    sys.stdout.flush()

    # ===========================================================================
    # error checking
    course_dir = os.path.join(SUBMITTY_DATA_DIR, "courses", args.semester, args.course)
    if not os.path.isdir(course_dir):
        print("ERROR!", course_dir, "is not a valid course directory")
        sys.exit(1)
    concatenated_dir = os.path.join(course_dir, "lichen", "concatenated", args.gradeable)
    if not os.path.isdir(concatenated_dir):
        print("ERROR!", concatenated_dir, "is not a valid gradeable concatenated directory")
        sys.exit(1)

    tokenized_dir = os.path.join(course_dir, "lichen", "tokenized", args.gradeable)

    # ===========================================================================
    # walk the subdirectories
    for user in os.listdir(concatenated_dir):
        for version in os.listdir(os.path.join(concatenated_dir, user)):
            my_concatenated_file = os.path.join(concatenated_dir, user, version, "submission.concatenated")

            # ===========================================================================
            # create the directory
            my_tokenized_dir = os.path.join(tokenized_dir, user, version)
            if not os.path.isdir(my_tokenized_dir):
                os.makedirs(my_tokenized_dir)

            my_tokenized_file = os.path.join(my_tokenized_dir, "tokens.json")
            tokenize(args, my_concatenated_file, my_tokenized_file)

    print("done")


if __name__ == "__main__":
    main()
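hash_all.py reads tokens.json as a JSON array and looks up a "value" key on each element, so the plaintext tokenizer's output presumably looks something like the following. Only the "value" field is confirmed by the code above; the other field names are a guess.

[
    {"line": 1, "char": 1, "type": "string", "value": "SEMESTER:"},
    {"line": 1, "char": 11, "type": "string", "value": "s18"}
]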
