diff --git a/bin/concatenate_all.py b/bin/concatenate_all.py
new file mode 100644
index 0000000..98124c0
--- /dev/null
+++ b/bin/concatenate_all.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+"""
+Walks the submission directory and creates a parallel directory of
+the concatenated files.
+"""
+
+import argparse
+import os
+import json
+import sys
+
+CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config')
+with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file:
+    OPEN_JSON = json.load(open_file)
+SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
+SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="")
+    parser.add_argument("semester")
+    parser.add_argument("course")
+    parser.add_argument("gradeable")
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    sys.stdout.write("CONCATENATE ALL...")
+    sys.stdout.flush()
+
+    # ===========================================================================
+    # error checking
+    course_dir = os.path.join(SUBMITTY_DATA_DIR, "courses", args.semester, args.course)
+    if not os.path.isdir(course_dir):
+        print("ERROR!", course_dir, "is not a valid course directory")
+        sys.exit(1)
+    submission_dir = os.path.join(course_dir, "submissions", args.gradeable)
+    if not os.path.isdir(submission_dir):
+        print("ERROR!", submission_dir, "is not a valid gradeable submissions directory")
+        sys.exit(1)
+
+    # ===========================================================================
+    # create the directory
+    concatenated_dir = os.path.join(course_dir, "lichen", "concatenated", args.gradeable)
+    if not os.path.isdir(concatenated_dir):
+        os.makedirs(concatenated_dir)
+
+    # ===========================================================================
+    # walk the subdirectories
+    for user in os.listdir(submission_dir):
+        if not os.path.isdir(os.path.join(submission_dir, user)):
+            continue
+        for version in os.listdir(os.path.join(submission_dir, user)):
+            if not os.path.isdir(os.path.join(submission_dir, user, version)):
+                continue
+
+            # ---------------------------------------------------------------------
+            # concatenate all files for this gradeable/user/version into a single file
+            my_concatenated_dir = os.path.join(concatenated_dir, user, version)
+            if not os.path.isdir(my_concatenated_dir):
+                os.makedirs(my_concatenated_dir)
+            my_concatenated_file = os.path.join(my_concatenated_dir, "submission.concatenated")
+            with open(my_concatenated_file, 'w') as my_cf:
+                # write a brief header of information
+                my_cf.write("SEMESTER: " + args.semester + "\n")
+                my_cf.write("COURSE: " + args.course + "\n")
+                my_cf.write("GRADEABLE: " + args.gradeable + "\n")
+                my_cf.write("USER: " + user + "\n")
+                my_cf.write("VERSION: " + version + "\n")
+                # loop over all files in all subdirectories
+                base_path = os.path.join(submission_dir, user, version)
+                for my_dir, _dirs, my_files in os.walk(base_path):
+                    for my_file in sorted(my_files):
+                        # skip the submission timestamp file
+                        if my_file == ".submit.timestamp":
+                            continue
+                        # TODO: skip files that should be ignored
+                        absolute_path = os.path.join(my_dir, my_file)
+                        relative_path = absolute_path[len(base_path):]
+                        # write a separator & filename
+                        my_cf.write("----------------------------------------------------\n")
+                        my_cf.write("FILE: " + relative_path + "\n\n")
+                        with open(absolute_path) as tmp:
+                            # append the contents of the file
+                            my_cf.write(tmp.read() + "\n")
+
+    print("done")
+
+
+if __name__ == "__main__":
+    main()
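
The concatenated file written above has a simple line-oriented layout: a five-line header, then one separator line and one "FILE: " line per file, followed by that file's contents. As a minimal sketch (the local filename is hypothetical; the separator and prefix mirror what concatenate_all.py writes), the format can be split back apart like this:

    SEPARATOR = "----------------------------------------------------\n"

    with open("submission.concatenated") as f:  # hypothetical local copy
        header, *sections = f.read().split(SEPARATOR)

    print(header)  # the SEMESTER/COURSE/GRADEABLE/USER/VERSION lines
    for section in sections:
        name, _, contents = section.partition("\n\n")
        print(name)  # e.g. "FILE: /part1/main.py" (hypothetical path)
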
diff --git a/bin/hash_all.py b/bin/hash_all.py
new file mode 100644
index 0000000..6661d9f
--- /dev/null
+++ b/bin/hash_all.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+"""
+Walks the tokenized directory and creates a parallel directory of
+the hashed files.
+"""
+
+import argparse
+import os
+import json
+import sys
+import hashlib
+
+
+CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config')
+with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file:
+    OPEN_JSON = json.load(open_file)
+SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
+SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="")
+    parser.add_argument("semester")
+    parser.add_argument("course")
+    parser.add_argument("gradeable")
+    parser.add_argument("--window", type=int, default=10)
+    # note: --hash_size is accepted but not yet used below
+    parser.add_argument("--hash_size", type=int, default=100000)
+    language = parser.add_mutually_exclusive_group(required=True)
+    language.add_argument("--plaintext", action='store_true')
+    language.add_argument("--python", action='store_true')
+    language.add_argument("--cpp", action='store_true')
+
+    args = parser.parse_args()
+
+    if args.window < 1:
+        print("ERROR! window must be >= 1")
+        sys.exit(1)
+
+    return args
+
+
+def hasher(args, my_tokenized_file, my_hashes_file):
+    with open(my_tokenized_file, 'r') as my_tf:
+        with open(my_hashes_file, 'w') as my_hf:
+            tokens = json.load(my_tf)
+            num = len(tokens)
+            # slide a window of args.window consecutive tokens across the
+            # sequence (num - args.window + 1 windows in total)
+            for i in range(0, num - args.window + 1):
+                window_string = ""
+                if args.plaintext:
+                    for j in range(0, args.window):
+                        window_string += str(tokens[i + j].get("value"))
+                elif args.python:
+                    print("NEED A PYTHON HASHER")
+                elif args.cpp:
+                    print("NEED A C++ HASHER")
+                else:
+                    print("UNKNOWN HASHER")
+                hash_object = hashlib.md5(window_string.encode())
+                hash_object_string = hash_object.hexdigest()
+                # FIXME: this truncation should be adjusted after more full-scale testing
+                hash_object_string_truncated = hash_object_string[0:4]
+                # my_hf.write(hash_object_string+"\n")
+                my_hf.write(hash_object_string_truncated + "\n")
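
To make the windowed hashing concrete, here is a self-contained sketch of the same idea (the toy token values and window of 3 are invented for illustration; the MD5 digest and 4-hex-character truncation mirror hasher() above):

    import hashlib

    tokens = [{"value": v} for v in ["a", "b", "c", "d", "e"]]  # toy data
    window = 3

    # num - window + 1 windows: "abc", "bcd", "cde"
    for i in range(len(tokens) - window + 1):
        joined = "".join(str(tokens[i + j]["value"]) for j in range(window))
        fingerprint = hashlib.md5(joined.encode()).hexdigest()[0:4]
        print(fingerprint)  # one truncated fingerprint per line, as in hashes.txt
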
",tokenized_dir," is not a valid gradeable tokenized directory") + exit(1) + + hashes_dir=os.path.join(course_dir,"lichen","hashes",args.gradeable) + + # =========================================================================== + # walk the subdirectories + for user in os.listdir(tokenized_dir): + for version in os.listdir(os.path.join(tokenized_dir,user)): + my_tokenized_file=os.path.join(tokenized_dir,user,version,"tokens.json") + + # =========================================================================== + # create the directory + my_hashes_dir=os.path.join(hashes_dir,user,version) + if not os.path.isdir(my_hashes_dir): + os.makedirs(my_hashes_dir) + + my_hashes_file=os.path.join(my_hashes_dir,"hashes.txt") + hasher(args,my_tokenized_file,my_hashes_file) + + + print("done") + +if __name__ == "__main__": + main() diff --git a/bin/process_all.sh b/bin/process_all.sh new file mode 100644 index 0000000..230cd10 --- /dev/null +++ b/bin/process_all.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +semester=$1 +course=$2 +gradeable=$3 + +/usr/local/submitty/Lichen/bin/concatenate_all.py $semester $course $gradeable +/usr/local/submitty/Lichen/bin/tokenize_all.py $semester $course $gradeable --plaintext +/usr/local/submitty/Lichen/bin/hash_all.py $semester $course $gradeable --window 5 --plaintext + +/usr/local/submitty/Lichen/bin/compare_hashes.out $semester $course $gradeable + diff --git a/bin/tokenize_all.py b/bin/tokenize_all.py new file mode 100644 index 0000000..3909ebd --- /dev/null +++ b/bin/tokenize_all.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +""" +Tokenizes the concatenated files. +""" + +import argparse +import os +import json +import subprocess +import sys + + +CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config') +with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file: + OPEN_JSON = json.load(open_file) +SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir'] +SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir'] + + +def parse_args(): + parser = argparse.ArgumentParser(description="") + parser.add_argument("semester") + parser.add_argument("course") + parser.add_argument("gradeable") + language = parser.add_mutually_exclusive_group(required=True) + language.add_argument ("--plaintext", action='store_true') + language.add_argument ("--python", action='store_true') + language.add_argument ("--cpp", action='store_true') + return parser.parse_args() + + +def tokenize(args,my_concatenated_file,my_tokenized_file): + + if args.plaintext: + tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","plaintext_tokenizer.out") + with open(my_concatenated_file,'r') as infile: + with open (my_tokenized_file,'w')as outfile: + subprocess.call([tokenizer,"--ignore_newlines"],stdin=infile,stdout=outfile) + elif args.python: + print("NEED A PYTHON TOKENIZER") + elif args.cpp: + print("NEED A C++ TOKENIZER") + else: + print("UNKNOWN TOKENIZER") + +def main(): + args = parse_args() + + sys.stdout.write("TOKENIZE ALL...") + sys.stdout.flush() + + # =========================================================================== + # error checking + course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",args.semester,args.course) + if not os.path.isdir(course_dir): + print("ERROR! ",course_dir," is not a valid course directory") + exit(1) + concatenated_dir=os.path.join(course_dir,"lichen","concatenated",args.gradeable) + if not os.path.isdir(concatenated_dir): + print("ERROR! 
",concatenated_dir," is not a valid gradeable concatenated directory") + exit(1) + + tokenized_dir=os.path.join(course_dir,"lichen","tokenized",args.gradeable) + + # =========================================================================== + # walk the subdirectories + for user in os.listdir(concatenated_dir): + for version in os.listdir(os.path.join(concatenated_dir,user)): + my_concatenated_file=os.path.join(concatenated_dir,user,version,"submission.concatenated") + + # =========================================================================== + # create the directory + my_tokenized_dir=os.path.join(tokenized_dir,user,version) + if not os.path.isdir(my_tokenized_dir): + os.makedirs(my_tokenized_dir) + + my_tokenized_file=os.path.join(my_tokenized_dir,"tokens.json") + tokenize(args,my_concatenated_file,my_tokenized_file) + + print ("done") + + +if __name__ == "__main__": + main() diff --git a/compare_hashes/compare_hashes.cpp b/compare_hashes/compare_hashes.cpp new file mode 100644 index 0000000..03c807e --- /dev/null +++ b/compare_hashes/compare_hashes.cpp @@ -0,0 +1,254 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "boost/filesystem/operations.hpp" +#include "boost/filesystem/path.hpp" + +#include "nlohmann/json.hpp" + +// =================================================================================== +// helper classes + + +// A submission is the concatenated files for one submission version +// for a user. +class Submission { +public: + Submission(std::string u, std::string v) : username(u),version(v) {} + std::string username; + std::string version; +}; + +// to allow sorting +bool operator<(const Submission &a, const Submission &b) { + return a.username < b.username || + (a.username == b.username && a.version < b.version); +} + + +// A sequence is represented by the start location (integer index of +// the token) within in a specific concatenated file (the Submission). +class Sequence { +public: + Sequence(std::string u, std::string v, int p) : submission(u,v),position(p) {} + Submission submission; + int position; +}; + + +// =================================================================================== +// helper typedefs + + +// common sequence hash -> ( each user -> all match locations by that user across all versions ) +typedef std::map > > hashed_sequences; + + + +// =================================================================================== +// helper functions + + +// Orders all Submissions by percentage of tokens in that match tokens +// in a small number of other files (but not most or all). 
diff --git a/compare_hashes/compare_hashes.cpp b/compare_hashes/compare_hashes.cpp
new file mode 100644
index 0000000..03c807e
--- /dev/null
+++ b/compare_hashes/compare_hashes.cpp
@@ -0,0 +1,254 @@
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <string>
+#include <cstdlib>
+#include <cassert>
+#include <map>
+#include <vector>
+#include <algorithm>
+
+#include "boost/filesystem/operations.hpp"
+#include "boost/filesystem/path.hpp"
+
+#include "nlohmann/json.hpp"
+
+// ===================================================================================
+// helper classes
+
+
+// A submission is the concatenated files for one submission version
+// for a user.
+class Submission {
+public:
+  Submission(std::string u, std::string v) : username(u), version(v) {}
+  std::string username;
+  std::string version;
+};
+
+// to allow sorting
+bool operator<(const Submission &a, const Submission &b) {
+  return a.username < b.username ||
+    (a.username == b.username && a.version < b.version);
+}
+
+
+// A sequence is represented by the start location (integer index of
+// the token) within a specific concatenated file (the Submission).
+class Sequence {
+public:
+  Sequence(std::string u, std::string v, int p) : submission(u,v), position(p) {}
+  Submission submission;
+  int position;
+};
+
+
+// ===================================================================================
+// helper typedefs
+
+
+// common sequence hash -> ( each user -> all match locations by that user across all versions )
+typedef std::map<std::string,std::map<std::string,std::vector<Sequence> > > hashed_sequences;
+
+
+
+// ===================================================================================
+// helper functions
+
+
+// Orders all Submissions by the percentage of tokens in that submission that
+// match tokens in a small number of other files (but not most or all).
+bool ranking_sorter(const std::pair<Submission,float> &a, const std::pair<Submission,float> &b) {
+  return
+    a.second > b.second ||
+    (a.second == b.second && a.first.username < b.first.username) ||
+    (a.second == b.second && a.first.username == b.first.username && a.first.version < b.first.version);
+}
+
+
+// ===================================================================================
+// ===================================================================================
+int main(int argc, char* argv[]) {
+
+  std::cout << "COMPARE HASHES...";
+  std::cout.flush();
+
+
+  // ---------------------------------------------------------------------------
+  // deal with command line arguments
+  assert (argc == 4);
+  std::string semester = argv[1];
+  std::string course = argv[2];
+  std::string gradeable = argv[3];
+
+
+  // error checking, confirm there are hashes to work with
+  std::string tmp = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/hashes/"+gradeable;
+  boost::filesystem::path hashes_root_directory = boost::filesystem::system_complete(tmp);
+  if (!boost::filesystem::exists(hashes_root_directory) ||
+      !boost::filesystem::is_directory(hashes_root_directory)) {
+    std::cerr << "ERROR with directory " << hashes_root_directory << std::endl;
+    exit(1);
+  }
+
+
+  // store the total size (# of tokens) in each submission
+  std::map<Submission,int> submission_length;
+
+
+  // ---------------------------------------------------------------------------
+  // loop over all submissions and populate the hash_counts structure
+
+  // the main data structure that looks for matches between submissions
+  hashed_sequences hash_counts;
+
+  // loop over all users
+  boost::filesystem::directory_iterator end_iter;
+  for (boost::filesystem::directory_iterator dir_itr( hashes_root_directory ); dir_itr != end_iter; ++dir_itr) {
+    boost::filesystem::path username_path = dir_itr->path();
+    assert (is_directory(username_path));
+    std::string username = dir_itr->path().filename().string();
+    // loop over all versions
+    for (boost::filesystem::directory_iterator username_itr( username_path ); username_itr != end_iter; ++username_itr) {
+      boost::filesystem::path version_path = username_itr->path();
+      assert (is_directory(version_path));
+      std::string version = username_itr->path().filename().string();
+      // load the hash sequences from this submission
+      boost::filesystem::path hash_file = version_path;
+      hash_file /= "hashes.txt";
+      std::ifstream istr(hash_file.string());
+      std::string hash_string;
+      int count = 0;
+      while (istr >> hash_string) {
+        count++;
+        hash_counts[hash_string][username].push_back(Sequence(username,version,count));
+      }
+      submission_length[Submission(username,version)] = count;
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+
+  // label the parts of the file that are common to many
+  // user,version -> vector<position>
+  std::map<Submission,std::vector<int> > common;
+
+  // label the parts of the file that match the provided code
+  // user,version -> vector<position>   (not yet populated)
+  std::map<Submission,std::vector<int> > provided;
+
+  // document the suspicious parts of this file,
+  // user,version -> ( position -> ( other user,version -> std::vector<Sequence> ) )
+  std::map<Submission,std::map<int,std::map<Submission,std::vector<Sequence> > > > suspicious;
+
+
+  // ---------------------------------------------------------------------------
+  // walk over the structure containing all of the hashes, identifying
+  // hashes common to many/all, provided code, suspicious matches, and unique code
+  for (hashed_sequences::iterator itr = hash_counts.begin(); itr != hash_counts.end(); itr++) {
+    int count = itr->second.size();
+
+    if (count >= 20) {
+      // common to many/all
+      for (std::map<std::string,std::vector<Sequence> >::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++) {
+        for (unsigned int i = 0; i < itr2->second.size(); i++) {
+          common[itr2->second[i].submission].push_back(itr2->second[i].position);
+        }
+      }
+    } else if (count > 1 && count < 20) {
+      // suspicious matches
+      for (std::map<std::string,std::vector<Sequence> >::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++) {
+        std::string username = itr2->first;
+        for (unsigned int i = 0; i < itr2->second.size(); i++) {
+          assert (itr2->second[i].submission.username == username);
+          std::string version = itr2->second[i].submission.version;
+          int position = itr2->second[i].position;
+
+          std::map<Submission,std::vector<Sequence> > matches;
+
+          for (std::map<std::string,std::vector<Sequence> >::iterator itr3 = itr->second.begin(); itr3 != itr->second.end(); itr3++) {
+            std::string match_username = itr3->first;
+            for (unsigned int j = 0; j < itr3->second.size(); j++) {
+              std::string match_version = itr3->second[j].submission.version;
+              Submission ms(match_username,match_version);
+              matches[ms].push_back(itr3->second[j]);
+            }
+          }
+          Submission s(username,version);
+          suspicious[s][position] = matches;
+        }
+      }
+    }
+  }
+
+
+  // ---------------------------------------------------------------------------
+  // prepare a sorted list of all users sorted by match percent
+  std::vector<std::pair<Submission,float> > ranking;
+  for (std::map<Submission,std::map<int,std::map<Submission,std::vector<Sequence> > > >::iterator itr = suspicious.begin();
+       itr != suspicious.end(); itr++) {
+    int total = submission_length[itr->first];
+    int overlap = itr->second.size();
+    float percent = float(overlap)/float(total);
+
+    std::vector<std::map<std::string,std::string> > info;
+
+    std::string username = itr->first.username;
+    std::string version = itr->first.version;
+
+    ranking.push_back(std::make_pair(itr->first,percent));
+
+    // merge the suspicious positions into ranges of consecutive matching tokens
+    int range_start = -1;
+    int range_end = -1;
+    for (std::map<int,std::map<Submission,std::vector<Sequence> > >::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++) {
+      int pos = itr2->first;
+      if (range_start == -1) {
+        range_start = range_end = pos;
+      } else if (range_end+1 == pos) {
+        range_end = pos;
+      } else {
+        // close out the previous range and start a new one at this position
+        std::map<std::string,std::string> info_data;
+        info_data["start"] = std::to_string(range_start);
+        info_data["end"] = std::to_string(range_end);
+        info_data["type"] = std::string("match");
+        info.push_back(info_data);
+        range_start = range_end = pos;
+      }
+    }
+    if (range_start != -1) {
+      std::map<std::string,std::string> info_data;
+      info_data["start"] = std::to_string(range_start);
+      info_data["end"] = std::to_string(range_end);
+      info_data["type"] = std::string("match");
+      info.push_back(info_data);
+    }
+
+    // save the file with matches per user
+    nlohmann::json match_data = info;
+    std::string matches_dir = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/matches/"+gradeable+"/"+username+"/"+version;
+    boost::filesystem::create_directories(matches_dir);
+    std::string matches_file = matches_dir+"/matches.json";
+    std::ofstream ostr(matches_file);
+    assert (ostr.good());
+    ostr << match_data.dump(4) << std::endl;
+  }
+
+  // save the rankings to a file
+  std::string ranking_dir = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/ranking/";
+  std::string ranking_file = ranking_dir+gradeable+".txt";
+  boost::filesystem::create_directories(ranking_dir);
+  std::ofstream ranking_ostr(ranking_file);
+  std::sort(ranking.begin(),ranking.end(),ranking_sorter);
+  for (unsigned int i = 0; i < ranking.size(); i++) {
+    ranking_ostr
+      << std::setw(6) << std::setprecision(2) << std::fixed << 100.0*ranking[i].second << "% "
+      << std::setw(15) << std::left << ranking[i].first.username << " "
+      << std::setw(3) << std::right << ranking[i].first.version << std::endl;
+  }
+
+
+  // ---------------------------------------------------------------------------
+  std::cout << "done" << std::endl;
+  return 0;
+}
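
For downstream consumers, compare_hashes.out writes two artifacts per gradeable: a matches.json for each user/version (an array of {"start", "end", "type"} objects whose token indices are serialized as strings via std::to_string) and a plain-text ranking file. A minimal sketch of reading the matches back (the semester/course/user/version path segments are hypothetical):

    import json

    path = "/var/local/submitty/courses/s18/csci1100/lichen/matches/hw01/alice/3/matches.json"
    with open(path) as f:  # hypothetical concrete path
        matches = json.load(f)

    for m in matches:
        # start/end are inclusive token indices, stored as strings
        print(m["type"], int(m["start"]), int(m["end"]))
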
diff --git a/install.sh b/install.sh
deleted file mode 100755
index c7115df..0000000
--- a/install.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/env bash
-
-src_location="."
-build_location="."
-bin_location="./bin"
-
-nlohmann_dir=${src_location}/GIT_NLOHMANN_JSON/
-
-if [ ! -d "${nlohmann_dir}" ]; then
-    echo 'should install'
-    git clone --depth 1 https://github.com/nlohmann/json.git ${nlohmann_dir}
-fi
-
-
-mkdir -p ${bin_location}
-clang++ -I ${nlohmann_dir}/include/ -std=c++11 -Wall tokenizer/plaintext/plaintext_tokenizer.cpp -o ${bin_location}/plaintext_tokenizer.out
-
-${bin_location}/plaintext_tokenizer.out < tokenizer/plaintext/input.txt > output.json
-${bin_location}/plaintext_tokenizer.out --ignore_newlines < tokenizer/plaintext/input.txt > output_ignore_newlines.json
-${bin_location}/plaintext_tokenizer.out --to_lower < tokenizer/plaintext/input.txt > output_to_lower.json
-${bin_location}/plaintext_tokenizer.out --ignore_punctuation < tokenizer/plaintext/input.txt > output_ignore_punctuation.json
-${bin_location}/plaintext_tokenizer.out --ignore_punctuation --ignore_numbers --ignore_newlines --to_lower < tokenizer/plaintext/input.txt > output_ignore_everything.json
-
-
-
diff --git a/install_lichen.sh b/install_lichen.sh
new file mode 100755
index 0000000..43d9890
--- /dev/null
+++ b/install_lichen.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+
+########################################################################################################################
+########################################################################################################################
+# this script must be run by root or sudo
+if [[ "$UID" -ne "0" ]] ; then
+    echo "ERROR: This script must be run by root or sudo"
+    exit 1
+fi
+
+echo -e "Installing lichen... "
+
+lichen_repository_dir=/usr/local/submitty/GIT_CHECKOUT/Lichen/
+lichen_installation_dir=/usr/local/submitty/Lichen/
+
+nlohmann_dir=${lichen_repository_dir}/../vendor/nlohmann/json/
+
+
+########################################################################################################################
+# get tools/source code from other repositories
+
+if [ ! -e "${nlohmann_dir}" ]; then
+    echo "Check out the vendor nlohmann/json repository"
+    mkdir -p ${nlohmann_dir}
+    git clone --depth 1 https://github.com/nlohmann/json.git ${nlohmann_dir}
+fi
+
+
+########################################################################################################################
+# compile & install the tokenizers
+
+mkdir -p ${lichen_installation_dir}/bin
+
+pushd ${lichen_repository_dir} > /dev/null
+clang++ -I ${nlohmann_dir}/include/ -std=c++11 -Wall tokenizer/plaintext/plaintext_tokenizer.cpp -o ${lichen_installation_dir}/bin/plaintext_tokenizer.out
+if [ $? -ne 0 ]; then
+    echo -e "ERROR: FAILED TO BUILD PLAINTEXT TOKENIZER\n"
+    exit 1
+fi
+popd > /dev/null
+
+
+# compile & install the hash comparison tool
+pushd ${lichen_repository_dir} > /dev/null
+clang++ -I ${nlohmann_dir}/include/ -std=c++11 -Wall -g compare_hashes/compare_hashes.cpp -o ${lichen_installation_dir}/bin/compare_hashes.out -lboost_system -lboost_filesystem
+if [ $? -ne 0 ]; then
+    echo -e "ERROR: FAILED TO BUILD HASH COMPARISON TOOL\n"
+    exit 1
+fi
+popd > /dev/null
+
+
+########################################################################################################################
+
+cp ${lichen_repository_dir}/bin/* ${lichen_installation_dir}/bin/
+
+
+########################################################################################################################
+# fix permissions
+chown -R root:root ${lichen_installation_dir}
+chmod 755 ${lichen_installation_dir}
+chmod 755 ${lichen_installation_dir}/bin
+chmod 755 ${lichen_installation_dir}/bin/*
+
+echo "done"
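
After installation, a quick smoke test can confirm that the expected binaries and scripts landed in the installation directory with execute permission. This is an optional sketch, not part of the install script itself:

    import os

    BIN = "/usr/local/submitty/Lichen/bin"
    for name in ("plaintext_tokenizer.out", "compare_hashes.out",
                 "concatenate_all.py", "tokenize_all.py",
                 "hash_all.py", "process_all.sh"):
        path = os.path.join(BIN, name)
        print(path, "ok" if os.access(path, os.X_OK) else "MISSING or not executable")
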