From 6f39cd7601d81927807b84f434f79f9ae78ac3b9 Mon Sep 17 00:00:00 2001 From: William Allen <16820599+williamjallen@users.noreply.github.com> Date: Mon, 28 Feb 2022 15:15:02 -0500 Subject: [PATCH] [Refactor:Plagiarism] Add dedicated ranking step (#79) * Only print warning once * add number of times it was truncated * Add progress bars for most of pipeline * Add compare_hashes progress bar * Finish the Python portion * Remove unnecessary code from compare_hashes * lint * Fix off-by-1 There was a very minor off-by-1 in the original codebase which necessitated the updates to the tests --- bin/process_all.sh | 7 +- bin/similarity_ranking.py | 180 ++++++++++++++++++ compare_hashes/compare_hashes.cpp | 133 +------------ compare_hashes/score.h | 64 ------- compare_hashes/submission.cpp | 4 - compare_hashes/submission.h | 2 - .../expected_output/overall_ranking.txt | 4 +- .../users/aphacker/1/ranking.txt | 2 +- .../users/aphacker/2/ranking.txt | 2 +- .../users/bitdiddle/1/ranking.txt | 4 +- .../expected_output/overall_ranking.txt | 4 +- .../users/aphacker/1/ranking.txt | 2 +- .../users/bitdiddle/1/ranking.txt | 2 +- 13 files changed, 198 insertions(+), 212 deletions(-) create mode 100644 bin/similarity_ranking.py delete mode 100644 compare_hashes/score.h diff --git a/bin/process_all.sh b/bin/process_all.sh index 84a23c4..4bd48de 100644 --- a/bin/process_all.sh +++ b/bin/process_all.sh @@ -86,9 +86,10 @@ mkdir -p "${BASEPATH}/users" ############################################################################ # Run Lichen { # We still want to unzip files if an error occurs when running Lichen here - ./tokenize_all.py "$tmp_location" && - ./hash_all.py "$tmp_location" && - ./compare_hashes.out "$tmp_location" || echo "${KILL_ERROR_MESSAGE}"; + ./tokenize_all.py "$tmp_location" && + ./hash_all.py "$tmp_location" && + ./compare_hashes.out "$tmp_location" || echo "${KILL_ERROR_MESSAGE}" && + ./similarity_ranking.py "$tmp_location"; } 
############################################################################ diff --git a/bin/similarity_ranking.py b/bin/similarity_ranking.py new file mode 100644 index 0000000..c1fe55c --- /dev/null +++ b/bin/similarity_ranking.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +""" +Ranks the submissions in order of plagiarism likelihood +""" + +import argparse +import os +import json +import humanize +import datetime +from pathlib import Path + + +# This is a helper class which is used to store, and ultimately sort, data about submissions +class Submission: + def __init__(self, user_id, version): + self.user_id = user_id + self.version = version + + # The percent of this submission which matches other submissions + self.percent_match = 0 + + # The absolute number of hashes matched + self.total_hashes_matched = 0 + + # The highest number of matches between this user and any other single submission + self.highest_match_count = 0 + + # We use this for sorting submissions later on. Future adjustments to the + # ranking algorithm should modify this function. 
+ def __lt__(self, other): + return self.highest_match_count < other.highest_match_count + + +class Match: + def __init__(self, user_id, version, source_gradeable): + self.user_id = user_id + self.version = version + self.source_gradeable = source_gradeable + + # The number of hashes this match shares with a Submission + self.matching_hash_count = 0 + + +def parse_args(): + parser = argparse.ArgumentParser(description='') + parser.add_argument('basepath') + return parser.parse_args() + + +# get_submission_stats is passed a user, version, a path to a matches.json, a +# path to a hashes.txt file, and the hash size and returns a pair of a Submission() +# object containing a number of statistics about the specified submission, and a +# list of Match objects which match this submission +def get_submission_stats(user_id, version, matches_file, hashes_file, hash_size): + submission = Submission(user_id, version) + + # Determine how many hashes there are in this submission + with open(hashes_file, 'r') as file: + token_count = len([0 for _ in file]) + hash_size + + # If this is a blank/empty submission, return now + if token_count <= 1: + return submission, [] + + # It is possible that there are no matches and thus a matches.json file isn't + # created. If this is the case, we can simply return now.
+ if not os.path.isfile(matches_file): + return submission, [] + + with open(matches_file, 'r') as file: + matches_json = json.load(file) + + # Calculate the total number of hashes matched, as well as the number of + # hashes matched for every other submission with matches + matching_submissions = dict() + prev_end = 0 + for match in matches_json: + # Common and provided code doesn't have an others list (due to size constraints) + if match['type'] != 'match': + continue + + for other in match['others']: + other_submission = f"{other['username']}_{other['version']}_{other['source_gradeable']}" # noqa: E501 + if other_submission not in matching_submissions.keys(): + matching_submissions[other_submission] = Match(other['username'], + other['version'], + other['source_gradeable']) + matching_submissions[other_submission].matching_hash_count += \ + match['end'] - max(prev_end, match['start'] - 1) + submission.total_hashes_matched += match['end'] - max(prev_end, match['start'] - 1) + prev_end = match['end'] + + # Actually stored as the fraction of the submission which matches + submission.percent_match = submission.total_hashes_matched / token_count + + if len(matching_submissions.values()) > 0: + matching_submissions = list(matching_submissions.values()) + + matching_submissions.sort(key=lambda x: x.matching_hash_count, reverse=True) + submission.highest_match_count = matching_submissions[0].matching_hash_count + else: + matching_submissions = [] + + return submission, matching_submissions + + +def main(): + start_time = datetime.datetime.now() + args = parse_args() + + print("SIMILARITY RANKING:", flush=True) + print("[0% 25% 50% 75% 100%]\n[", end="", flush=True) # noqa: E501 + + with open(Path(args.basepath, "config.json")) as lichen_config_file: + lichen_config = json.load(lichen_config_file) + + users_dir = Path(args.basepath, 'users') + if not os.path.isdir(users_dir): + raise SystemExit('ERROR!
 Unable to find users directory') + + # We'll make a rough estimate of the percentage of ranking output done by + # taking the percentage of users which have been done thus far + total_users = len(os.listdir(users_dir)) + users_ranking_output = 0 + percent_progress = 0 + + all_submissions = list() + + for user in sorted(os.listdir(users_dir)): + user_dir = Path(users_dir, user) + if not os.path.isdir(user_dir): + continue + + for version in sorted(os.listdir(user_dir)): + version_dir = Path(user_dir, version) + if not os.path.isdir(version_dir): + continue + + matches_file = Path(version_dir, 'matches.json') + hashes_file = Path(version_dir, 'hashes.txt') + + submission, matching_submissions = get_submission_stats(user, + version, + matches_file, + hashes_file, + lichen_config['hash_size']) + all_submissions.append(submission) + + # Write the ranking.txt for this submission + with open(Path(version_dir, 'ranking.txt'), 'w') as ranking_file: + # matching_submissions is already sorted by the absolute number of hashes matched + for match in matching_submissions: + ranking_file.write(f"{match.user_id:10} {match.version:3} " + f"{match.source_gradeable} {match.matching_hash_count:>8}\n") + + users_ranking_output += 1 + if int((users_ranking_output / total_users) * 100) > percent_progress: + new_percent_progress = int((users_ranking_output / total_users) * 100) + print("|" * (new_percent_progress - percent_progress), end="", flush=True) + percent_progress = new_percent_progress + + all_submissions.sort(reverse=True) + + # A set of all the users we've written lines for thus far (duplicates aren't allowed) + users_written = set() + with open(Path(args.basepath, 'overall_ranking.txt'), 'w') as ranking_file: + for s in all_submissions: + if s.user_id in users_written: + continue + ranking_file.write(f"{s.user_id:10} {s.version:3} " + f"{s.percent_match:4.0%} {s.total_hashes_matched:>8}\n") + users_written.add(s.user_id) + + # 
========================================================================== + print("]\nSimilarity ranking done in", humanize.precisedelta(start_time, format="%1.f")) + + +if __name__ == "__main__": + main() diff --git a/compare_hashes/compare_hashes.cpp b/compare_hashes/compare_hashes.cpp index c4a2726..bb0f3a5 100644 --- a/compare_hashes/compare_hashes.cpp +++ b/compare_hashes/compare_hashes.cpp @@ -17,7 +17,6 @@ #include "lichen_config.h" #include "submission.h" #include "hash_location.h" -#include "score.h" // ============================================================================= @@ -29,20 +28,6 @@ typedef std::string user_id; typedef unsigned int version_number; -// ============================================================================= -// helper classes - - -// represents an element in a ranking of students by percent match -struct StudentRanking { - StudentRanking(const user_id &id, int v, const std::string &sg, const Score &s) : student(id), version(v), source_gradeable(sg), score(s) {} - user_id student; - version_number version; - std::string source_gradeable; - Score score; -}; - - // ============================================================================= // helper functions @@ -89,12 +74,6 @@ void incrementEndPositionsForMatches(nlohmann::json &others) { } -bool ranking_sorter(const StudentRanking &a, const StudentRanking &b) { - return a.score > b.score || - (a.score == b.score && a.student < b.student); -} - - // ============================================================================= // MAIN @@ -157,10 +136,6 @@ int main(int argc, char* argv[]) { std::unordered_set provided_code; // stores all hashes from other gradeables std::unordered_map>> other_gradeables; - // stores the matches for every student, used later for generating overall_rankings.txt - std::unordered_map>> highest_matches; - // keeps track of max matching hashes across all submissions, used for calculation of ranking score - unsigned int max_hashes_matched = 0; 
// a map of "user_id:version" strings to the non-zero number of times their matching positions array was truncated std::unordered_map matching_positions_truncations; @@ -323,7 +298,7 @@ int main(int argc, char* argv[]) { // Note: we DO look for matches across submissions of the same student for self-plagiarism - // save the locations of all other occurences from proir term submissions + // save the locations of all other occurences from prior term submissions std::vector::iterator itr = other_occurences_itr->second.begin(); for (; itr != other_occurences_itr->second.end(); ++itr) { (*submission_itr)->addSuspiciousMatch(hash_itr->second, *itr, hash_itr->first); @@ -515,80 +490,14 @@ int main(int argc, char* argv[]) { assert(ostr.good()); ostr << match_data.dump(4) << std::endl; - // ========================================================================= - // create individual ranking file - // the file contains all the other students share matches, sorted by decreasing order of the percent match - - // find and sort the other submissions it matches with - std::vector student_ranking; - std::unordered_map>>> matches = (*submission_itr)->getStudentsMatched(); - - std::unordered_map>>>::const_iterator gradeables_itr = matches.begin(); - for (; gradeables_itr != matches.end(); ++gradeables_itr) { - for (std::unordered_map>>::const_iterator matches_itr = gradeables_itr->second.begin(); - matches_itr != gradeables_itr->second.end(); ++matches_itr) { - - for (std::unordered_map>::const_iterator version_itr = matches_itr->second.begin(); - version_itr != matches_itr->second.end(); ++version_itr) { - - // Calculate the Percent Match: - // count the number of unique hashes for the percent match calculation - std::vector> submission_hashes = (*submission_itr)->getHashes(); - std::unordered_set unique_hashes; - for (std::vector>::const_iterator itr = submission_hashes.begin(); - itr != submission_hashes.end(); ++itr) { - unique_hashes.insert(itr->first); - } - - // the 
percent match is currently calculated using the number of hashes that match between this - // submission and the other submission, over the total number of hashes this submission has. - // In other words, the percentage is how much of this submission's code was plgairised from the other. - unsigned int num_hashes_matched = version_itr->second.size(); - float percent = (100.0 * num_hashes_matched) / unique_hashes.size(); - student_ranking.push_back(StudentRanking(matches_itr->first, version_itr->first, gradeables_itr->first, Score(num_hashes_matched, percent))); - student_ranking.back().score.calculateScore(num_hashes_matched); - } - } - } - - // ========================================================================= - // Save this submission's highest percent match for later when we generate overall_rankings.txt - float percentMatch = (*submission_itr)->getPercentage(); - unsigned int totalMatchingHashes = (*submission_itr)->getMatchCount(); - Score submission_score(totalMatchingHashes, percentMatch); - if (max_hashes_matched < totalMatchingHashes) { - max_hashes_matched = totalMatchingHashes; - } - - std::pair new_pair = {(*submission_itr)->version(), submission_score}; - highest_matches[(*submission_itr)->student()].push_back(new_pair); - // ========================================================================= - - std::sort(student_ranking.begin(), student_ranking.end(), ranking_sorter); - - // create the directory and a file to write into - boost::filesystem::path ranking_student_dir = users_root_directory / (*submission_itr)->student() / std::to_string((*submission_itr)->version()); - boost::filesystem::path ranking_student_file = ranking_student_dir / "ranking.txt"; - boost::filesystem::create_directories(ranking_student_dir); - std::ofstream ranking_student_ostr(ranking_student_file.string()); - - // finally, write the file of ranking for this submission - for (unsigned int i = 0; i < student_ranking.size(); i++) { - ranking_student_ostr - << 
std::setw(15) << std::left << student_ranking[i].student << " " - << std::setw(3) << std::left << student_ranking[i].version << " " - << std::setw(1) << std::right << student_ranking[i].source_gradeable << " " - << std::setw(6) << std::setprecision(2) << std::fixed << student_ranking[i].score.getPercent() << "%" << std::endl; - } - // ========================================================================= // Cleanup - // Done with this submissions. discard the data and clear the memory + // Done with this submission. discard the data and clear the memory delete (*submission_itr); (*submission_itr) = nullptr; - // print current progress + // Print current progress my_counter++; if (int((my_counter / float(all_submissions.size())) * 100) > my_percent) { int new_my_percent = int((my_counter / float(all_submissions.size())) * 100); @@ -607,7 +516,7 @@ int main(int argc, char* argv[]) { time(&end); diff = difftime(end, start); - std::cout << "]" << std::endl << "Finished processing submissions in " << diff << " seconds" << std::endl; + std::cout << "]" << std::endl; // Print out the list of users who had their matching positions array truncated if (matching_positions_truncations.size() > 0) { @@ -618,40 +527,6 @@ int main(int argc, char* argv[]) { } std::cout << std::endl << " - Try increasing the hash size or adding a regex to fix this problem." 
<< std::endl; } - fflush(stdout); - - // =========================================================================== - // Create a general summary of rankings of users by percentage match - - // create a single file of students ranked by highest percentage of code plagiarised - boost::filesystem::path ranking_file = lichen_gradeable_path / "overall_ranking.txt"; - std::ofstream ranking_ostr(ranking_file.string()); - - // take the map of highest matches and convert it to a vector so we can sort it - // by percent match and then save it to a file - std::vector ranking; - for (std::unordered_map>>::iterator itr - = highest_matches.begin(); itr != highest_matches.end(); ++itr) { - - std::pair best_score = itr->second.front(); - best_score.second.calculateScore(max_hashes_matched); - for (unsigned int i=0; i < itr->second.size(); i++) { - itr->second[i].second.calculateScore(max_hashes_matched); - if (itr->second[i].second > best_score.second) { - best_score = itr->second[i]; - } - } - ranking.push_back(StudentRanking(itr->first, best_score.first, "", best_score.second)); - } - - std::sort(ranking.begin(), ranking.end(), ranking_sorter); - for (unsigned int i = 0; i < ranking.size(); i++) { - ranking_ostr - << std::left << std::setw(20) << ranking[i].student << " " - << std::setw(3) << ranking[i].version << " " - << std::right << std::setw(4) << std::setprecision(1) << std::fixed << ranking[i].score.getPercent() << "% " - << std::setw(5) << ranking[i].score.getHashesMatched() << std::endl; - } // =========================================================================== // Done! 
diff --git a/compare_hashes/score.h b/compare_hashes/score.h deleted file mode 100644 index af085ed..0000000 --- a/compare_hashes/score.h +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef SCORE_H -#define SCORE_H - -#include -#include - -typedef int location_in_submission; -typedef unsigned int hash; -typedef std::string user_id; -typedef unsigned int version_number; - -// represents the plagiarism score for a given submissions, used for the overall rankings file -class Score { -public: - // CONSTRUCTOR - Score(unsigned int hashes_matched, float percent): hashes_matched(hashes_matched), percent(percent), score(-1) {} - Score(const Score &other) { copy(other); } - - // GETTERS - float getPercent() const { return percent; } - unsigned int getHashesMatched() const { return hashes_matched; } - - // MODIFIER - // Each submission in the ranking file gets a composite score that weighs both its percentage - // of suspicious matches, and its percentile of total number of hashes matched - void calculateScore(unsigned int max_hashes_matched) { - score = PERCENT_WEIGHT*(percent/100.0) + MATCH_WEIGHT*(static_cast(hashes_matched)/max_hashes_matched); - } - - // OPERATORS - bool operator>(const Score &other_s) const { - constexpr float EPSILON = 0.0001; - return std::abs(getScore() - other_s.getScore()) > EPSILON && getScore() > other_s.getScore(); - } - bool operator==(const Score &other_s) const { - return getScore() == other_s.getScore(); - } - Score& operator=(const Score& other) { - if (this != &other) { - copy(other); - } - return *this; - } - - -private: - static constexpr float PERCENT_WEIGHT = 0.5; - static constexpr float MATCH_WEIGHT = 0.5; - // just a sanity check to make sure these values are appropriately updated in the future - static_assert(PERCENT_WEIGHT + MATCH_WEIGHT == 1, "Weights must add to 1"); - - unsigned int hashes_matched; - float percent; - float score; - - void copy(const Score &other) { - hashes_matched = other.hashes_matched; - percent = other.percent; - 
score = other.score; - } - float getScore() const { assert(score >= 0 && score <= 1); return score; } -}; - -#endif diff --git a/compare_hashes/submission.cpp b/compare_hashes/submission.cpp index b8dd4cc..17dd8f3 100644 --- a/compare_hashes/submission.cpp +++ b/compare_hashes/submission.cpp @@ -10,10 +10,6 @@ typedef unsigned int hash; typedef std::string user_id; typedef unsigned int version_number; -float Submission::getPercentage() const { - return (100.0 * (suspicious_matches.size())) / hashes.size(); -} - void Submission::addSuspiciousMatch(location_in_submission location, const HashLocation &matching_location, const hash &matched_hash) { // figure out if there is an overlap between this hash and a common/provided match int hash_size = config_.hash_size; diff --git a/compare_hashes/submission.h b/compare_hashes/submission.h index a8d0a20..b438770 100644 --- a/compare_hashes/submission.h +++ b/compare_hashes/submission.h @@ -32,8 +32,6 @@ class Submission { const std::set& getProvidedMatches() const { return provided_matches; } const std::unordered_map>>>& getStudentsMatched() const { return students_matched; } const std::vector> & getHashes() const { return hashes; } - unsigned int getMatchCount() const { return suspicious_matches.size(); } - float getPercentage() const; // MODIFIERS void addHash(const hash &h, location_in_submission l) { hashes.push_back(std::make_pair(h, l)); } diff --git a/tests/data/test_lichen/multiple_versions/expected_output/overall_ranking.txt b/tests/data/test_lichen/multiple_versions/expected_output/overall_ranking.txt index 76bf21e..a0ca84a 100644 --- a/tests/data/test_lichen/multiple_versions/expected_output/overall_ranking.txt +++ b/tests/data/test_lichen/multiple_versions/expected_output/overall_ranking.txt @@ -1,2 +1,2 @@ -aphacker 2 81.4% 35 -bitdiddle 1 81.4% 35 +aphacker 2 94% 44 +bitdiddle 1 94% 44 diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/ranking.txt 
b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/ranking.txt index f2807a3..ad11c6d 100644 --- a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/ranking.txt +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/ranking.txt @@ -1 +1 @@ -bitdiddle 1 f21__plagiarism__multiple_versions 32.79% +bitdiddle 1 f21__plagiarism__multiple_versions 32 diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/ranking.txt b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/ranking.txt index ceaae6b..41f5b25 100644 --- a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/ranking.txt +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/ranking.txt @@ -1 +1 @@ -bitdiddle 1 f21__plagiarism__multiple_versions 80.95% +bitdiddle 1 f21__plagiarism__multiple_versions 44 diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/ranking.txt b/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/ranking.txt index 8f2a405..0625c58 100644 --- a/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/ranking.txt +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/ranking.txt @@ -1,2 +1,2 @@ -aphacker 2 f21__plagiarism__multiple_versions 80.95% -aphacker 1 f21__plagiarism__multiple_versions 47.62% +aphacker 2 f21__plagiarism__multiple_versions 44 +aphacker 1 f21__plagiarism__multiple_versions 30 diff --git a/tests/data/test_lichen/repeated_sequences/expected_output/overall_ranking.txt b/tests/data/test_lichen/repeated_sequences/expected_output/overall_ranking.txt index 6b140dc..7b3598f 100644 --- a/tests/data/test_lichen/repeated_sequences/expected_output/overall_ranking.txt +++ b/tests/data/test_lichen/repeated_sequences/expected_output/overall_ranking.txt @@ -1,2 +1,2 @@ -bitdiddle 1 25.7% 27 -aphacker 1 19.5% 15 
+bitdiddle 1 42% 46 +aphacker 1 33% 27 diff --git a/tests/data/test_lichen/repeated_sequences/expected_output/users/aphacker/1/ranking.txt b/tests/data/test_lichen/repeated_sequences/expected_output/users/aphacker/1/ranking.txt index af37be3..0ecab55 100644 --- a/tests/data/test_lichen/repeated_sequences/expected_output/users/aphacker/1/ranking.txt +++ b/tests/data/test_lichen/repeated_sequences/expected_output/users/aphacker/1/ranking.txt @@ -1 +1 @@ -bitdiddle 1 f21__plagiarism__repeated_sequences 18.42% +bitdiddle 1 f21__plagiarism__repeated_sequences 27 diff --git a/tests/data/test_lichen/repeated_sequences/expected_output/users/bitdiddle/1/ranking.txt b/tests/data/test_lichen/repeated_sequences/expected_output/users/bitdiddle/1/ranking.txt index 7ecab70..6312744 100644 --- a/tests/data/test_lichen/repeated_sequences/expected_output/users/bitdiddle/1/ranking.txt +++ b/tests/data/test_lichen/repeated_sequences/expected_output/users/bitdiddle/1/ranking.txt @@ -1 +1 @@ -aphacker 1 f21__plagiarism__repeated_sequences 15.22% +aphacker 1 f21__plagiarism__repeated_sequences 46