Skip to content

Commit

Permalink
[Refactor:Plagiarism] Add dedicated ranking step (#79)
Browse files Browse the repository at this point in the history
* Only print warning once

* add number of times it was truncated

* Add progress bars for most of pipeline

* Add compare_hashes progress bar

* Finish the Python portion

* Remove unnecessary code from compare_hashes

* lint

* Fix off-by-1

There was a very minor off-by-1 in the original codebase which necessitated the updates to the tests
  • Loading branch information
williamjallen authored Feb 28, 2022
1 parent 47a4e2d commit 6f39cd7
Show file tree
Hide file tree
Showing 13 changed files with 198 additions and 212 deletions.
7 changes: 4 additions & 3 deletions bin/process_all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,10 @@ mkdir -p "${BASEPATH}/users"
############################################################################
# Run Lichen
{ # We still want to unzip files if an error occurs when running Lichen here
./tokenize_all.py "$tmp_location" &&
./hash_all.py "$tmp_location" &&
./compare_hashes.out "$tmp_location" || echo "${KILL_ERROR_MESSAGE}";
./tokenize_all.py "$tmp_location" &&
./hash_all.py "$tmp_location" &&
# Group compare+ranking so the ranking step is skipped on failure; without the
# braces, `|| echo` succeeds and the `&&` after it would run the ranking anyway.
{ ./compare_hashes.out "$tmp_location" && ./similarity_ranking.py "$tmp_location"; } || echo "${KILL_ERROR_MESSAGE}";
}

############################################################################
Expand Down
180 changes: 180 additions & 0 deletions bin/similarity_ranking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
#!/usr/bin/env python3
"""
Ranks the submissions in order of plagiarism likelihood
"""

import argparse
import os
import json
import humanize
import datetime
from pathlib import Path


# Helper class holding per-submission plagiarism statistics; instances are
# sorted later to produce the overall ranking.
class Submission:
    def __init__(self, user_id, version):
        self.user_id, self.version = user_id, version

        # Fraction of this submission's hashes shared with other submissions
        self.percent_match = 0

        # Total count of hashes matched across all other submissions
        self.total_hashes_matched = 0

        # Largest number of hashes shared with any single other submission
        self.highest_match_count = 0

    def __lt__(self, other):
        # Sole sort key for the ranking.  Adjustments to the ranking
        # algorithm should be made here.
        return self.highest_match_count < other.highest_match_count


class Match:
    """One other submission that shares hashes with a given Submission."""

    def __init__(self, user_id, version, source_gradeable):
        self.user_id, self.version = user_id, version
        self.source_gradeable = source_gradeable

        # How many hashes this match shares with the owning Submission
        self.matching_hash_count = 0


def parse_args():
    """Parse command-line arguments: the single positional gradeable basepath."""
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('basepath')
    return arg_parser.parse_args()


# get_submission_stats is passed a user, version, a path to a matches.json, a
# path to a hashes.txt file, and the hash size and returns a pair of a Submission()
# object containing a number of statistics about the specified submission, and a
# list of Match objects which match this submission
def get_submission_stats(user_id, version, matches_file, hashes_file, hash_size):
    submission = Submission(user_id, version)

    # Determine how many hashes there are in this submission (one per line,
    # plus hash_size to account for the tokens covered by the final window)
    with open(hashes_file, 'r') as file:
        token_count = sum(1 for _ in file) + hash_size

    # If this is a blank/empty submission, return now
    if token_count <= 1:
        return submission, []

    # It is possible that there are no matches and thus a matches.json file isn't
    # created. If this is the case, we can simply return now.
    if not os.path.isfile(matches_file):
        return submission, []

    with open(matches_file, 'r') as file:
        matches_json = json.load(file)

    # Calculate the total number of hashes matched, as well as the number of
    # hashes matched for every other submission with matches
    matching_submissions = dict()
    prev_end = 0
    for match in matches_json:
        # Common and provided code doesn't have an others list (due to size constraints)
        if match['type'] != 'match':
            continue

        # Number of not-yet-counted hash positions this region contributes;
        # max(prev_end, ...) avoids double-counting overlapping regions
        new_hashes = match['end'] - max(prev_end, match['start'] - 1)

        for other in match['others']:
            other_submission = f"{other['username']}_{other['version']}_{other['source_gradeable']}"  # noqa: E501
            if other_submission not in matching_submissions:
                matching_submissions[other_submission] = Match(other['username'],
                                                               other['version'],
                                                               other['source_gradeable'])
            matching_submissions[other_submission].matching_hash_count += new_hashes
        submission.total_hashes_matched += new_hashes
        prev_end = match['end']

    # Actually stored as the fraction of the submission which matches
    submission.percent_match = submission.total_hashes_matched / token_count

    # Sort matches by decreasing number of shared hashes (sorting an empty
    # list is a no-op, so no special case is needed)
    matching_submissions = list(matching_submissions.values())
    matching_submissions.sort(key=lambda x: x.matching_hash_count, reverse=True)
    if matching_submissions:
        submission.highest_match_count = matching_submissions[0].matching_hash_count

    return submission, matching_submissions


def main():
    """Rank all submissions by plagiarism likelihood.

    Writes a per-submission ranking.txt next to each matches.json and a
    single overall_ranking.txt under the gradeable basepath, printing a
    rough progress bar along the way.
    """
    start_time = datetime.datetime.now()
    args = parse_args()

    print("SIMILARITY RANKING:", flush=True)
    print("[0% 25% 50% 75% 100%]\n[", end="", flush=True)  # noqa: E501

    with open(Path(args.basepath, "config.json")) as lichen_config_file:
        lichen_config = json.load(lichen_config_file)

    users_dir = Path(args.basepath, 'users')
    if not os.path.isdir(users_dir):
        raise SystemExit('ERROR! Unable to find users directory')

    # We'll make a rough estimate of the percentage of ranking output done by
    # taking the percentage of users which have been done thus far
    total_users = len(os.listdir(users_dir))
    users_ranking_output = 0
    percent_progress = 0

    all_submissions = list()

    for user in sorted(os.listdir(users_dir)):
        user_dir = Path(users_dir, user)
        if not os.path.isdir(user_dir):
            continue

        for version in sorted(os.listdir(user_dir)):
            version_dir = Path(user_dir, version)
            if not os.path.isdir(version_dir):
                continue

            matches_file = Path(version_dir, 'matches.json')
            hashes_file = Path(version_dir, 'hashes.txt')

            submission, matching_submissions = get_submission_stats(user,
                                                                    version,
                                                                    matches_file,
                                                                    hashes_file,
                                                                    lichen_config['hash_size'])
            all_submissions.append(submission)

            # Write the ranking.txt for this submission
            with open(Path(version_dir, 'ranking.txt'), 'w') as ranking_file:
                # matching_submissions is already sorted by the absolute number of hashes matched
                for match in matching_submissions:
                    ranking_file.write(f"{match.user_id:10} {match.version:3} "
                                       f"{match.source_gradeable} {match.matching_hash_count:>8}\n")

        users_ranking_output += 1
        if int((users_ranking_output / total_users) * 100) > percent_progress:
            new_percent_progress = int((users_ranking_output / total_users) * 100)
            print("|" * (new_percent_progress - percent_progress), end="", flush=True)
            percent_progress = new_percent_progress

    # Sorts by Submission.__lt__ (highest_match_count), best first
    all_submissions.sort(reverse=True)

    # A set of all the users we've written lines for thus far (duplicates aren't allowed).
    # BUG FIX: this was set('foo'), which seeds the set with {'f', 'o', 'o'} and
    # would silently drop any user whose id is literally "f" or "o".
    users_written = set()
    with open(Path(args.basepath, 'overall_ranking.txt'), 'w') as ranking_file:
        for s in all_submissions:
            # Only the best version per user is listed
            if s.user_id in users_written:
                continue
            ranking_file.write(f"{s.user_id:10} {s.version:3} "
                               f"{s.percent_match:4.0%} {s.total_hashes_matched:>8}\n")
            users_written.add(s.user_id)

    # ==========================================================================
    print("]\nSimilarity ranking done in", humanize.precisedelta(start_time, format="%1.f"))


# Run the ranking only when executed as a script (not on import)
if __name__ == "__main__":
    main()
133 changes: 4 additions & 129 deletions compare_hashes/compare_hashes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#include "lichen_config.h"
#include "submission.h"
#include "hash_location.h"
#include "score.h"


// =============================================================================
Expand All @@ -29,20 +28,6 @@ typedef std::string user_id;
typedef unsigned int version_number;


// =============================================================================
// helper classes


// represents an element in a ranking of students by percent match
struct StudentRanking {
StudentRanking(const user_id &id, int v, const std::string &sg, const Score &s) : student(id), version(v), source_gradeable(sg), score(s) {}
user_id student;
version_number version;
std::string source_gradeable;
Score score;
};


// =============================================================================
// helper functions

Expand Down Expand Up @@ -89,12 +74,6 @@ void incrementEndPositionsForMatches(nlohmann::json &others) {
}


bool ranking_sorter(const StudentRanking &a, const StudentRanking &b) {
return a.score > b.score ||
(a.score == b.score && a.student < b.student);
}


// =============================================================================
// MAIN

Expand Down Expand Up @@ -157,10 +136,6 @@ int main(int argc, char* argv[]) {
std::unordered_set<hash> provided_code;
// stores all hashes from other gradeables
std::unordered_map<hash, std::unordered_map<user_id, std::vector<HashLocation>>> other_gradeables;
// stores the matches for every student, used later for generating overall_rankings.txt
std::unordered_map<user_id, std::vector<std::pair<version_number, Score>>> highest_matches;
// keeps track of max matching hashes across all submissions, used for calculation of ranking score
unsigned int max_hashes_matched = 0;
// a map of "user_id:version" strings to the non-zero number of times their matching positions array was truncated
std::unordered_map<std::string, int> matching_positions_truncations;

Expand Down Expand Up @@ -323,7 +298,7 @@ int main(int argc, char* argv[]) {

// Note: we DO look for matches across submissions of the same student for self-plagiarism

// save the locations of all other occurences from proir term submissions
// save the locations of all other occurences from prior term submissions
std::vector<HashLocation>::iterator itr = other_occurences_itr->second.begin();
for (; itr != other_occurences_itr->second.end(); ++itr) {
(*submission_itr)->addSuspiciousMatch(hash_itr->second, *itr, hash_itr->first);
Expand Down Expand Up @@ -515,80 +490,14 @@ int main(int argc, char* argv[]) {
assert(ostr.good());
ostr << match_data.dump(4) << std::endl;

// =========================================================================
// create individual ranking file
// the file contains all the other students share matches, sorted by decreasing order of the percent match

// find and sort the other submissions it matches with
std::vector<StudentRanking> student_ranking;
std::unordered_map<std::string, std::unordered_map<user_id, std::unordered_map<version_number, std::unordered_set<hash>>>> matches = (*submission_itr)->getStudentsMatched();

std::unordered_map<std::string, std::unordered_map<user_id, std::unordered_map<version_number, std::unordered_set<hash>>>>::const_iterator gradeables_itr = matches.begin();
for (; gradeables_itr != matches.end(); ++gradeables_itr) {
for (std::unordered_map<user_id, std::unordered_map<version_number, std::unordered_set<hash>>>::const_iterator matches_itr = gradeables_itr->second.begin();
matches_itr != gradeables_itr->second.end(); ++matches_itr) {

for (std::unordered_map<version_number, std::unordered_set<hash>>::const_iterator version_itr = matches_itr->second.begin();
version_itr != matches_itr->second.end(); ++version_itr) {

// Calculate the Percent Match:
// count the number of unique hashes for the percent match calculation
std::vector<std::pair<hash, location_in_submission>> submission_hashes = (*submission_itr)->getHashes();
std::unordered_set<hash> unique_hashes;
for (std::vector<std::pair<hash, location_in_submission>>::const_iterator itr = submission_hashes.begin();
itr != submission_hashes.end(); ++itr) {
unique_hashes.insert(itr->first);
}

// the percent match is currently calculated using the number of hashes that match between this
// submission and the other submission, over the total number of hashes this submission has.
// In other words, the percentage is how much of this submission's code was plgairised from the other.
unsigned int num_hashes_matched = version_itr->second.size();
float percent = (100.0 * num_hashes_matched) / unique_hashes.size();
student_ranking.push_back(StudentRanking(matches_itr->first, version_itr->first, gradeables_itr->first, Score(num_hashes_matched, percent)));
student_ranking.back().score.calculateScore(num_hashes_matched);
}
}
}

// =========================================================================
// Save this submission's highest percent match for later when we generate overall_rankings.txt
float percentMatch = (*submission_itr)->getPercentage();
unsigned int totalMatchingHashes = (*submission_itr)->getMatchCount();
Score submission_score(totalMatchingHashes, percentMatch);
if (max_hashes_matched < totalMatchingHashes) {
max_hashes_matched = totalMatchingHashes;
}

std::pair<version_number, Score> new_pair = {(*submission_itr)->version(), submission_score};
highest_matches[(*submission_itr)->student()].push_back(new_pair);
// =========================================================================

std::sort(student_ranking.begin(), student_ranking.end(), ranking_sorter);

// create the directory and a file to write into
boost::filesystem::path ranking_student_dir = users_root_directory / (*submission_itr)->student() / std::to_string((*submission_itr)->version());
boost::filesystem::path ranking_student_file = ranking_student_dir / "ranking.txt";
boost::filesystem::create_directories(ranking_student_dir);
std::ofstream ranking_student_ostr(ranking_student_file.string());

// finally, write the file of ranking for this submission
for (unsigned int i = 0; i < student_ranking.size(); i++) {
ranking_student_ostr
<< std::setw(15) << std::left << student_ranking[i].student << " "
<< std::setw(3) << std::left << student_ranking[i].version << " "
<< std::setw(1) << std::right << student_ranking[i].source_gradeable << " "
<< std::setw(6) << std::setprecision(2) << std::fixed << student_ranking[i].score.getPercent() << "%" << std::endl;
}

// =========================================================================
// Cleanup

// Done with this submissions. discard the data and clear the memory
// Done with this submission. discard the data and clear the memory
delete (*submission_itr);
(*submission_itr) = nullptr;

// print current progress
// Print current progress
my_counter++;
if (int((my_counter / float(all_submissions.size())) * 100) > my_percent) {
int new_my_percent = int((my_counter / float(all_submissions.size())) * 100);
Expand All @@ -607,7 +516,7 @@ int main(int argc, char* argv[]) {

time(&end);
diff = difftime(end, start);
std::cout << "]" << std::endl << "Finished processing submissions in " << diff << " seconds" << std::endl;
std::cout << "]" << std::endl;

// Print out the list of users who had their matching positions array truncated
if (matching_positions_truncations.size() > 0) {
Expand All @@ -618,40 +527,6 @@ int main(int argc, char* argv[]) {
}
std::cout << std::endl << " - Try increasing the hash size or adding a regex to fix this problem." << std::endl;
}
fflush(stdout);

// ===========================================================================
// Create a general summary of rankings of users by percentage match

// create a single file of students ranked by highest percentage of code plagiarised
boost::filesystem::path ranking_file = lichen_gradeable_path / "overall_ranking.txt";
std::ofstream ranking_ostr(ranking_file.string());

// take the map of highest matches and convert it to a vector so we can sort it
// by percent match and then save it to a file
std::vector<StudentRanking> ranking;
for (std::unordered_map<user_id, std::vector<std::pair<version_number, Score>>>::iterator itr
= highest_matches.begin(); itr != highest_matches.end(); ++itr) {

std::pair<version_number, Score> best_score = itr->second.front();
best_score.second.calculateScore(max_hashes_matched);
for (unsigned int i=0; i < itr->second.size(); i++) {
itr->second[i].second.calculateScore(max_hashes_matched);
if (itr->second[i].second > best_score.second) {
best_score = itr->second[i];
}
}
ranking.push_back(StudentRanking(itr->first, best_score.first, "", best_score.second));
}

std::sort(ranking.begin(), ranking.end(), ranking_sorter);
for (unsigned int i = 0; i < ranking.size(); i++) {
ranking_ostr
<< std::left << std::setw(20) << ranking[i].student << " "
<< std::setw(3) << ranking[i].version << " "
<< std::right << std::setw(4) << std::setprecision(1) << std::fixed << ranking[i].score.getPercent() << "% "
<< std::setw(5) << ranking[i].score.getHashesMatched() << std::endl;
}

// ===========================================================================
// Done!
Expand Down
Loading

0 comments on commit 6f39cd7

Please sign in to comment.