From 6f39cd7601d81927807b84f434f79f9ae78ac3b9 Mon Sep 17 00:00:00 2001 From: William Allen <16820599+williamjallen@users.noreply.github.com> Date: Mon, 28 Feb 2022 15:15:02 -0500 Subject: [PATCH] [Refactor:Plagiarism] Add dedicated ranking step (#79) * Only print warning once * add number of times it was truncated * Add progress bars for most of pipeline * Add compare_hashes progress bar * Finish the Python portion * Remove unnecessary code from compare_hashes * lint * Fix off-by-1 There was a very minor off-by-1 in the original codebase which necessitated the updates to the tests --- bin/process_all.sh | 7 +- bin/similarity_ranking.py | 180 ++++++++++++++++++ compare_hashes/compare_hashes.cpp | 133 +------------ compare_hashes/score.h | 64 ------- compare_hashes/submission.cpp | 4 - compare_hashes/submission.h | 2 - .../expected_output/overall_ranking.txt | 4 +- .../users/aphacker/1/ranking.txt | 2 +- .../users/aphacker/2/ranking.txt | 2 +- .../users/bitdiddle/1/ranking.txt | 4 +- .../expected_output/overall_ranking.txt | 4 +- .../users/aphacker/1/ranking.txt | 2 +- .../users/bitdiddle/1/ranking.txt | 2 +- 13 files changed, 198 insertions(+), 212 deletions(-) create mode 100644 bin/similarity_ranking.py delete mode 100644 compare_hashes/score.h diff --git a/bin/process_all.sh b/bin/process_all.sh index 84a23c4..4bd48de 100644 --- a/bin/process_all.sh +++ b/bin/process_all.sh @@ -86,9 +86,10 @@ mkdir -p "${BASEPATH}/users" ############################################################################ # Run Lichen { # We still want to unzip files if an error occurs when running Lichen here - ./tokenize_all.py "$tmp_location" && - ./hash_all.py "$tmp_location" && - ./compare_hashes.out "$tmp_location" || echo "${KILL_ERROR_MESSAGE}"; + ./tokenize_all.py "$tmp_location" && + ./hash_all.py "$tmp_location" && + ./compare_hashes.out "$tmp_location" || echo "${KILL_ERROR_MESSAGE}" && + ./similarity_ranking.py "$tmp_location"; } 
############################################################################ diff --git a/bin/similarity_ranking.py b/bin/similarity_ranking.py new file mode 100644 index 0000000..c1fe55c --- /dev/null +++ b/bin/similarity_ranking.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +""" +Ranks the submissions in order of plagiarism likelihood +""" + +import argparse +import os +import json +import humanize +import datetime +from pathlib import Path + + +# This is a helper class which is used to store, and ultimately sort, data about submissions +class Submission: + def __init__(self, user_id, version): + self.user_id = user_id + self.version = version + + # The percent of this submission which matches other submissions + self.percent_match = 0 + + # The absolute number of hashes matched + self.total_hashes_matched = 0 + + # The highest number of matches between this user and any other single submission + self.highest_match_count = 0 + + # We use this for sorting submissions later on. Future adjustments to the + # ranking algorithm should modify this function. 
+ def __lt__(self, other): + return self.highest_match_count < other.highest_match_count + + +class Match: + def __init__(self, user_id, version, source_gradeable): + self.user_id = user_id + self.version = version + self.source_gradeable = source_gradeable + + # The number of hashes this match shares with a Submission + self.matching_hash_count = 0 + + +def parse_args(): + parser = argparse.ArgumentParser(description='') + parser.add_argument('basepath') + return parser.parse_args() + + +# get_submission_stats is passed a user, version, a path to a matches.json, a +# path to a hashes.txt file, and the hash size and returns a pair of a Submission() +# object containing a number of statistics about the specified submission, and a +# list of Match objects which match this submission +def get_submission_stats(user_id, version, matches_file, hashes_file, hash_size): + submission = Submission(user_id, version) + + # Determine how many hashes there are in this submission + with open(hashes_file, 'r') as file: + token_count = len([0 for _ in file]) + hash_size + + # If this is a blank/empty submission, return now + if token_count <= 1: + return submission, [] + + # It is possible that there are no matches and thus a matches.json file isn't + # created. If this is the case, we can simply return now.
+ if not os.path.isfile(matches_file): + return submission, [] + + with open(matches_file, 'r') as file: + matches_json = json.load(file) + + # Calculate the total number of hashes matched, as well as the number of + # hashes matched for every other submission with matches + matching_submissions = dict() + prev_end = 0 + for match in matches_json: + # Common and provided code doesn't have an others list (due to size constraints) + if match['type'] != 'match': + continue + + for other in match['others']: + other_submission = f"{other['username']}_{other['version']}_{other['source_gradeable']}" # noqa: E501 + if other_submission not in matching_submissions.keys(): + matching_submissions[other_submission] = Match(other['username'], + other['version'], + other['source_gradeable']) + matching_submissions[other_submission].matching_hash_count += \ + match['end'] - max(prev_end, match['start'] - 1) + submission.total_hashes_matched += match['end'] - max(prev_end, match['start'] - 1) + prev_end = match['end'] + + # Actually stored as the fraction of the submission which matches + submission.percent_match = submission.total_hashes_matched / token_count + + if len(matching_submissions.values()) > 0: + matching_submissions = list(matching_submissions.values()) + + matching_submissions.sort(key=lambda x: x.matching_hash_count, reverse=True) + submission.highest_match_count = matching_submissions[0].matching_hash_count + else: + matching_submissions = [] + + return submission, matching_submissions + + +def main(): + start_time = datetime.datetime.now() + args = parse_args() + + print("SIMILARITY RANKING:", flush=True) + print("[0% 25% 50% 75% 100%]\n[", end="", flush=True) # noqa: E501 + + with open(Path(args.basepath, "config.json")) as lichen_config_file: + lichen_config = json.load(lichen_config_file) + + users_dir = Path(args.basepath, 'users') + if not os.path.isdir(users_dir): + raise SystemExit('ERROR!
 Unable to find users directory') + + # We'll make a rough estimate of the percentage of ranking output done by + # taking the percentage of users which have been done thus far + total_users = len(os.listdir(users_dir)) + users_ranking_output = 0 + percent_progress = 0 + + all_submissions = list() + + for user in sorted(os.listdir(users_dir)): + user_dir = Path(users_dir, user) + if not os.path.isdir(user_dir): + continue + + for version in sorted(os.listdir(user_dir)): + version_dir = Path(user_dir, version) + if not os.path.isdir(version_dir): + continue + + matches_file = Path(version_dir, 'matches.json') + hashes_file = Path(version_dir, 'hashes.txt') + + submission, matching_submissions = get_submission_stats(user, + version, + matches_file, + hashes_file, + lichen_config['hash_size']) + all_submissions.append(submission) + + # Write the ranking.txt for this submission + with open(Path(version_dir, 'ranking.txt'), 'w') as ranking_file: + # matching_submissions is already sorted by the absolute number of hashes matched + for match in matching_submissions: + ranking_file.write(f"{match.user_id:10} {match.version:3} " + f"{match.source_gradeable} {match.matching_hash_count:>8}\n") + + users_ranking_output += 1 + if int((users_ranking_output / total_users) * 100) > percent_progress: + new_percent_progress = int((users_ranking_output / total_users) * 100) + print("|" * (new_percent_progress - percent_progress), end="", flush=True) + percent_progress = new_percent_progress + + all_submissions.sort(reverse=True) + + # A set of all the users we've written lines for thus far (duplicates aren't allowed) + users_written = set() + with open(Path(args.basepath, 'overall_ranking.txt'), 'w') as ranking_file: + for s in all_submissions: + if s.user_id in users_written: + continue + ranking_file.write(f"{s.user_id:10} {s.version:3} " + f"{s.percent_match:4.0%} {s.total_hashes_matched:>8}\n") + users_written.add(s.user_id) + + # 
========================================================================== + print("]\nSimilarity ranking done in", humanize.precisedelta(start_time, format="%1.f")) + + +if __name__ == "__main__": + main() diff --git a/compare_hashes/compare_hashes.cpp b/compare_hashes/compare_hashes.cpp index c4a2726..bb0f3a5 100644 --- a/compare_hashes/compare_hashes.cpp +++ b/compare_hashes/compare_hashes.cpp @@ -17,7 +17,6 @@ #include "lichen_config.h" #include "submission.h" #include "hash_location.h" -#include "score.h" // ============================================================================= @@ -29,20 +28,6 @@ typedef std::string user_id; typedef unsigned int version_number; -// ============================================================================= -// helper classes - - -// represents an element in a ranking of students by percent match -struct StudentRanking { - StudentRanking(const user_id &id, int v, const std::string &sg, const Score &s) : student(id), version(v), source_gradeable(sg), score(s) {} - user_id student; - version_number version; - std::string source_gradeable; - Score score; -}; - - // ============================================================================= // helper functions @@ -89,12 +74,6 @@ void incrementEndPositionsForMatches(nlohmann::json &others) { } -bool ranking_sorter(const StudentRanking &a, const StudentRanking &b) { - return a.score > b.score || - (a.score == b.score && a.student < b.student); -} - - // ============================================================================= // MAIN @@ -157,10 +136,6 @@ int main(int argc, char* argv[]) { std::unordered_set provided_code; // stores all hashes from other gradeables std::unordered_map>> other_gradeables; - // stores the matches for every student, used later for generating overall_rankings.txt - std::unordered_map>> highest_matches; - // keeps track of max matching hashes across all submissions, used for calculation of ranking score - unsigned int max_hashes_matched = 0; 
// a map of "user_id:version" strings to the non-zero number of times their matching positions array was truncated std::unordered_map matching_positions_truncations; @@ -323,7 +298,7 @@ int main(int argc, char* argv[]) { // Note: we DO look for matches across submissions of the same student for self-plagiarism - // save the locations of all other occurences from proir term submissions + // save the locations of all other occurences from prior term submissions std::vector::iterator itr = other_occurences_itr->second.begin(); for (; itr != other_occurences_itr->second.end(); ++itr) { (*submission_itr)->addSuspiciousMatch(hash_itr->second, *itr, hash_itr->first); @@ -515,80 +490,14 @@ int main(int argc, char* argv[]) { assert(ostr.good()); ostr << match_data.dump(4) << std::endl; - // ========================================================================= - // create individual ranking file - // the file contains all the other students share matches, sorted by decreasing order of the percent match - - // find and sort the other submissions it matches with - std::vector student_ranking; - std::unordered_map>>> matches = (*submission_itr)->getStudentsMatched(); - - std::unordered_map>>>::const_iterator gradeables_itr = matches.begin(); - for (; gradeables_itr != matches.end(); ++gradeables_itr) { - for (std::unordered_map>>::const_iterator matches_itr = gradeables_itr->second.begin(); - matches_itr != gradeables_itr->second.end(); ++matches_itr) { - - for (std::unordered_map>::const_iterator version_itr = matches_itr->second.begin(); - version_itr != matches_itr->second.end(); ++version_itr) { - - // Calculate the Percent Match: - // count the number of unique hashes for the percent match calculation - std::vector> submission_hashes = (*submission_itr)->getHashes(); - std::unordered_set unique_hashes; - for (std::vector>::const_iterator itr = submission_hashes.begin(); - itr != submission_hashes.end(); ++itr) { - unique_hashes.insert(itr->first); - } - - // the 
percent match is currently calculated using the number of hashes that match between this - // submission and the other submission, over the total number of hashes this submission has. - // In other words, the percentage is how much of this submission's code was plgairised from the other. - unsigned int num_hashes_matched = version_itr->second.size(); - float percent = (100.0 * num_hashes_matched) / unique_hashes.size(); - student_ranking.push_back(StudentRanking(matches_itr->first, version_itr->first, gradeables_itr->first, Score(num_hashes_matched, percent))); - student_ranking.back().score.calculateScore(num_hashes_matched); - } - } - } - - // ========================================================================= - // Save this submission's highest percent match for later when we generate overall_rankings.txt - float percentMatch = (*submission_itr)->getPercentage(); - unsigned int totalMatchingHashes = (*submission_itr)->getMatchCount(); - Score submission_score(totalMatchingHashes, percentMatch); - if (max_hashes_matched < totalMatchingHashes) { - max_hashes_matched = totalMatchingHashes; - } - - std::pair new_pair = {(*submission_itr)->version(), submission_score}; - highest_matches[(*submission_itr)->student()].push_back(new_pair); - // ========================================================================= - - std::sort(student_ranking.begin(), student_ranking.end(), ranking_sorter); - - // create the directory and a file to write into - boost::filesystem::path ranking_student_dir = users_root_directory / (*submission_itr)->student() / std::to_string((*submission_itr)->version()); - boost::filesystem::path ranking_student_file = ranking_student_dir / "ranking.txt"; - boost::filesystem::create_directories(ranking_student_dir); - std::ofstream ranking_student_ostr(ranking_student_file.string()); - - // finally, write the file of ranking for this submission - for (unsigned int i = 0; i < student_ranking.size(); i++) { - ranking_student_ostr - << 
std::setw(15) << std::left << student_ranking[i].student << " " - << std::setw(3) << std::left << student_ranking[i].version << " " - << std::setw(1) << std::right << student_ranking[i].source_gradeable << " " - << std::setw(6) << std::setprecision(2) << std::fixed << student_ranking[i].score.getPercent() << "%" << std::endl; - } - // ========================================================================= // Cleanup - // Done with this submissions. discard the data and clear the memory + // Done with this submission. discard the data and clear the memory delete (*submission_itr); (*submission_itr) = nullptr; - // print current progress + // Print current progress my_counter++; if (int((my_counter / float(all_submissions.size())) * 100) > my_percent) { int new_my_percent = int((my_counter / float(all_submissions.size())) * 100); @@ -607,7 +516,7 @@ int main(int argc, char* argv[]) { time(&end); diff = difftime(end, start); - std::cout << "]" << std::endl << "Finished processing submissions in " << diff << " seconds" << std::endl; + std::cout << "]" << std::endl; // Print out the list of users who had their matching positions array truncated if (matching_positions_truncations.size() > 0) { @@ -618,40 +527,6 @@ int main(int argc, char* argv[]) { } std::cout << std::endl << " - Try increasing the hash size or adding a regex to fix this problem." 
<< std::endl; } - fflush(stdout); - - // =========================================================================== - // Create a general summary of rankings of users by percentage match - - // create a single file of students ranked by highest percentage of code plagiarised - boost::filesystem::path ranking_file = lichen_gradeable_path / "overall_ranking.txt"; - std::ofstream ranking_ostr(ranking_file.string()); - - // take the map of highest matches and convert it to a vector so we can sort it - // by percent match and then save it to a file - std::vector ranking; - for (std::unordered_map>>::iterator itr - = highest_matches.begin(); itr != highest_matches.end(); ++itr) { - - std::pair best_score = itr->second.front(); - best_score.second.calculateScore(max_hashes_matched); - for (unsigned int i=0; i < itr->second.size(); i++) { - itr->second[i].second.calculateScore(max_hashes_matched); - if (itr->second[i].second > best_score.second) { - best_score = itr->second[i]; - } - } - ranking.push_back(StudentRanking(itr->first, best_score.first, "", best_score.second)); - } - - std::sort(ranking.begin(), ranking.end(), ranking_sorter); - for (unsigned int i = 0; i < ranking.size(); i++) { - ranking_ostr - << std::left << std::setw(20) << ranking[i].student << " " - << std::setw(3) << ranking[i].version << " " - << std::right << std::setw(4) << std::setprecision(1) << std::fixed << ranking[i].score.getPercent() << "% " - << std::setw(5) << ranking[i].score.getHashesMatched() << std::endl; - } // =========================================================================== // Done! 
diff --git a/compare_hashes/score.h b/compare_hashes/score.h deleted file mode 100644 index af085ed..0000000 --- a/compare_hashes/score.h +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef SCORE_H -#define SCORE_H - -#include -#include - -typedef int location_in_submission; -typedef unsigned int hash; -typedef std::string user_id; -typedef unsigned int version_number; - -// represents the plagiarism score for a given submissions, used for the overall rankings file -class Score { -public: - // CONSTRUCTOR - Score(unsigned int hashes_matched, float percent): hashes_matched(hashes_matched), percent(percent), score(-1) {} - Score(const Score &other) { copy(other); } - - // GETTERS - float getPercent() const { return percent; } - unsigned int getHashesMatched() const { return hashes_matched; } - - // MODIFIER - // Each submission in the ranking file gets a composite score that weighs both its percentage - // of suspicious matches, and its percentile of total number of hashes matched - void calculateScore(unsigned int max_hashes_matched) { - score = PERCENT_WEIGHT*(percent/100.0) + MATCH_WEIGHT*(static_cast(hashes_matched)/max_hashes_matched); - } - - // OPERATORS - bool operator>(const Score &other_s) const { - constexpr float EPSILON = 0.0001; - return std::abs(getScore() - other_s.getScore()) > EPSILON && getScore() > other_s.getScore(); - } - bool operator==(const Score &other_s) const { - return getScore() == other_s.getScore(); - } - Score& operator=(const Score& other) { - if (this != &other) { - copy(other); - } - return *this; - } - - -private: - static constexpr float PERCENT_WEIGHT = 0.5; - static constexpr float MATCH_WEIGHT = 0.5; - // just a sanity check to make sure these values are appropriately updated in the future - static_assert(PERCENT_WEIGHT + MATCH_WEIGHT == 1, "Weights must add to 1"); - - unsigned int hashes_matched; - float percent; - float score; - - void copy(const Score &other) { - hashes_matched = other.hashes_matched; - percent = other.percent; - 
score = other.score; - } - float getScore() const { assert(score >= 0 && score <= 1); return score; } -}; - -#endif diff --git a/compare_hashes/submission.cpp b/compare_hashes/submission.cpp index b8dd4cc..17dd8f3 100644 --- a/compare_hashes/submission.cpp +++ b/compare_hashes/submission.cpp @@ -10,10 +10,6 @@ typedef unsigned int hash; typedef std::string user_id; typedef unsigned int version_number; -float Submission::getPercentage() const { - return (100.0 * (suspicious_matches.size())) / hashes.size(); -} - void Submission::addSuspiciousMatch(location_in_submission location, const HashLocation &matching_location, const hash &matched_hash) { // figure out if there is an overlap between this hash and a common/provided match int hash_size = config_.hash_size; diff --git a/compare_hashes/submission.h b/compare_hashes/submission.h index a8d0a20..b438770 100644 --- a/compare_hashes/submission.h +++ b/compare_hashes/submission.h @@ -32,8 +32,6 @@ class Submission { const std::set& getProvidedMatches() const { return provided_matches; } const std::unordered_map>>>& getStudentsMatched() const { return students_matched; } const std::vector> & getHashes() const { return hashes; } - unsigned int getMatchCount() const { return suspicious_matches.size(); } - float getPercentage() const; // MODIFIERS void addHash(const hash &h, location_in_submission l) { hashes.push_back(std::make_pair(h, l)); } diff --git a/tests/data/test_lichen/multiple_versions/expected_output/overall_ranking.txt b/tests/data/test_lichen/multiple_versions/expected_output/overall_ranking.txt index 76bf21e..a0ca84a 100644 --- a/tests/data/test_lichen/multiple_versions/expected_output/overall_ranking.txt +++ b/tests/data/test_lichen/multiple_versions/expected_output/overall_ranking.txt @@ -1,2 +1,2 @@ -aphacker 2 81.4% 35 -bitdiddle 1 81.4% 35 +aphacker 2 94% 44 +bitdiddle 1 94% 44 diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/ranking.txt 
b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/ranking.txt index f2807a3..ad11c6d 100644 --- a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/ranking.txt +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/1/ranking.txt @@ -1 +1 @@ -bitdiddle 1 f21__plagiarism__multiple_versions 32.79% +bitdiddle 1 f21__plagiarism__multiple_versions 32 diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/ranking.txt b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/ranking.txt index ceaae6b..41f5b25 100644 --- a/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/ranking.txt +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/aphacker/2/ranking.txt @@ -1 +1 @@ -bitdiddle 1 f21__plagiarism__multiple_versions 80.95% +bitdiddle 1 f21__plagiarism__multiple_versions 44 diff --git a/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/ranking.txt b/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/ranking.txt index 8f2a405..0625c58 100644 --- a/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/ranking.txt +++ b/tests/data/test_lichen/multiple_versions/expected_output/users/bitdiddle/1/ranking.txt @@ -1,2 +1,2 @@ -aphacker 2 f21__plagiarism__multiple_versions 80.95% -aphacker 1 f21__plagiarism__multiple_versions 47.62% +aphacker 2 f21__plagiarism__multiple_versions 44 +aphacker 1 f21__plagiarism__multiple_versions 30 diff --git a/tests/data/test_lichen/repeated_sequences/expected_output/overall_ranking.txt b/tests/data/test_lichen/repeated_sequences/expected_output/overall_ranking.txt index 6b140dc..7b3598f 100644 --- a/tests/data/test_lichen/repeated_sequences/expected_output/overall_ranking.txt +++ b/tests/data/test_lichen/repeated_sequences/expected_output/overall_ranking.txt @@ -1,2 +1,2 @@ -bitdiddle 1 25.7% 27 -aphacker 1 19.5% 15 
+bitdiddle 1 42% 46 +aphacker 1 33% 27 diff --git a/tests/data/test_lichen/repeated_sequences/expected_output/users/aphacker/1/ranking.txt b/tests/data/test_lichen/repeated_sequences/expected_output/users/aphacker/1/ranking.txt index af37be3..0ecab55 100644 --- a/tests/data/test_lichen/repeated_sequences/expected_output/users/aphacker/1/ranking.txt +++ b/tests/data/test_lichen/repeated_sequences/expected_output/users/aphacker/1/ranking.txt @@ -1 +1 @@ -bitdiddle 1 f21__plagiarism__repeated_sequences 18.42% +bitdiddle 1 f21__plagiarism__repeated_sequences 27 diff --git a/tests/data/test_lichen/repeated_sequences/expected_output/users/bitdiddle/1/ranking.txt b/tests/data/test_lichen/repeated_sequences/expected_output/users/bitdiddle/1/ranking.txt index 7ecab70..6312744 100644 --- a/tests/data/test_lichen/repeated_sequences/expected_output/users/bitdiddle/1/ranking.txt +++ b/tests/data/test_lichen/repeated_sequences/expected_output/users/bitdiddle/1/ranking.txt @@ -1 +1 @@ -aphacker 1 f21__plagiarism__repeated_sequences 15.22% +aphacker 1 f21__plagiarism__repeated_sequences 46