Skip to content

Commit

Permalink
[Refactor:Plagiarism] Add dedicated ranking step (#79)
Browse files Browse the repository at this point in the history
* Only print warning once

* add number of times it was truncated

* Add progress bars for most of pipeline

* Add compare_hashes progress bar

* Finish the Python portion

* Remove unnecessary code from compare_hashes

* lint

* Fix off-by-1

There was a very minor off-by-1 in the original codebase which necessitated the updates to the tests
  • Loading branch information
williamjallen authored Feb 28, 2022
1 parent 47a4e2d commit 6f39cd7
Show file tree
Hide file tree
Showing 13 changed files with 198 additions and 212 deletions.
7 changes: 4 additions & 3 deletions bin/process_all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,10 @@ mkdir -p "${BASEPATH}/users"
############################################################################
# Run Lichen
{ # We still want to unzip files if an error occurs when running Lichen here
./tokenize_all.py "$tmp_location" &&
./hash_all.py "$tmp_location" &&
./compare_hashes.out "$tmp_location" || echo "${KILL_ERROR_MESSAGE}";
./tokenize_all.py "$tmp_location" &&
./hash_all.py "$tmp_location" &&
# Group compare+ranking so the ranking step is skipped on failure; without the
# braces, `|| echo` succeeds and the `&&` after it would run the ranking anyway.
{ ./compare_hashes.out "$tmp_location" && ./similarity_ranking.py "$tmp_location"; } || echo "${KILL_ERROR_MESSAGE}";
}

############################################################################
Expand Down
180 changes: 180 additions & 0 deletions bin/similarity_ranking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
#!/usr/bin/env python3
"""
Ranks the submissions in order of plagiarism likelihood
"""

import argparse
import os
import json
import humanize
import datetime
from pathlib import Path


# Helper class holding per-submission plagiarism statistics; instances are
# sorted later to produce the overall ranking.
class Submission:
    def __init__(self, user_id, version):
        self.user_id, self.version = user_id, version

        # Fraction of this submission's hashes shared with other submissions
        self.percent_match = 0

        # Total count of hashes matched across all other submissions
        self.total_hashes_matched = 0

        # Largest number of hashes shared with any single other submission
        self.highest_match_count = 0

    def __lt__(self, other):
        # Sole sort key for the ranking.  Adjustments to the ranking
        # algorithm should be made here.
        return self.highest_match_count < other.highest_match_count


class Match:
    """One other submission that shares hashes with a given Submission."""

    def __init__(self, user_id, version, source_gradeable):
        self.user_id, self.version = user_id, version
        self.source_gradeable = source_gradeable

        # How many hashes this match shares with the owning Submission
        self.matching_hash_count = 0


def parse_args():
    """Parse command-line arguments: the single positional gradeable basepath."""
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('basepath')
    return arg_parser.parse_args()


# get_submission_stats is passed a user, version, a path to a matches.json, a
# path to a hashes.txt file, and the hash size and returns a pair of a Submission()
# object containing a number of statistics about the specified submission, and a
# list of Match objects which match this submission
def get_submission_stats(user_id, version, matches_file, hashes_file, hash_size):
    submission = Submission(user_id, version)

    # Determine how many hashes there are in this submission (one per line,
    # plus hash_size to account for the tokens covered by the final window)
    with open(hashes_file, 'r') as file:
        token_count = sum(1 for _ in file) + hash_size

    # If this is a blank/empty submission, return now
    if token_count <= 1:
        return submission, []

    # It is possible that there are no matches and thus a matches.json file isn't
    # created. If this is the case, we can simply return now.
    if not os.path.isfile(matches_file):
        return submission, []

    with open(matches_file, 'r') as file:
        matches_json = json.load(file)

    # Calculate the total number of hashes matched, as well as the number of
    # hashes matched for every other submission with matches
    matching_submissions = dict()
    prev_end = 0
    for match in matches_json:
        # Common and provided code doesn't have an others list (due to size constraints)
        if match['type'] != 'match':
            continue

        # Number of not-yet-counted hash positions this region contributes;
        # max(prev_end, ...) avoids double-counting overlapping regions
        new_hashes = match['end'] - max(prev_end, match['start'] - 1)

        for other in match['others']:
            other_submission = f"{other['username']}_{other['version']}_{other['source_gradeable']}"  # noqa: E501
            if other_submission not in matching_submissions:
                matching_submissions[other_submission] = Match(other['username'],
                                                               other['version'],
                                                               other['source_gradeable'])
            matching_submissions[other_submission].matching_hash_count += new_hashes
        submission.total_hashes_matched += new_hashes
        prev_end = match['end']

    # Actually stored as the fraction of the submission which matches
    submission.percent_match = submission.total_hashes_matched / token_count

    # Sort matches by decreasing number of shared hashes (sorting an empty
    # list is a no-op, so no special case is needed)
    matching_submissions = list(matching_submissions.values())
    matching_submissions.sort(key=lambda x: x.matching_hash_count, reverse=True)
    if matching_submissions:
        submission.highest_match_count = matching_submissions[0].matching_hash_count

    return submission, matching_submissions


def main():
    """Rank all submissions by plagiarism likelihood.

    Writes a per-submission ranking.txt next to each matches.json and a
    single overall_ranking.txt under the gradeable basepath, printing a
    rough progress bar along the way.
    """
    start_time = datetime.datetime.now()
    args = parse_args()

    print("SIMILARITY RANKING:", flush=True)
    print("[0% 25% 50% 75% 100%]\n[", end="", flush=True)  # noqa: E501

    with open(Path(args.basepath, "config.json")) as lichen_config_file:
        lichen_config = json.load(lichen_config_file)

    users_dir = Path(args.basepath, 'users')
    if not os.path.isdir(users_dir):
        raise SystemExit('ERROR! Unable to find users directory')

    # We'll make a rough estimate of the percentage of ranking output done by
    # taking the percentage of users which have been done thus far
    total_users = len(os.listdir(users_dir))
    users_ranking_output = 0
    percent_progress = 0

    all_submissions = list()

    for user in sorted(os.listdir(users_dir)):
        user_dir = Path(users_dir, user)
        if not os.path.isdir(user_dir):
            continue

        for version in sorted(os.listdir(user_dir)):
            version_dir = Path(user_dir, version)
            if not os.path.isdir(version_dir):
                continue

            matches_file = Path(version_dir, 'matches.json')
            hashes_file = Path(version_dir, 'hashes.txt')

            submission, matching_submissions = get_submission_stats(user,
                                                                    version,
                                                                    matches_file,
                                                                    hashes_file,
                                                                    lichen_config['hash_size'])
            all_submissions.append(submission)

            # Write the ranking.txt for this submission
            with open(Path(version_dir, 'ranking.txt'), 'w') as ranking_file:
                # matching_submissions is already sorted by the absolute number of hashes matched
                for match in matching_submissions:
                    ranking_file.write(f"{match.user_id:10} {match.version:3} "
                                       f"{match.source_gradeable} {match.matching_hash_count:>8}\n")

        users_ranking_output += 1
        if int((users_ranking_output / total_users) * 100) > percent_progress:
            new_percent_progress = int((users_ranking_output / total_users) * 100)
            print("|" * (new_percent_progress - percent_progress), end="", flush=True)
            percent_progress = new_percent_progress

    # Sorts by Submission.__lt__ (highest_match_count), best first
    all_submissions.sort(reverse=True)

    # A set of all the users we've written lines for thus far (duplicates aren't allowed).
    # BUG FIX: this was set('foo'), which seeds the set with {'f', 'o', 'o'} and
    # would silently drop any user whose id is literally "f" or "o".
    users_written = set()
    with open(Path(args.basepath, 'overall_ranking.txt'), 'w') as ranking_file:
        for s in all_submissions:
            # Only the best version per user is listed
            if s.user_id in users_written:
                continue
            ranking_file.write(f"{s.user_id:10} {s.version:3} "
                               f"{s.percent_match:4.0%} {s.total_hashes_matched:>8}\n")
            users_written.add(s.user_id)

    # ==========================================================================
    print("]\nSimilarity ranking done in", humanize.precisedelta(start_time, format="%1.f"))


# Run the ranking only when executed as a script (not on import)
if __name__ == "__main__":
    main()
133 changes: 4 additions & 129 deletions compare_hashes/compare_hashes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#include "lichen_config.h"
#include "submission.h"
#include "hash_location.h"
#include "score.h"


// =============================================================================
Expand All @@ -29,20 +28,6 @@ typedef std::string user_id;
typedef unsigned int version_number;


// =============================================================================
// helper classes


// represents an element in a ranking of students by percent match
struct StudentRanking {
StudentRanking(const user_id &id, int v, const std::string &sg, const Score &s) : student(id), version(v), source_gradeable(sg), score(s) {}
user_id student;
version_number version;
std::string source_gradeable;
Score score;
};


// =============================================================================
// helper functions

Expand Down Expand Up @@ -89,12 +74,6 @@ void incrementEndPositionsForMatches(nlohmann::json &others) {
}


bool ranking_sorter(const StudentRanking &a, const StudentRanking &b) {
return a.score > b.score ||
(a.score == b.score && a.student < b.student);
}


// =============================================================================
// MAIN

Expand Down Expand Up @@ -157,10 +136,6 @@ int main(int argc, char* argv[]) {
std::unordered_set<hash> provided_code;
// stores all hashes from other gradeables
std::unordered_map<hash, std::unordered_map<user_id, std::vector<HashLocation>>> other_gradeables;
// stores the matches for every student, used later for generating overall_rankings.txt
std::unordered_map<user_id, std::vector<std::pair<version_number, Score>>> highest_matches;
// keeps track of max matching hashes across all submissions, used for calculation of ranking score
unsigned int max_hashes_matched = 0;
// a map of "user_id:version" strings to the non-zero number of times their matching positions array was truncated
std::unordered_map<std::string, int> matching_positions_truncations;

Expand Down Expand Up @@ -323,7 +298,7 @@ int main(int argc, char* argv[]) {

// Note: we DO look for matches across submissions of the same student for self-plagiarism

// save the locations of all other occurences from proir term submissions
// save the locations of all other occurences from prior term submissions
std::vector<HashLocation>::iterator itr = other_occurences_itr->second.begin();
for (; itr != other_occurences_itr->second.end(); ++itr) {
(*submission_itr)->addSuspiciousMatch(hash_itr->second, *itr, hash_itr->first);
Expand Down Expand Up @@ -515,80 +490,14 @@ int main(int argc, char* argv[]) {
assert(ostr.good());
ostr << match_data.dump(4) << std::endl;

// =========================================================================
// create individual ranking file
// the file contains all the other students share matches, sorted by decreasing order of the percent match

// find and sort the other submissions it matches with
std::vector<StudentRanking> student_ranking;
std::unordered_map<std::string, std::unordered_map<user_id, std::unordered_map<version_number, std::unordered_set<hash>>>> matches = (*submission_itr)->getStudentsMatched();

std::unordered_map<std::string, std::unordered_map<user_id, std::unordered_map<version_number, std::unordered_set<hash>>>>::const_iterator gradeables_itr = matches.begin();
for (; gradeables_itr != matches.end(); ++gradeables_itr) {
for (std::unordered_map<user_id, std::unordered_map<version_number, std::unordered_set<hash>>>::const_iterator matches_itr = gradeables_itr->second.begin();
matches_itr != gradeables_itr->second.end(); ++matches_itr) {

for (std::unordered_map<version_number, std::unordered_set<hash>>::const_iterator version_itr = matches_itr->second.begin();
version_itr != matches_itr->second.end(); ++version_itr) {

// Calculate the Percent Match:
// count the number of unique hashes for the percent match calculation
std::vector<std::pair<hash, location_in_submission>> submission_hashes = (*submission_itr)->getHashes();
std::unordered_set<hash> unique_hashes;
for (std::vector<std::pair<hash, location_in_submission>>::const_iterator itr = submission_hashes.begin();
itr != submission_hashes.end(); ++itr) {
unique_hashes.insert(itr->first);
}

// the percent match is currently calculated using the number of hashes that match between this
// submission and the other submission, over the total number of hashes this submission has.
// In other words, the percentage is how much of this submission's code was plgairised from the other.
unsigned int num_hashes_matched = version_itr->second.size();
float percent = (100.0 * num_hashes_matched) / unique_hashes.size();
student_ranking.push_back(StudentRanking(matches_itr->first, version_itr->first, gradeables_itr->first, Score(num_hashes_matched, percent)));
student_ranking.back().score.calculateScore(num_hashes_matched);
}
}
}

// =========================================================================
// Save this submission's highest percent match for later when we generate overall_rankings.txt
float percentMatch = (*submission_itr)->getPercentage();
unsigned int totalMatchingHashes = (*submission_itr)->getMatchCount();
Score submission_score(totalMatchingHashes, percentMatch);
if (max_hashes_matched < totalMatchingHashes) {
max_hashes_matched = totalMatchingHashes;
}

std::pair<version_number, Score> new_pair = {(*submission_itr)->version(), submission_score};
highest_matches[(*submission_itr)->student()].push_back(new_pair);
// =========================================================================

std::sort(student_ranking.begin(), student_ranking.end(), ranking_sorter);

// create the directory and a file to write into
boost::filesystem::path ranking_student_dir = users_root_directory / (*submission_itr)->student() / std::to_string((*submission_itr)->version());
boost::filesystem::path ranking_student_file = ranking_student_dir / "ranking.txt";
boost::filesystem::create_directories(ranking_student_dir);
std::ofstream ranking_student_ostr(ranking_student_file.string());

// finally, write the file of ranking for this submission
for (unsigned int i = 0; i < student_ranking.size(); i++) {
ranking_student_ostr
<< std::setw(15) << std::left << student_ranking[i].student << " "
<< std::setw(3) << std::left << student_ranking[i].version << " "
<< std::setw(1) << std::right << student_ranking[i].source_gradeable << " "
<< std::setw(6) << std::setprecision(2) << std::fixed << student_ranking[i].score.getPercent() << "%" << std::endl;
}

// =========================================================================
// Cleanup

// Done with this submissions. discard the data and clear the memory
// Done with this submission. discard the data and clear the memory
delete (*submission_itr);
(*submission_itr) = nullptr;

// print current progress
// Print current progress
my_counter++;
if (int((my_counter / float(all_submissions.size())) * 100) > my_percent) {
int new_my_percent = int((my_counter / float(all_submissions.size())) * 100);
Expand All @@ -607,7 +516,7 @@ int main(int argc, char* argv[]) {

time(&end);
diff = difftime(end, start);
std::cout << "]" << std::endl << "Finished processing submissions in " << diff << " seconds" << std::endl;
std::cout << "]" << std::endl;

// Print out the list of users who had their matching positions array truncated
if (matching_positions_truncations.size() > 0) {
Expand All @@ -618,40 +527,6 @@ int main(int argc, char* argv[]) {
}
std::cout << std::endl << " - Try increasing the hash size or adding a regex to fix this problem." << std::endl;
}
fflush(stdout);

// ===========================================================================
// Create a general summary of rankings of users by percentage match

// create a single file of students ranked by highest percentage of code plagiarised
boost::filesystem::path ranking_file = lichen_gradeable_path / "overall_ranking.txt";
std::ofstream ranking_ostr(ranking_file.string());

// take the map of highest matches and convert it to a vector so we can sort it
// by percent match and then save it to a file
std::vector<StudentRanking> ranking;
for (std::unordered_map<user_id, std::vector<std::pair<version_number, Score>>>::iterator itr
= highest_matches.begin(); itr != highest_matches.end(); ++itr) {

std::pair<version_number, Score> best_score = itr->second.front();
best_score.second.calculateScore(max_hashes_matched);
for (unsigned int i=0; i < itr->second.size(); i++) {
itr->second[i].second.calculateScore(max_hashes_matched);
if (itr->second[i].second > best_score.second) {
best_score = itr->second[i];
}
}
ranking.push_back(StudentRanking(itr->first, best_score.first, "", best_score.second));
}

std::sort(ranking.begin(), ranking.end(), ranking_sorter);
for (unsigned int i = 0; i < ranking.size(); i++) {
ranking_ostr
<< std::left << std::setw(20) << ranking[i].student << " "
<< std::setw(3) << ranking[i].version << " "
<< std::right << std::setw(4) << std::setprecision(1) << std::fixed << ranking[i].score.getPercent() << "% "
<< std::setw(5) << ranking[i].score.getHashesMatched() << std::endl;
}

// ===========================================================================
// Done!
Expand Down
Loading

0 comments on commit 6f39cd7

Please sign in to comment.