Skip to content

Commit

Permalink
Modify plagiarism workflow to a new simpler one. Remove percentage ex…
Browse files Browse the repository at this point in the history
…traction script entirely (hackclub#2379)
  • Loading branch information
grymmy authored Sep 24, 2024
1 parent 0e52d9b commit 0868fd5
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 352 deletions.
64 changes: 0 additions & 64 deletions .github/scripts/extract_percentages.py

This file was deleted.

243 changes: 156 additions & 87 deletions .github/scripts/plagiarism_check.py
Original file line number Diff line number Diff line change
@@ -1,87 +1,156 @@
import sys
import subprocess
import os
import glob
import shutil
import time

def log(message):
    """Print ``message`` to stdout, prefixed with the current local time.

    The prefix has the form ``[YYYY-MM-DD HH:MM:SS]``.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    print("[{}] {}".format(stamp, message))

def run_compare50(single_file, directory, output_dir, saved_dir_base):
    """Compare ``single_file`` against every ``*.js`` file in ``directory``.

    For each candidate file the ``compare50`` command-line tool is invoked
    with the "text" pass. If it leaves a ``match_1.html`` in ``output_dir``,
    that report is moved into ``saved_dir_base`` and named after the
    candidate file.

    Args:
        single_file: Path of the submission being checked.
        directory: Directory whose ``*.js`` files are the comparison targets.
        output_dir: Scratch directory handed to compare50; it is deleted
            before each comparison.
        saved_dir_base: Directory that collects the match reports; created
            if it does not exist.

    Errors are logged rather than raised. The first failure stops the
    remaining comparisons.
    """
    try:
        if not os.path.exists(saved_dir_base):
            os.makedirs(saved_dir_base)
            log("Created base directory for saved files.")

        all_js_files = glob.glob(os.path.join(directory, "*.js"))
        total_files = len(all_js_files)
        submission_path = os.path.abspath(single_file)

        for current_file_number, file in enumerate(all_js_files, start=1):
            if os.path.abspath(file) == submission_path:
                log(f"Skipping comparison for the same file: {file}")
                continue

            log(f"Processing file {current_file_number} of {total_files}: {file}")
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)
                log(f"Cleaned existing output directory: {output_dir}")

            # Pass the arguments as a list and avoid the shell entirely:
            # file names come from a pull request, so interpolating them
            # into a shell string would allow command injection.
            command = [
                "compare50",
                single_file,
                file,
                "--output", output_dir,
                "--max-file-size", str(1024 * 1024 * 100),
                "--passes", "text",
            ]
            log(f"Running command: {' '.join(command)}")
            subprocess.run(command, check=True)
            log("Compare50 command executed successfully.")

            match_file = os.path.join(output_dir, "match_1.html")

            if os.path.exists(match_file):
                # Swap only the trailing extension; str.replace would also
                # rewrite any ".js" that appears earlier in the name.
                stem, _ = os.path.splitext(os.path.basename(file))
                saved_file_path = os.path.join(saved_dir_base, stem + ".html")
                log(f"Match found. Moving {match_file} to {saved_file_path}")
                shutil.move(match_file, saved_file_path)
            else:
                log(f"No match found for file: {file}")

    except subprocess.CalledProcessError as e:
        log(f"Error in running Compare50: {e}")
    except Exception as e:
        log(f"An error occurred: {e}")

def main():
    """Command-line entry point.

    Expects exactly four positional arguments (single file, directory,
    output directory, saved directory base), logs them, lists the
    JavaScript files in the directory, and runs the comparison.
    Exits with status 1 when the argument count is wrong.
    """
    if len(sys.argv) != 5:
        log("Incorrect number of arguments provided.")
        print("Usage: python plagiarism_check.py <single_file> <directory> <output_dir> <saved_dir_base>")
        sys.exit(1)

    single_file, directory, output_dir, saved_dir_base = sys.argv[1:5]

    log("Starting plagiarism check with the following arguments:")
    labelled_arguments = (
        ("Single file", single_file),
        ("Directory", directory),
        ("Output directory", output_dir),
        ("Saved directory base", saved_dir_base),
    )
    for label, value in labelled_arguments:
        log(f"{label}: {value}")

    log(f"Listing all JavaScript files in directory '{directory}':")
    js_pattern = os.path.join(directory, "*.js")
    for js_path in glob.glob(js_pattern):
        log(js_path)

    run_compare50(single_file, directory, output_dir, saved_dir_base)
    log("Plagiarism check completed.")

# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
#!/opt/homebrew/bin/python3
from os import walk
import jsbeautifier
from multiprocessing import Pool
import nltk
from functools import cmp_to_key
import argparse
import re
import random


def load_files_from_dir(dir, suffix):
    """Recursively collect the paths under ``dir`` that end with ``suffix``.

    Each path is built as ``<dirpath>/<filename>`` from the directory walk,
    and the suffix test is applied to that full path.

    Returns:
        A list of matching path strings in walk order.
    """
    candidates = (
        folder + "/" + name
        for folder, _subdirs, names in walk(dir)
        for name in names
    )
    return [candidate for candidate in candidates if candidate.endswith(suffix)]


def load_data(path):
    """Read the file at ``path`` and return its preprocessed contents.

    The file is opened read-only (the previous ``"r+"`` mode required write
    permission for no reason) inside a ``with`` block so the handle is
    always closed. It is decoded as UTF-8 regardless of the platform default.

    Args:
        path: Path of the source file to load.

    Returns:
        The result of ``preprocess`` applied to the file's text; an empty
        file is passed through as the empty string.
    """
    with open(path, "r", encoding="utf-8") as source_file:
        contents = source_file.read()
    return preprocess(contents)


def format_code(code):
    """Return ``code`` beautified by jsbeautifier.

    Uses jsbeautifier's default options with ``wrap_line_length`` set to 80.
    """
    beautifier_options = jsbeautifier.default_options()
    beautifier_options.wrap_line_length = 80
    return jsbeautifier.beautify(code, beautifier_options)


def remove_whitespace(code):
    """Drop lines that consist solely of carriage-return/newline characters.

    Lines keep their original line endings. A line that contains spaces or
    tabs is not considered empty and is kept.
    """
    kept_lines = []
    for line in code.splitlines(True):
        if line.strip("\r\n"):
            kept_lines.append(line)
    return "".join(kept_lines)


def filter_code(code):
    """Strip template-literal assets and a leading block comment from ``code``.

    Removes, in order:
      * every ``bitmap`...```, ``tune`...``` and ``map`...``` tagged
        template literal (non-greedy, may span lines);
      * one ``/* ... */`` comment, only if it starts at the very beginning
        of the text.

    Returns:
        The filtered source text.
    """
    code = re.sub(r'bitmap`.*?`', '', code, flags=re.MULTILINE | re.DOTALL)
    code = re.sub(r'tune`.*?`', '', code, flags=re.MULTILINE | re.DOTALL)
    code = re.sub(r'map`.*?`', '', code, flags=re.MULTILINE | re.DOTALL)
    # With DOTALL, "." already matches newlines. The former "(.|[\r\n])*?"
    # gave the engine two ways to consume every newline, which backtracks
    # exponentially when the opening "/*" is never closed.
    code = re.sub(r'^/\*.*?\*/', '', code, flags=re.DOTALL)

    return code


def preprocess(code):
    """Normalise source text before comparison.

    Applies, in order: ``filter_code``, ``remove_whitespace``, ``format_code``.
    """
    filtered = filter_code(code)
    without_blank_lines = remove_whitespace(filtered)
    return format_code(without_blank_lines)


def text_length_score(text1, text2):
    """Score how close two texts are in length, from 0.0 to 1.0.

    Computed as ``1 - |len1 - len2| / (len1 + len2)``: equal lengths score
    1.0, and one empty text against a non-empty one scores 0.0.

    Two empty texts have equal length and score 1.0; this case previously
    raised ``ZeroDivisionError``.
    """
    combined_length = len(text1) + len(text2)
    if combined_length == 0:
        return 1.0
    return 1 - abs(len(text1) - len(text2)) / combined_length


def tokenize(text, num_tokens):
    """Return the set of ``num_tokens``-grams of ``text``.

    The text is lower-cased and split on single space characters only;
    the n-grams are produced by ``nltk.ngrams``.
    """
    words = text.lower().split(" ")
    grams = nltk.ngrams(words, num_tokens)
    return set(grams)


def symmetrical_diff_score(text1, text2, num_tokens):
    """Score the n-gram overlap of two texts, from 0.0 to 1.0.

    Both texts are tokenized into sets of ``num_tokens``-grams. The score
    is ``1 - |A symmetric-difference B| / (|A| + |B|)``: identical sets
    score 1.0 and disjoint sets score 0.0.

    When neither text yields any n-gram the sets are trivially equal and
    the score is 1.0; this case previously raised ``ZeroDivisionError``.
    """
    set1 = tokenize(text1, num_tokens)
    set2 = tokenize(text2, num_tokens)
    combined_size = len(set1) + len(set2)
    if combined_size == 0:
        return 1.0
    symmetrical_difference = set1.symmetric_difference(set2)
    return 1.0 - (len(symmetrical_difference) / combined_size)


def compare(document, other_document):
    """Return the weighted similarity score of two documents.

    Weights: length score 0.1, 1-gram overlap 0.2, 2-gram overlap 0.3,
    3-gram overlap 0.4.
    """
    # Terms are computed and added in the same left-to-right order as a
    # single chained expression, so the floating-point result is unchanged.
    length_term = text_length_score(document, other_document) * 0.1
    unigram_term = symmetrical_diff_score(document, other_document, 1) * 0.2
    bigram_term = symmetrical_diff_score(document, other_document, 2) * 0.3
    trigram_term = symmetrical_diff_score(document, other_document, 3) * 0.4
    return length_term + unigram_term + bigram_term + trigram_term


class DocumentComparison:
    """Pairs a document path with the similarity score computed for it."""

    def __init__(self, path, score):
        # path: location of the compared document; score: its similarity score.
        self.path, self.score = path, score


def create_doc_comparison(item):
    """Build a ``DocumentComparison`` from one work item.

    ``item`` is indexed as ``(input document, other document, other path)``.
    """
    input_text, other_text, other_path = item[0], item[1], item[2]
    similarity = compare(input_text, other_text)
    return DocumentComparison(other_path, similarity)


def find_matching_docs(input_doc_path, all_games, threshold, log):
    """Compare one document against the gallery and count close matches.

    Args:
        input_doc_path: Path of the document being checked.
        all_games: Paths of all gallery documents; entries equal to
            ``input_doc_path`` are skipped so the document is not compared
            with itself.
        threshold: Scores strictly greater than this count as matches.
        log: When truthy, progress and the top five matches are printed.

    Returns:
        The number of gallery documents whose score exceeds ``threshold``.
    """
    # Filter instead of list.remove(): remove() raises ValueError when the
    # input path is not in the list.
    all_game_paths = [path for path in all_games if path != input_doc_path]
    input_document = load_data(input_doc_path)

    if log:
        print("Comparing submission against %d gallery entries..." % len(all_game_paths))

    with Pool() as P:
        # Load and preprocess every gallery document in parallel.
        gallery_documents = P.map(load_data, all_game_paths)

        # Each work item carries its own reference to the input document so
        # that the comparisons can be mapped across worker processes.
        all_data = zip([input_document] * len(all_game_paths), gallery_documents, all_game_paths)
        comparisons = P.map(create_doc_comparison, all_data)

    # Rank documents, highest score first (stable for equal scores).
    results = sorted(comparisons, key=lambda item: item.score, reverse=True)

    if log:
        print("Done!")
    documents_exceeding_threshold = [item for item in results if item.score > threshold]
    top_matches = documents_exceeding_threshold[:5]
    if log:
        if documents_exceeding_threshold:
            print("%d gallery entries match the submission too closely." % len(documents_exceeding_threshold))
        else:
            print("No similar documents found.")
        if top_matches:
            print("Here are the top %d matches" % len(top_matches))
        for document in top_matches:
            print("%f - %s" % (document.score, document.path))

    return len(documents_exceeding_threshold)


def check_all_games():
    """Sample the gallery and report how many games have no close match.

    Loads every ``.js`` file under the module-level ``args.doc_dir`` into
    the module-level ``all_gallery_items``, shuffles the list, and checks up
    to 800 entries against the whole gallery with a threshold of 0.5.
    Prints the count and percentage of entries with no match, then exits
    the process with status 1.

    NOTE(review): this reads the global ``args``, so it can only run after
    the command line has been parsed - confirm before re-enabling the call.
    """
    global all_gallery_items
    num_samples = 800
    all_gallery_items = load_files_from_dir(args.doc_dir, ".js")
    random.shuffle(all_gallery_items)
    sampled_items = all_gallery_items[:num_samples]
    all_good_games = []
    for gallery_item in sampled_items:
        print("Processing %s" % gallery_item)
        if find_matching_docs(gallery_item, all_gallery_items, 0.5, False) == 0:
            all_good_games.append(gallery_item)
    print("Number of good games: %d" % len(all_good_games))
    # Divide by the number of games actually checked: the gallery may hold
    # fewer than num_samples files (or none), which skewed the percentage.
    checked = len(sampled_items)
    percentage = int(len(all_good_games) / checked * 100.0) if checked else 0
    print("Percentage of good games: %d%%" % percentage)
    exit(1)


if __name__ == '__main__':
    # run_tests()
    # check_all_games()
    from os.path import normpath

    parser = argparse.ArgumentParser(
        description='Compare an input javascript file w/ the contents of a directory, and returns similarity scores')
    parser.add_argument('doc_dir', type=str, help='a path to a directory of documents')
    parser.add_argument('threshold', type=float,
                        help='similarity threshold (above which duplicate warnings will be returned)')
    parser.add_argument('input_doc', type=str, help='a path to an input document')
    args = parser.parse_args()

    # The gallery list was previously only assigned inside check_all_games(),
    # so the membership test below raised NameError. Paths are normalised so
    # that spellings such as "games/x.js" and "./games/x.js" compare equal.
    all_gallery_items = [normpath(path) for path in load_files_from_dir(args.doc_dir, ".js")]
    input_doc = normpath(args.input_doc)

    if input_doc not in all_gallery_items:
        print("Sorry - the input document must be a reference into the gallery directory.")
        raise SystemExit(1)
    num_bad_docs = find_matching_docs(input_doc, all_gallery_items, args.threshold, True)
    # Exit status 0 means no gallery entry exceeded the threshold.
    raise SystemExit(0 if num_bad_docs == 0 else 1)
39 changes: 3 additions & 36 deletions .github/workflows/check_plagiarism.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ jobs:
with:
python-version: '3.10'

- name: Install Compare50 && beautifulsoup4
run: pip install compare50 beautifulsoup4
- name: Install python requirements
run: pip install jsbeautifier nltk argparse

- name: Get list of changed files
id: changed-files
Expand All @@ -34,37 +34,4 @@ jobs:
- name: Run Plagiarism Detection Script
if: env.FILES != ''
run: python .github/scripts/plagiarism_check.py "${{ env.FILES }}" games output_dir saved_dir

- name: Extract and Display Similarity Percentages
run: python .github/scripts/extract_percentages.py saved_dir/
id: extract-percentages

- name: Upload Compare50 Results as Artifacts
if: always()
uses: actions/upload-artifact@v4
with:
name: compare50-results
path: saved_dir/

- name: Save PR number to file
if: always()
run: echo ${{ github.event.pull_request.number }} > pr_number.txt

- name: Upload PR Number as Artifact
if: always()
uses: actions/upload-artifact@v4
with:
name: pr-number
path: pr_number.txt

- name: Upload Plagiarism Report as Artifact
if: always()
uses: actions/upload-artifact@v4
with:
name: plagiarism-report
path: plagiarism-report.md

- name: Check for High Plagiarism Percentages
if: always() && steps.extract-percentages.outcome == 'failure'
run: echo "Plagiarism percentage over threshold detected."
# Arguments follow the script's argparse order: doc_dir, threshold, input_doc.
run: python .github/scripts/plagiarism_check.py ./games 0.5 "${{ env.FILES }}"
Loading

0 comments on commit 0868fd5

Please sign in to comment.