diff --git a/.github/scripts/extract_percentages.py b/.github/scripts/extract_percentages.py deleted file mode 100644 index 1087d2d2af..0000000000 --- a/.github/scripts/extract_percentages.py +++ /dev/null @@ -1,64 +0,0 @@ -from bs4 import BeautifulSoup -import os -import sys -import time - -def log(message): - timestamp = time.strftime("%Y-%m-%d %H:%M:%S") - print(f"[{timestamp}] {message}") - -def extract_similarity_percentage(html_file): - try: - with open(html_file, 'r', encoding='utf-8') as file: - soup = BeautifulSoup(file, 'html.parser') - file_name_tag = soup.select_one("#textright > div > h4") - if file_name_tag: - percentage_text = file_name_tag.find("span", class_="text-secondary small").text.strip("()%") - return int(percentage_text) - else: - return None - except Exception as e: - log(f"Error processing file {html_file}: {e}") - return None - -def process_html_files(directory, threshold=50): - log("Processing HTML files for plagiarism results...") - high_plagiarism_detected = False - high_plagiarism_files = [] - for filename in os.listdir(directory): - if filename.endswith(".html"): - file_path = os.path.join(directory, filename) - percentage = extract_similarity_percentage(file_path) - if percentage is not None and percentage >= threshold: - log(f"High plagiarism detected - {filename.replace('.html', '.js')}: {percentage}%") - high_plagiarism_files.append(filename.replace('.html', '.js') + ": " + str(percentage) + "%") - high_plagiarism_detected = True - return high_plagiarism_detected, high_plagiarism_files - -def write_to_markdown(file_path, lines): - with open(file_path, 'w') as md_file: - for line in lines: - md_file.write(line + '\n') - log(f"Markdown file written to {file_path}") - -def main(): - if len(sys.argv) != 2: - log("Incorrect number of arguments provided.") - print("Usage: python extract_percentages.py ") - sys.exit(1) - - saved_dir_path = sys.argv[1] - high_plagiarism_detected, high_plagiarism_files = 
process_html_files(saved_dir_path) - - markdown_lines = ["# Plagiarism Report"] - if high_plagiarism_detected: - log("High plagiarism percentages detected.") - markdown_lines.append("## Game overlap report:") - markdown_lines.extend(high_plagiarism_files) - write_to_markdown("plagiarism-report.md", markdown_lines) - sys.exit(1) - else: - log("No high plagiarism percentages detected.") - log("Plagiarism report generation completed.") -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/.github/scripts/plagiarism_check.py b/.github/scripts/plagiarism_check.py index 4946bc127e..f15b255b2f 100644 --- a/.github/scripts/plagiarism_check.py +++ b/.github/scripts/plagiarism_check.py @@ -1,87 +1,156 @@ -import sys -import subprocess -import os -import glob -import shutil -import time - -def log(message): - timestamp = time.strftime("%Y-%m-%d %H:%M:%S") - print(f"[{timestamp}] {message}") - -def run_compare50(single_file, directory, output_dir, saved_dir_base): - try: - if not os.path.exists(saved_dir_base): - os.makedirs(saved_dir_base) - log("Created base directory for saved files.") - - all_js_files = glob.glob(os.path.join(directory, "*.js")) - total_files = len(all_js_files) - current_file_number = 0 - - for file in all_js_files: - current_file_number += 1 - if os.path.abspath(file) == os.path.abspath(single_file): - log(f"Skipping comparison for the same file: {file}") - continue - - log(f"Processing file {current_file_number} of {total_files}: {file}") - if os.path.exists(output_dir): - shutil.rmtree(output_dir) - log(f"Cleaned existing output directory: {output_dir}") - - command = [ - "compare50", - f'"{single_file}"', - f'"{file}"', - "--output", f'"{output_dir}"', - "--max-file-size", str(1024 * 1024 * 100), - "--passes", "text" - ] - - command_str = ' '.join(command) - log(f"Running command: {command_str}") - subprocess.run(command_str, shell=True, check=True) - log("Compare50 command executed successfully.") - - match_file = 
os.path.join(output_dir, "match_1.html") - - if os.path.exists(match_file): - new_filename = os.path.basename(file).replace('.js', '.html') - saved_file_path = os.path.join(saved_dir_base, new_filename) - log(f"Match found. Moving {match_file} to {saved_file_path}") - shutil.move(match_file, saved_file_path) - else: - log(f"No match found for file: {file}") - - except subprocess.CalledProcessError as e: - log(f"Error in running Compare50: {e}") - except Exception as e: - log(f"An error occurred: {e}") - -def main(): - if len(sys.argv) != 5: - log("Incorrect number of arguments provided.") - print("Usage: python plagiarism_check.py ") - sys.exit(1) - - single_file = sys.argv[1] - directory = sys.argv[2] - output_dir = sys.argv[3] - saved_dir_base = sys.argv[4] - - log(f"Starting plagiarism check with the following arguments:") - log(f"Single file: {single_file}") - log(f"Directory: {directory}") - log(f"Output directory: {output_dir}") - log(f"Saved directory base: {saved_dir_base}") - - log(f"Listing all JavaScript files in directory '{directory}':") - for f in glob.glob(os.path.join(directory, "*.js")): - log(f) - - run_compare50(single_file, directory, output_dir, saved_dir_base) - log("Plagiarism check completed.") - -if __name__ == "__main__": - main() \ No newline at end of file +#!/opt/homebrew/bin/python3 +from os import walk +import jsbeautifier +from multiprocessing import Pool +import nltk +from functools import cmp_to_key +import argparse +import re +import random + + +def load_files_from_dir(dir, suffix): + all_game_paths = [] + for (dirpath, dirnames, filenames) in walk(dir): + for filepath in filenames: + full_file_path = dirpath + "/" + filepath + if full_file_path.endswith(suffix): + all_game_paths.append(full_file_path) + return all_game_paths + + +def load_data(path): + dd = "".join(open(path, "r+").readlines()) + dlen = len(dd) + if dd is None or dlen == 0: + dd = "" + + return preprocess(dd) + + +def format_code(code): + options = 
jsbeautifier.default_options() + options.wrap_line_length = 80 + beautified_code = jsbeautifier.beautify(code, options) + return beautified_code + + +def remove_whitespace(code): + code = "".join([s for s in code.splitlines(True) if s.strip("\r\n")]) + return code + + +def filter_code(code): + code = re.sub('bitmap`.*?`', '', code, flags=re.MULTILINE | re.DOTALL) + code = re.sub('tune`.*?`', '', code, flags=re.MULTILINE | re.DOTALL) + code = re.sub('map`.*?`', '', code, flags=re.MULTILINE | re.DOTALL) + code = re.sub('^/\\*(.|[\r\n])*?\\*/', '', code, flags=re.DOTALL) + + return code + + +def preprocess(code): + return format_code(remove_whitespace(filter_code(code))) + + +def text_length_score(text1, text2): + return 1 - abs(len(text1) - len(text2)) / (len(text1) + len(text2)) + + +def tokenize(text, num_tokens): + return set(nltk.ngrams(text.lower().split(" "), num_tokens)) + + +def symmetrical_diff_score(text1, text2, num_tokens): + set1 = tokenize(text1, num_tokens) + set2 = tokenize(text2, num_tokens) + symmetrical_difference = set1.symmetric_difference(set2) + return 1.0 - (len(symmetrical_difference) / (len(set1) + len(set2))) + + +def compare(document, other_document): + len_score = text_length_score(document, other_document) + + return len_score * 0.1 + \ + symmetrical_diff_score(document, other_document, 1) * 0.2 + \ + symmetrical_diff_score(document, other_document, 2) * 0.3 + \ + symmetrical_diff_score(document, other_document, 3) * 0.4 + + +class DocumentComparison: + def __init__(self, path, score): + self.path = path + self.score = score + + +def create_doc_comparison(item): + return DocumentComparison(item[2], compare(item[0], item[1])) + + +def find_matching_docs(input_doc_path, all_games, threshold, log): + all_game_paths = all_games[:] + all_game_paths.remove(input_doc_path) + input_document = load_data(input_doc_path) + + with Pool() as P: + if log: + print("Comparing submission against %d gallery entries..." 
% len(all_game_paths)) + + # Create an array of multiple copies of the input document. This is required for parallelization. + input_docs = [input_document for i in range(len(all_game_paths))] + + # Zip input document array along w/ game paths and game data + all_data = zip(input_docs, P.map(load_data, all_game_paths), all_game_paths) + + # Rank documents + results = sorted(P.map(create_doc_comparison, all_data), key=cmp_to_key(lambda i1, i2: i2.score - i1.score)) + + if log: + print("Done!") + documents_exceeding_threshold = list(filter(lambda item: item.score > threshold, results)) + top_matches = documents_exceeding_threshold[:5] + if log: + if len(documents_exceeding_threshold) > 0: + print("%d gallery entries match the submission too closely." % len(documents_exceeding_threshold)) + else: + print("No similar documents found.") + if len(top_matches) > 0: + print("Here are the top %d matches" % len(top_matches)) + for document in top_matches: + print("%f - %s" % (document.score, document.path)) + + return len(documents_exceeding_threshold) + + +def check_all_games(): + global all_gallery_items + num_samples = 800 + all_gallery_items = load_files_from_dir(args.doc_dir, ".js") + random.shuffle(all_gallery_items) + all_good_games = [] + for gallery_item in all_gallery_items[:num_samples]: + print("Processing %s" % gallery_item) + if find_matching_docs(gallery_item, all_gallery_items, 0.5, False) == 0: + all_good_games.append(gallery_item) + print("Number of good games: %d" % len(all_good_games)) + print("Percentage of good games: %d%%" % int(float(len(all_good_games)) / float(num_samples) * 100.0)) + exit(1) + + +if __name__ == '__main__': + # run_tests() + # check_all_games() + + parser = argparse.ArgumentParser( + description='Compare an input javascript file w/ the contents of a directory, and returns similarity scores') + parser.add_argument('doc_dir', type=str, help='a path to a directory of documents') + parser.add_argument('threshold', type=float, + 
help='similarity threshold (above which duplicate warnings will be returned)') + parser.add_argument('input_doc', type=str, help='a path to an input document') + args = parser.parse_args() + + all_gallery_items = load_files_from_dir(args.doc_dir, ".js") + + if args.input_doc not in all_gallery_items: + print("Sorry - the input document must be a reference into the gallery directory.") + exit(1) + num_bad_docs = find_matching_docs(args.input_doc, all_gallery_items, args.threshold, True) + exit(0 if num_bad_docs == 0 else 1) diff --git a/.github/workflows/check_plagiarism.yml b/.github/workflows/check_plagiarism.yml index 46dc35ca42..514d92d4fa 100644 --- a/.github/workflows/check_plagiarism.yml +++ b/.github/workflows/check_plagiarism.yml @@ -20,8 +20,8 @@ jobs: with: python-version: '3.10' - - name: Install Compare50 && beautifulsoup4 - run: pip install compare50 beautifulsoup4 + - name: Install python requirements + run: pip install jsbeautifier nltk - name: Get list of changed files id: changed-files @@ -34,37 +34,4 @@ - name: Run Plagiarism Detection Script if: env.FILES != '' - run: python .github/scripts/plagiarism_check.py "${{ env.FILES }}" games output_dir saved_dir - - - name: Extract and Display Similarity Percentages - run: python .github/scripts/extract_percentages.py saved_dir/ - id: extract-percentages - - - name: Upload Compare50 Results as Artifacts - if: always() - uses: actions/upload-artifact@v4 - with: - name: compare50-results - path: saved_dir/ - - - name: Save PR number to file - if: always() - run: echo ${{ github.event.pull_request.number }} > pr_number.txt - - - name: Upload PR Number as Artifact - if: always() - uses: actions/upload-artifact@v4 - with: - name: pr-number - path: pr_number.txt - - - name: Upload Plagiarism Report as Artifact - if: always() - uses: actions/upload-artifact@v4 - with: - name: plagiarism-report - path: plagiarism-report.md - - - name: Check for High Plagiarism Percentages - if: always() && steps.extract-percentages.outcome == 'failure' - run: echo "Plagiarism
percentage over threshold detected." \ No newline at end of file + run: python .github/scripts/plagiarism_check.py ./games 0.5 "${{ env.FILES }}" \ No newline at end of file diff --git a/.github/workflows/workflow_run.yml b/.github/workflows/workflow_run.yml deleted file mode 100644 index 16ead939d1..0000000000 --- a/.github/workflows/workflow_run.yml +++ /dev/null @@ -1,165 +0,0 @@ -name: Send Plagiarism Result On CI Complete - -permissions: - actions: read - contents: read - issues: write - pull-requests: write - -on: - workflow_run: - workflows: ["Plagiarism Checker"] - types: - - completed - -jobs: - on_pr_finish: - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: List available artifacts with detailed logs - uses: actions/github-script@v7 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - script: | - const runId = ${{ github.event.workflow_run.id }}; - console.log(`Fetching artifacts for workflow run ID: ${runId}`); - const artifacts = await github.rest.actions.listWorkflowRunArtifacts({ - owner: context.repo.owner, - repo: context.repo.repo, - run_id: runId - }); - console.log(`Artifacts found: ${artifacts.data.total_count}`); - for (const artifact of artifacts.data.artifacts) { - console.log(`Artifact name: ${artifact.name}, ID: ${artifact.id}, Size: ${artifact.size_in_bytes} bytes`); - } - - - name: Download PR Number Artifact - uses: actions/github-script@v7 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - script: | - const fs = require('fs'); - const path = require('path'); - const runId = ${{ github.event.workflow_run.id }}; - const artifactName = 'pr-number'; - console.log(`Checking for artifact ${artifactName} from workflow run ID: ${runId}`); - try { - const artifacts = await github.rest.actions.listWorkflowRunArtifacts({ - owner: context.repo.owner, - repo: context.repo.repo, - run_id: runId - }); - const artifact = artifacts.data.artifacts.find(a => a.name === artifactName); - if (!artifact) {
- console.log(`Artifact '${artifactName}' not found, skipping download.`); - return; - } - const artifactData = await github.rest.actions.downloadArtifact({ - owner: context.repo.owner, - repo: context.repo.repo, - artifact_id: artifact.id, - archive_format: 'zip', - }); - const artifactPath = path.join(process.env.GITHUB_WORKSPACE, `${artifactName}.zip`); - fs.writeFileSync(artifactPath, Buffer.from(artifactData.data)); - console.log(`Artifact ${artifactName} downloaded to ${artifactPath}`); - require('child_process').execSync(`unzip -o ${artifactPath} -d ${process.env.GITHUB_WORKSPACE}`); - const prNumber = fs.readFileSync(path.join(process.env.GITHUB_WORKSPACE, 'pr_number.txt'), 'utf8').trim(); - console.log(`PR Number: ${prNumber}`); - } catch (error) { - console.log(`Error occurred: ${error.message}`); - console.log('Continuing workflow execution despite the error.'); - } - - - name: Download Plagiarism Report Artifact from Another Workflow Run - uses: actions/github-script@v7 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - script: | - const fs = require('fs'); - const path = require('path'); - const runId = ${{ github.event.workflow_run.id }}; - const artifactName = 'plagiarism-report'; - console.log(`Downloading artifact '${artifactName}' from workflow run ID: ${runId}`); - try { - const artifacts = await github.rest.actions.listWorkflowRunArtifacts({ - owner: context.repo.owner, - repo: context.repo.repo, - run_id: runId - }); - const artifact = artifacts.data.artifacts.find(a => a.name === artifactName); - if (!artifact) { - console.log(`Artifact '${artifactName}' not found, skipping download.`); - return; - } - const artifactData = await github.rest.actions.downloadArtifact({ - owner: context.repo.owner, - repo: context.repo.repo, - artifact_id: artifact.id, - archive_format: 'zip', - }); - const artifactPath = path.join(process.env.GITHUB_WORKSPACE, `${artifactName}.zip`); - fs.writeFileSync(artifactPath, Buffer.from(artifactData.data)); - 
console.log(`Artifact ${artifactName} downloaded to ${artifactPath}`); - require('child_process').execSync(`unzip -o ${artifactPath} -d ${process.env.GITHUB_WORKSPACE}`); - } catch (error) { - console.log(`Error occurred: ${error.message}`); - console.log('Continuing workflow execution despite the error.'); - } - - - name: Check if Plagiarism Report Exists - id: check-report - run: | - if unzip -l plagiarism-report.zip; then - echo "REPORT_EXISTS=true" >> $GITHUB_ENV - else - echo "REPORT_EXISTS=false" >> $GITHUB_ENV - fi - - - name: Unzip Plagiarism Report Artifact - if: env.REPORT_EXISTS == 'true' - run: unzip -o plagiarism-report.zip -d ${{ github.workspace }} - - - name: Fetch Pull Request Comments - id: fetch-comments - uses: actions/github-script@v7 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - script: | - const fs = require('fs'); - const path = require('path'); - const prNumber = fs.readFileSync(path.join(process.env.GITHUB_WORKSPACE, 'pr_number.txt'), 'utf8').trim(); - const comments = await github.rest.issues.listComments({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: prNumber - }); - const hasPlagiarismComment = comments.data.some(comment => comment.body.includes('[Plagiarism Check Result]')); - console.log(`Has Plagiarism Comment: ${hasPlagiarismComment}`); - return hasPlagiarismComment; - - - name: Post Markdown as Comment - if: env.REPORT_EXISTS == 'true' && steps.fetch-comments.outputs.result != 'true' - uses: actions/github-script@v7 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - script: | - const fs = require('fs'); - const path = require('path'); - const prNumber = fs.readFileSync(path.join(process.env.GITHUB_WORKSPACE, 'pr_number.txt'), 'utf8').trim(); - const markdownPath = path.join(process.env.GITHUB_WORKSPACE, 'plagiarism-report.md'); - console.log(`Reading the Markdown report from: ${markdownPath}`); - let markdownContent = fs.readFileSync(markdownPath, 'utf8'); - console.log("Fetching associated 
pull request..."); - console.log(`Found associated pull request: #${prNumber}`); - console.log("Posting the Markdown content as a comment..."); - const commentResponse = await github.rest.issues.createComment({ - issue_number: prNumber, - owner: context.repo.owner, - repo: context.repo.repo, - body: markdownContent - }); - console.log(`Comment posted successfully: ${commentResponse.data.html_url}`); \ No newline at end of file