Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix URL check workflow #115

Merged
merged 17 commits into from
Aug 8, 2024
Merged
12 changes: 12 additions & 0 deletions .github/ISSUE_TEMPLATE_URL_CHECK.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
title: "🚨 Broken URLs Detected: {{ env.TOTAL_ISSUES }} issues found"
labels: bug, documentation
---

## URL Check Results

### Issues Found: {{ env.TOTAL_ISSUES }}

| File | Line | URL | Status Code | Reason |
|------|------|-----|-------------|--------|
{{ env.ISSUE_TABLE }}
64 changes: 30 additions & 34 deletions .github/workflows/url_check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@ name: Broken URL Check

on:
schedule:
- cron: '0 0 * * 1' # Runs every Monday at 00:00 UTC
workflow_dispatch: # Allows manual triggering of the workflow
- cron: '0 0 * * 1'
workflow_dispatch:

jobs:
url-check:
runs-on: ubuntu-latest
permissions:
contents: read
issues: write

steps:
- name: Checkout code
Expand All @@ -27,40 +30,33 @@ jobs:
id: run_checker
run: |
output=$(python scripts/urlcheck.py)
echo "checker_output<<EOF" >> $GITHUB_OUTPUT
echo 'checker_output<<EOF' >> $GITHUB_OUTPUT
echo "$output" >> $GITHUB_OUTPUT
echo "EOF" >> $GITHUB_OUTPUT
echo 'EOF' >> $GITHUB_OUTPUT
env:
CHECK_PATH: './pages/'

- name: Create Issue with Results
if: always()
uses: actions/github-script@v6
- name: Process results
id: process_results
run: |
output='${{ steps.run_checker.outputs.checker_output }}'
echo "Raw output:"
echo "$output"
json_output=$(echo "$output" | grep -Eo '\{.*\}')
total_issues=$(echo "$json_output" | jq -r '.total_issues')
echo "TOTAL_ISSUES=$total_issues" >> $GITHUB_ENV

if [ "$total_issues" -gt 0 ]; then
issue_table=$(echo "$json_output" | jq -r '.issues[] | "| \(.file) | \(.line) | \(.url) | \(.status_code) | \(.reason) |"' | sed -e 's/^/ /')
echo 'ISSUE_TABLE<<EOF' >> $GITHUB_ENV
echo "$issue_table" >> $GITHUB_ENV
echo 'EOF' >> $GITHUB_ENV
fi

- name: Create Issue
if: ${{ env.TOTAL_ISSUES > 0 }}
uses: JasonEtco/create-an-issue@v2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
github-token: ${{secrets.GITHUB_TOKEN}}
script: |
const output = JSON.parse('${{ steps.run_checker.outputs.checker_output }}');
const issueTitle = output.status === "issues_found"
? `Broken URLs Detected: ${output.total_issues} issues found`
: "URL Check Completed - No Issues Found";

let issueBody = `# URL Check Results\n\n`;
if (output.status === "issues_found") {
issueBody += `## Issues Found: ${output.total_issues}\n\n`;
output.issues.forEach(issue => {
issueBody += `- File: ${issue.file}, Line: ${issue.line}\n`;
issueBody += ` URL: ${issue.url}\n`;
issueBody += ` Status Code: ${issue.status_code}, Reason: ${issue.reason}\n`;
issueBody += ` Final URL: ${issue.final_url}\n\n`;
});
} else {
issueBody += "No broken URLs detected in this check.";
}

await github.rest.issues.create({
owner: context.repo.owner,
repo: context.repo.name,
title: issueTitle,
body: issueBody,
labels: ['url-check']
});
filename: .github/ISSUE_TEMPLATE_URL_CHECK.md
51 changes: 26 additions & 25 deletions scripts/urlcheck.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import os
import re
import requests
import json
import sys
import requests
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed

INTERNAL_404_URL = "https://github.com/sei-protocol/sei-docs/blob/main/pages/404.mdx"
MAX_WORKERS = 5 # Adjust based on your needs and GitHub Actions limitations
MAX_WORKERS = 5

def check_url_status(url):
try:
Expand Down Expand Up @@ -35,7 +36,7 @@ def process_file(file_path):
for url in urls:
if is_valid_url(url):
status_code, reason, final_url = check_url_status(url)
if status_code and (status_code not in {200, 403, 415} or final_url == INTERNAL_404_URL):
if status_code and status_code not in {200, 403, 415, 501} and final_url != INTERNAL_404_URL:
file_report.append({
'file': file_path,
'line': line_number,
Expand All @@ -48,41 +49,41 @@ def process_file(file_path):
print(f"Error reading file {file_path}: {str(e)}")
return file_report

def check_files_in_directory(directory):
def check_location(location):
all_reports = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
future_to_file = {}
for root, _, files in os.walk(directory):
for file in files:
if file.endswith(('.md', '.mdx')):
file_path = os.path.join(root, file)
future = executor.submit(process_file, file_path)
future_to_file[future] = file_path
if os.path.isfile(location):
future = executor.submit(process_file, location)
future_to_file[future] = location
else:
for root, _, files in os.walk(location):
for file in files:
if file.endswith(('.md', '.mdx')):
file_path = os.path.join(root, file)
future = executor.submit(process_file, file_path)
future_to_file[future] = file_path

for future in as_completed(future_to_file):
file_path = future_to_file[future]
try:
report = future.result()
all_reports.extend(report)
except Exception as exc:
print(f'{file_path} generated an exception: {exc}')

return all_reports

def generate_report(report):
output = {}
if report:
output["status"] = "issues_found"
output["total_issues"] = len(report)
output["issues"] = report
else:
output["status"] = "no_issues_found"
output["total_issues"] = 0

print(json.dumps(output, indent=2))
output = {
"total_issues": len(report),
"issues": report
}
return json.dumps(output)

if __name__ == "__main__":
check_path = os.environ.get('CHECK_PATH', './pages/')
report = check_files_in_directory(check_path)
generate_report(report)

# Set exit code for GitHub Actions
exit(len(report)) # Exit code is the number of issues found
print(f"Checking URLs in location: {check_path}", file=sys.stderr)
report = check_location(check_path)
output = generate_report(report)
print(output)
Loading