Merge pull request #105 from sei-protocol/Cordt-actions
Add GitHub Actions workflow to crawl docs pages for broken URLs
cordt-sei authored Jul 27, 2024
2 parents 453759d + 76d2310 commit 092ad27
Showing 2 changed files with 93 additions and 0 deletions.
33 changes: 33 additions & 0 deletions .github/workflows/url_check.yml
@@ -0,0 +1,33 @@
name: Broken URL Check

on:
  schedule:
    # Runs every Monday at 00:00 UTC
    - cron: '0 0 * * 1'
  workflow_dispatch: # Allows manual triggering of the workflow

defaults:
  run:
    shell: bash

jobs:
  url-check:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests

      - name: Run link checker
        run: |
          python scripts/urlcheck.py
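
Since the workflow only fires on the Monday cron or a manual `workflow_dispatch`, a local dry run is the quickest way to preview what the "Run link checker" step will print. A minimal sketch, assuming the repository root as the working directory and `requests` already installed (the same prerequisites the workflow sets up for itself):

```python
# Local dry run of the command the "Run link checker" step executes.
# Assumes: repo root as cwd, `pip install requests` already done.
import subprocess

result = subprocess.run(
    ["python", "scripts/urlcheck.py"],
    capture_output=True,
    text=True,
)
# The report goes to stdout, one block per broken URL, just as it
# would appear in the Actions log.
print(result.stdout)
```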
60 changes: 60 additions & 0 deletions scripts/urlcheck.py
@@ -0,0 +1,60 @@
import os
import re
import requests
import socket

def check_url_status(url):
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.status_code, response.reason
    except requests.RequestException as e:
        return None, str(e)

def find_urls(text):
    # Only match valid URLs starting with http:// or https://
    url_pattern = re.compile(r'https?://[^\s"\'<>\)]*')
    return url_pattern.findall(text)

def is_valid_url(url):
    try:
        domain = re.findall(r'://([^/]+)', url)[0]
        socket.gethostbyname(domain)  # Check if domain resolves to an IP
        return True
    except (socket.gaierror, IndexError):
        return False

def check_files_in_directory(directory):
    report = []

    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(('.md', '.mdx')):  # Check both .md and .mdx files
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    for line_number, line in enumerate(f, 1):
                        urls = find_urls(line)
                        for url in urls:
                            if is_valid_url(url):
                                status_code, reason = check_url_status(url)
                                # Exclude specific status codes from report
                                if status_code and status_code not in {200, 403, 415}:
                                    report.append({
                                        'file': file_path,
                                        'line': line_number,
                                        'url': url,
                                        'status_code': status_code,
                                        'reason': reason
                                    })
    return report

def generate_report(report):
    for item in report:
        print(f"File: {item['file']}, Line: {item['line']}")
        print(f"URL: {item['url']}")
        print(f"Status Code: {item['status_code']}, Reason: {item['reason']}")
        print("-" * 40)

if __name__ == "__main__":
    check_path = './pages/'  # path to check
    report = check_files_in_directory(check_path)
    generate_report(report)
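
For a sense of what the helpers return, here is a minimal sketch that feeds one line of Markdown through `find_urls`, `is_valid_url`, and `check_url_status`. The sample line and `https://example.com/page` are illustrative, not taken from the docs, and the `sys.path` insertion is an assumption about running it from the repo root:

```python
# Illustrative only: the sample line and URL are hypothetical, and the
# import assumes scripts/ has been added to sys.path (run from repo root).
import sys
sys.path.insert(0, "scripts")

from urlcheck import find_urls, is_valid_url, check_url_status

line = "See the [docs](https://example.com/page) for details."
for url in find_urls(line):
    if is_valid_url(url):           # skip domains that do not resolve
        status, reason = check_url_status(url)
        print(url, status, reason)  # e.g. https://example.com/page 200 OK
```

Note one design consequence: the script always exits 0, so the scheduled Actions job stays green even when the report lists broken links; failing the run would take an explicit nonzero exit when `report` is non-empty.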
