diff --git a/.github/workflows/account_recovery.yml b/.github/workflows/account_recovery.yml
new file mode 100644
index 0000000..026557b
--- /dev/null
+++ b/.github/workflows/account_recovery.yml
@@ -0,0 +1,31 @@
+name: Issue Label Trigger
+
+on:
+  issues:
+    types: [labeled]
+
+jobs:
+  parse-issue:
+    runs-on: ubuntu-latest
+    if: contains(github.event.issue.labels.*.name, 'account-recovery')
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.x'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r .github/workflows/autoreplies/requirements.txt
+
+      - name: Run Python script
+        run: python .github/workflows/autoreplies/check_account_recovery.py
+        env:
+          ISSUE_NUMBER: ${{ github.event.issue.number }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_ISSUE_OWNER: ${{ github.repository_owner }}
+          GITHUB_ISSUE_REPO: ${{ github.event.repository.name }}
diff --git a/.github/workflows/autoreplies/__init__.py b/.github/workflows/autoreplies/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/.github/workflows/autoreplies/check_account_recovery.py b/.github/workflows/autoreplies/check_account_recovery.py
new file mode 100644
index 0000000..255835d
--- /dev/null
+++ b/.github/workflows/autoreplies/check_account_recovery.py
@@ -0,0 +1,187 @@
+"""Parse a GitHub issue to automatically aggregate package ownership information and facilitate account recovery.
+
+Steps
+1) Find all PyPI packages maintained by the user.
+2) Check each PyPI package to see if its source code repository listed on PyPI belongs to the GitHub user.
+3) Add a comment to the issue summarizing the package ownership information.
+
+If the GitHub user owns the source code repositories for all of the PyPI packages, or is an administrator of the GitHub
+organization that owns them, the issue is automatically labeled with "fasttrack".
+
+Environment Variables
+---------------------
+GITHUB_ISSUE_OWNER
+    The owner (e.g., "pypi") of the issue repository
+
+GITHUB_ISSUE_REPO
+    The repository (e.g., "support") where the issue is located
+
+ISSUE_NUMBER
+    The number of the issue to process
+
+GITHUB_TOKEN
+    (Optional) A GitHub token with permissions to comment on the issue and read the repository.
+"""
+
+import os
+import sys
+
+import pypi_utils
+import gh_utils
+
+
+# Issue body headers
+PYPI_USER_HEADER = "PyPI Username"
+
+# Ownership status levels
+BELONGS = 0
+ORG_ADMIN = 1
+ORG_MEMBER = 2
+UNKNOWN_OWNERSHIP = 3
+NO_REPO = 4
+
+# This notice indicates that the final determination of account recovery rests with the PyPI team
+BOT_NOTICE = (
+    "### NOTE\n\n"
+    "_This action was performed automatically by a bot and **does not guarantee account recovery**. Account recovery"
+    " requires manual approval processing by the PyPI team._"
+)
+
+
+def sanitize_pypi_user(username: str) -> str:
+    """Remove any backticks from the username.
+
+    Some users write their usernames like:
+        `username`
+    for pretty markdown purposes, but we don't want the backticks.
+    """
+    return username.strip().replace("`", "")
+
+
+def format_markdown_table(rows: list) -> str:
+    """Format a list of rows into a markdown table.
+
+    Parameters
+    ----------
+    rows: list
+        A list of rows to format into a table. Each row should be [package_link, repo_url, ownership_level] where
+        ownership_level is an int indicating which column to mark with an "X".
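+
+        For example (illustrative values), a row like
+            ["[pkg](https://pypi.org/project/pkg)", "https://github.com/someuser/pkg", 0]
+        would produce a table row with an "X" in the "Owner" column.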
+    """
+    header = ["Package", "Repository", "Owner", "Admin", "Member", "Unknown", "No Repo"]
+    row_strings = []
+    row_strings.append(" | ".join(header))
+    row_strings.append(" | ".join(["---"] * 2 + [":-:"] * (len(header) - 2)))
+    for row in rows:
+        row_fields = [""] * len(header)
+        row_fields[0] = row[0]
+        row_fields[1] = row[1]
+        row_fields[2 + row[2]] = "X"
+        row_strings.append(" | ".join(row_fields))
+    return "\n".join(row_strings)
+
+
+def format_markdown_package_link(package_name: str) -> str:
+    return f"[{package_name}](https://pypi.org/project/{package_name})"
+
+
+def format_markdown_pypi_user_link(pypi_user: str) -> str:
+    return f"[{pypi_user}](https://pypi.org/user/{pypi_user}/)"
+
+
+def format_markdown_gh_user_link(gh_user: str) -> str:
+    return f"[{gh_user}](https://github.com/{gh_user}/)"
+
+
+if __name__ == "__main__":
+    issue_number = os.environ.get("ISSUE_NUMBER", "4386")
+    github_token = os.environ.get("GITHUB_TOKEN", None)
+    github_issue_owner = os.environ.get("GITHUB_ISSUE_OWNER", "pypi")
+    github_issue_repo = os.environ.get("GITHUB_ISSUE_REPO", "support")
+
+    issue_data = gh_utils.fetch_issue_details(
+        github_issue_owner, github_issue_repo, issue_number, github_token=github_token
+    )
+
+    gh_user = issue_data["user"]
+    gh_user_link = format_markdown_gh_user_link(gh_user)
+
+    if PYPI_USER_HEADER not in issue_data["body"]:
+        raise ValueError(f"Issue body does not contain expected header: {PYPI_USER_HEADER}")
+
+    pypi_user = sanitize_pypi_user(issue_data["body"][PYPI_USER_HEADER])
+    pypi_user_link = format_markdown_pypi_user_link(pypi_user)
+
+    packages = pypi_utils.get_packages_by_user(pypi_user)
+
+    # If the PyPI user is not a maintainer of any packages
+    if not packages:
+        gh_utils.add_issue_comment(
+            f"User {pypi_user_link} has no packages",
+            github_issue_owner,
+            github_issue_repo,
+            issue_number,
+            github_token=github_token,
+        )
+        sys.exit()
+
+    # Loop over all packages to see if they belong to the user
+    package_ownership = []  # List of [package_link, repo_url, ownership_status]
+    for package_name in packages:
+        pypi_package_link = format_markdown_package_link(package_name)
+        package = pypi_utils.get_pypi_project_info(package_name)
+
+        # Package has no source code repo listed on PyPI
+        if "repository_url" not in package or not package["repository_url"]:
+            package_ownership.append([pypi_package_link, "", NO_REPO])
+            continue
+
+        package_repo_url = package["repository_url"]
+
+        # Package source repo directly belongs to the gh_user
+        if gh_utils.does_user_own_repo(package_repo_url, gh_user):
+            package_ownership.append([pypi_package_link, package_repo_url, BELONGS])
+            continue
+
+        # If the package source repo belongs to an organization, check whether the gh_user is a member or admin
+        org_status = gh_utils.get_user_role_in_org(package_repo_url, gh_user, github_token=github_token)
+        if org_status == "admin":
+            package_ownership.append([pypi_package_link, package_repo_url, ORG_ADMIN])
+        elif org_status == "member":
+            package_ownership.append([pypi_package_link, package_repo_url, ORG_MEMBER])
+        # Otherwise the source repo may not belong to the gh_user
+        else:
+            package_ownership.append([pypi_package_link, package_repo_url, UNKNOWN_OWNERSHIP])
+
+    # Add a comment to the issue with the package ownership information
+    table = format_markdown_table(package_ownership)
+
+    # Count how many packages are not owned or administered by the user
+    num_unverified = len([row for row in package_ownership if row[2] > ORG_ADMIN])
+
+    if num_unverified == 0:
+        label = "fasttrack"
+    else:
+        label = ""
+
+    comment = "\n\n".join(["### Package Ownership", table, BOT_NOTICE])
+
+    try:
+        gh_utils.add_issue_comment(
+            comment, github_issue_owner, github_issue_repo, issue_number, github_token=github_token
+        )
+    except Exception as e:
+        print(f"Failed to add comment to issue {issue_number}: {e}")
+        print("Comment:")
+        print(comment)
+
+    if label:
+        try:
+            gh_utils.add_label_to_issue(
+                label, github_issue_owner, github_issue_repo, issue_number, github_token=github_token
+            )
+        except Exception as e:
+            print(f"Failed to add label to issue {issue_number}: {e}")
diff --git a/.github/workflows/autoreplies/gh_utils.py b/.github/workflows/autoreplies/gh_utils.py
new file mode 100644
index 0000000..06a901c
--- /dev/null
+++ b/.github/workflows/autoreplies/gh_utils.py
@@ -0,0 +1,202 @@
+import re
+from urllib.parse import urlparse
+
+import requests
+
+
+def fetch_issue_details(gh_user: str, repo_name: str, issue_number, github_token=None) -> dict:
+    """Fetch issue details using the GitHub API."""
+    headers = {"Authorization": f"token {github_token}"} if github_token else {}
+
+    url = f"https://api.github.com/repos/{gh_user}/{repo_name}/issues/{issue_number}"
+    response = requests.get(url, headers=headers)
+    if response.status_code == 200:
+        return parse_issue_details(response.json())
+    raise ValueError(f"Failed to fetch issue details: {response.status_code}")
+
+
+def parse_issue_details(issue: dict) -> dict:
+    """Parse GitHub issue metadata to retrieve the relevant fields."""
+    body = parse_issue_body(issue["body"])
+    return {
+        "created_at": issue["created_at"],
+        "user": issue["user"]["login"],
+        "url": issue["html_url"],
+        "body": body,
+    }
+
+
+def parse_issue_body(body: str) -> dict:
+    """Parse the body of a GitHub issue into a dictionary.
+
+    This function works well with the issue templates, though it may run into trouble if users include "### " in
+    their own body text.
+
+    For example:
+
+        ### Some header
+
+        abcd 123 ab
+        cdefg
+
+        ### Another header
+
+        Lorem ipsum dolor sit amet, consectetur adipiscing elit.
+
+    will get processed to:
+
+        {
+            "Some header": "abcd 123 ab\ncdefg",
+            "Another header": "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
+        }
+
+    Parameters
+    ----------
+    body: str
+        The body of the issue.
+
+    Returns
+    -------
+    dict
+        A dictionary with the issue text keyed by the markdown headers (h3)
+    """
+    RE_GH_ISSUE_HEADER = re.compile(r"### (?P<key>.+)")  # This finds lines beginning with "### " to use as keys
+    body_dict = {}
+    cur_key = None
+    cur_lines = []
+    for line in body.strip().split("\n"):
+        line = line.strip()
+        if not line:
+            continue
+        header_match = RE_GH_ISSUE_HEADER.match(line)
+        if header_match:
+            if cur_key:
+                body_dict[cur_key] = "\n".join(cur_lines)
+                cur_lines = []
+            cur_key = header_match.group("key")
+        else:
+            cur_lines.append(line)
+    # Flush the final section, which has no following header to trigger the save above
+    if cur_key:
+        body_dict[cur_key] = "\n".join(cur_lines)
+    return body_dict
+
+
+def _sanitize_url(url: str) -> str:
+    """Ensure the URL starts with "http://" or "https://", and lowercase it since GitHub is case-insensitive."""
+    url = url.lower()
+    if not url.startswith("http"):
+        url = f"https://{url}"
+    return url
+
+
+def _is_user_in_org(org_name, username, github_token=None):
+    """Return True if the user is a publicly listed member of the organization."""
+    url = f"https://api.github.com/orgs/{org_name}/members/{username}"
+    headers = {"Authorization": f"token {github_token}"} if github_token else {}
+    response = requests.get(url, headers=headers)
+    return response.status_code == 204
+
+
+def _is_user_owner_of_org(org_name, username, github_token=None):
+    """Return True if the user is an owner of the organization."""
+    url = f"https://api.github.com/orgs/{org_name}/memberships/{username}"
+    headers = {"Authorization": f"token {github_token}"} if github_token else {}
+    response = requests.get(url, headers=headers)
+    if response.status_code == 200:
+        membership_info = response.json()
+        return membership_info.get("role") == "admin"
+    return False
+
+
+def _is_github_pages_belonging_to_owner(code_repo_url: str, gh_user: str) -> bool:
+    """Return True if the URL is a GitHub Pages URL for the GitHub user's account."""
+    parsed_url = urlparse(_sanitize_url(code_repo_url))
+
+    # Normalize domain
+    hostname = parsed_url.hostname or ""
+    hostname = hostname.replace("www.", "")
+    return hostname == f"{gh_user}.github.io".lower()
+
+
+def _is_github_repo_belonging_to_owner(code_repo_url: str, gh_user: str) -> bool:
+    """Return True if the URL is a GitHub repo associated with the GitHub user's account."""
+    parsed_url = urlparse(_sanitize_url(code_repo_url))
+
+    # Normalize domain
+    hostname = parsed_url.hostname or ""
+    hostname = hostname.replace("www.", "")
+
+    # Check if the domain is github.com
+    if hostname != "github.com":
+        return False
+
+    # Split the path to analyze its parts
+    path_parts = parsed_url.path.strip("/").split("/")
+
+    # Check if the first part of the path is 'gh_user'
+    return path_parts and path_parts[0] == gh_user.lower()
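+
+
+# Illustrative examples for the two helpers above (the user "octocat" and repo names are hypothetical):
+#   _is_github_repo_belonging_to_owner("https://github.com/octocat/widget", "octocat")   -> True
+#   _is_github_pages_belonging_to_owner("https://octocat.github.io/widget", "octocat")   -> True
+#   _is_github_repo_belonging_to_owner("https://gitlab.com/octocat/widget", "octocat")   -> False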
+
+
+def get_user_role_in_org(code_repo_url: str, gh_user: str, github_token=None) -> str:
+    """Determine the role of the user in the organization that owns the repository.
+
+    Parameters
+    ----------
+    code_repo_url: str
+        The URL of the repository. This can be a GitHub Pages URL or a GitHub repository URL.
+
+    gh_user: str
+        The GitHub username to check for.
+
+    github_token: str
+        (Optional) The GitHub token to use for API requests.
+
+    Returns
+    -------
+    str
+        "member" or "admin", or an empty string if the user is not in the organization.
+    """
+    parsed_url = urlparse(_sanitize_url(code_repo_url))
+
+    # Normalize domain
+    hostname = parsed_url.hostname or ""
+    hostname = hostname.replace("www.", "").lower()
+
+    RE_GH_PAGES = re.compile(r"^(?P<org_name>.+)\.github\.io$")
+    pages_match = re.match(RE_GH_PAGES, hostname)
+    if pages_match:
+        org_name = pages_match.group("org_name")
+    elif hostname == "github.com":
+        org_name = parsed_url.path.strip("/").split("/")[0]
+    else:
+        return ""
+
+    if _is_user_in_org(org_name, gh_user, github_token=github_token):
+        if _is_user_owner_of_org(org_name, gh_user, github_token=github_token):
+            return "admin"
+        return "member"
+    return ""
+
+
+def does_user_own_repo(code_repo_url: str, gh_user: str) -> bool:
+    """Return True if the GitHub user owns the repository."""
+    return _is_github_repo_belonging_to_owner(code_repo_url, gh_user) or _is_github_pages_belonging_to_owner(
+        code_repo_url, gh_user
+    )
+
+
+def add_issue_comment(comment: str, gh_user: str, repo_name: str, issue_number, github_token=None):
+    """Add a comment to a GitHub issue."""
+    headers = {"Authorization": f"token {github_token}"} if github_token else {}
+    url = f"https://api.github.com/repos/{gh_user}/{repo_name}/issues/{issue_number}/comments"
+    response = requests.post(url, json={"body": comment}, headers=headers)
+    if response.status_code != 201:
+        raise ValueError(f"Failed to add comment: {response.status_code}")
+    return response.json()
+
+
+def add_label_to_issue(label: str, gh_user: str, repo_name: str, issue_number, github_token=None):
+    """Add a label to a GitHub issue."""
+    headers = {"Authorization": f"token {github_token}"} if github_token else {}
+    url = f"https://api.github.com/repos/{gh_user}/{repo_name}/issues/{issue_number}/labels"
+    response = requests.post(url, json=[label], headers=headers)
+    if response.status_code != 200:
+        raise ValueError(f"Failed to add label: {response.status_code}")
+    return response.json()
diff --git a/.github/workflows/autoreplies/pypi_utils.py b/.github/workflows/autoreplies/pypi_utils.py
new file mode 100644
index 0000000..7ccceac
--- /dev/null
+++ b/.github/workflows/autoreplies/pypi_utils.py
@@ -0,0 +1,91 @@
+import re
+import time
+from typing import Dict
+
+import requests
+from bs4 import BeautifulSoup
+
+
+def get_packages_by_user(username: str) -> list:
+    """Parse HTML to get a list of packages for a given PyPI user.
+
+    The PyPI API does not provide a way to get a list of packages for a user, hence crawling the HTML.
+
+    Steps:
+    1) Queries the PyPI user page for the given username.
+    2) Parses the HTML to get the number of projects and the list of packages. This assumes that the number of
+       projects listed on the page is in the first <h2> tag, in the form "X project" or "X projects".
+    3) Loops over all <a class="package-snippet"> elements to get the package names.
+    4) Ensures that the number of packages found is equal to the number of projects reported. If not, raises an error.
+    5) Returns the list of package names.
+
+    Step 2 is to avoid having to handle pagination of projects. As of now the user with the most projects I have seen
+    has 43, and there was no pagination. If pagination is detected, this function will raise an error.
+
+    Parameters
+    ----------
+    username: str
+        The PyPI username to search for.
+
+    Returns
+    -------
+    list
+        A list of package names
+    """
+    time.sleep(1)
+    url = f"https://pypi.org/user/{username}/"
+    response = requests.get(url)
+    if response.status_code == 200:
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        # Get the reported number of projects maintained by this user, to ensure we later don't miss any
+        num_projects_text = soup.find("h2").text.lower()
+        num_projects_text = num_projects_text.replace("no projects", "0 projects")
+
+        RE_PROJECT_COUNT = re.compile(r"\s*(?P<num_projects>\d+)\s*project(?:s)?")
+        re_num_project_match = RE_PROJECT_COUNT.match(num_projects_text)
+        if not re_num_project_match:
+            raise ValueError(f"Could not determine the number of projects for user {username}")
+
+        num_projects = int(re_num_project_match.group("num_projects"))
+        packages = [a.text.strip().split("\n")[0] for a in soup.find_all("a", class_="package-snippet")]
+        # Check for pagination: if num_projects > len(packages) then there are probably more pages
+        # which aren't handled here yet
+        if len(packages) != num_projects:
+            raise ValueError(f"num_projects {num_projects} != num_packages {len(packages)} for user {username}")
+        return packages
+    raise ValueError(f"Error retrieving project data for user {username}")
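+
+# Illustrative example: get_packages_by_user("some-user") returns a plain list of project names,
+# e.g. ["example-pkg-one", "example-pkg-two"] ("some-user" and the names are hypothetical).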
+
+
+def get_pypi_project_info(package_name: str) -> Dict[str, str]:
+    """Retrieve relevant information about a PyPI project.
+
+    Parameters
+    ----------
+    package_name: str
+        The name of the package to query.
+
+    Returns
+    -------
+    Dict[str, str]
+        A dictionary containing the following keys:
+        - repository_url ("" if no repository or homepage is listed)
+        - author
+        - author_email
+    """
+    time.sleep(1)
+    url = f"https://pypi.org/pypi/{package_name}/json"
+    response = requests.get(url)
+    if response.status_code != 200:
+        raise ValueError(f"Error retrieving project info for {package_name}")
+
+    data = response.json()
+    info = data.get("info", {})
+    project_urls = info.get("project_urls", {}) or {}
+    author = info.get("author")
+    author_email = info.get("author_email")
+    return {
+        "repository_url": project_urls.get("Source", project_urls.get("Homepage", "")),
+        "author": author,
+        "author_email": author_email,
+    }
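+
+# Illustrative example: get_pypi_project_info("sampleproject") might return something like
+#     {"repository_url": "https://github.com/pypa/sampleproject", "author": "...", "author_email": "..."}
+# (the exact values depend on the package's metadata; "sampleproject" is just an example package name).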
diff --git a/.github/workflows/autoreplies/requirements.txt b/.github/workflows/autoreplies/requirements.txt
new file mode 100644
index 0000000..a4d20e8
--- /dev/null
+++ b/.github/workflows/autoreplies/requirements.txt
@@ -0,0 +1,2 @@
+beautifulsoup4>=4.9.1
+requests>=2.24.0
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9ac2dd8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,163 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/