Ruff, dependency upgrades (#112)
* Ruff

* PR feedback, removed unused pylint excepts
dogversioning committed Feb 22, 2024
1 parent 2dea9d6 commit d06c1af
Showing 36 changed files with 981 additions and 432 deletions.
24 changes: 9 additions & 15 deletions .github/workflows/ci.yaml
@@ -10,7 +10,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9
python-version: '3.10'

- name: Install dependencies
run: |
@@ -23,23 +23,17 @@ jobs:
lint:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Install linters
run: |
python -m pip install --upgrade pip
pip install ".[dev]"
- name: Run pycodestyle
run: |
pycodestyle scripts src tests --max-line-length=88
- name: Run pylint
if: success() || failure() # still run pylint if above checks fail
run: |
pylint scripts src tests
- name: Run bandit
if: success() || failure() # still run bandit if above checks fail
run: |
bandit -r scripts src
- name: Run black
- name: Run ruff
if: success() || failure() # still run ruff if the above checks fail
run: |
black --check --verbose .
ruff check
ruff format --check
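
For local parity with the updated lint job, a minimal sketch of a hypothetical helper script (not part of this commit) that runs the same two ruff commands the job now uses:

import subprocess
import sys


def lint() -> int:
    """Mirror the CI lint job locally: ruff check, then ruff format --check."""
    for cmd in (["ruff", "check"], ["ruff", "format", "--check"]):
        result = subprocess.run(cmd, check=False)  # let ruff print its own findings
        if result.returncode != 0:
            return result.returncode
    return 0


if __name__ == "__main__":
    sys.exit(lint())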
23 changes: 8 additions & 15 deletions .pre-commit-config.yaml
@@ -1,17 +1,10 @@
default_install_hook_types: [pre-commit, pre-push]
repos:
- repo: https://github.com/psf/black
#this version is synced with the black mentioned in .github/workflows/ci.yml
rev: 22.12.0
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.2.1
hooks:
- id: black
entry: bash -c 'black "$@"; git add -u' --
# It is recommended to specify the latest version of Python
# supported by your project here, or alternatively use
# pre-commit's default_language_version, see
# https://pre-commit.com/#top_level-default_language_version
language_version: python3.9
- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
args: ["--profile", "black", "--filter-files"]
- name: Ruff formatting
id: ruff-format
- name: Ruff linting
id: ruff
stages: [pre-push]
46 changes: 25 additions & 21 deletions pyproject.toml
@@ -1,15 +1,16 @@
[project]
name = "aggregator"
requires-python = ">= 3.9"
version = "0.1.3"
requires-python = ">= 3.10"
version = "0.3.0"
# This project is designed to run on the AWS serverless application framework (SAM).
# The project dependencies are handled via AWS layers. These are only required for
# local development.
dependencies= [
"arrow >=1.2.3",
"awswrangler >=2.19.0, <3",
"awswrangler >=3.5, <4",
"boto3",
"pandas >=1.5.0, <2"
"pandas >=2, <3",
"rich"
]
authors = [
{ name="Matt Garber", email="matthew.garber@childrens.harvard.edu" },
@@ -45,23 +46,26 @@ test = [
"pytest-mock"
]
dev = [
"bandit",
"black==22.12.0",
"isort==5.12.0",
"ruff == 0.2.1",
"pre-commit",
"pylint",
"pycodestyle"
]
[tool.ruff]
target-version = "py310"

[tool.coverage.run]
command_line="-m pytest"
source=["./src/"]

[tool.coverage.report]
show_missing=true

[tool.isort]
profile = "black"
src_paths = ["src", "tests"]
skip_glob = [".aws_sam"]

[tool.ruff.lint]
select = [
"A", # prevent using keywords that clobber python builtins
"B", # bugbear: security warnings
"E", # pycodestyle
"F", # pyflakes
"I", # isort
"ISC", # implicit string concatenation
"PLE", # pylint errors
"RUF", # the ruff developer's own rules
"UP", # alert you when better syntax is available in your python version
]
ignore = [
# Recommended ignore from `ruff format` due to in-project conflicts with check.
# It's expected that this will be fixed in the coming months.
"ISC001"
]
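
As a small, hypothetical illustration of what the selected "UP" (pyupgrade) rules push toward under target-version py310: Optional[...] annotations are rewritten as PEP 604 unions, the same str | None spelling used in the _get_table_cols signature change later in this diff.

# Older spelling the "UP" rules would flag on Python 3.10:
#     from typing import Optional
#     def latest_version(name: str, default: Optional[str] = None) -> Optional[str]: ...
# Preferred spelling (hypothetical function, for illustration only):
def latest_version(name: str, default: str | None = None) -> str | None:
    """Return the pinned version for a package, or the default when none is set."""
    return default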
2 changes: 1 addition & 1 deletion scripts/cumulus_upload_data.py
@@ -107,7 +107,7 @@ def upload_file(cli_args):
if args["test"]:
args_dict["user"] = os.environ.get("CUMULUS_TEST_UPLOAD_USER", "general")
args_dict["file"] = (
f"{str(Path(__file__).resolve().parents[1])}"
f"{Path(__file__).resolve().parents[1]!s}"
f"/tests/test_data/count_synthea_patient.parquet"
)
args_dict["auth"] = os.environ.get("CUMULUS_TEST_UPLOAD_AUTH", "secretval")
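
The change above is most likely ruff's RUF010 rewrite: the !s conversion flag is the f-string spelling of an explicit str() call. A minimal equivalence check (the path below is hypothetical):

from pathlib import Path

p = Path("/tmp/example.parquet")  # hypothetical path
assert f"{p!s}" == f"{str(p)}" == str(p)  # all three produce the same string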
86 changes: 86 additions & 0 deletions scripts/migrations/migration.002.column_types.py
@@ -0,0 +1,86 @@
""" Adds a new metadata type, column_types """

import argparse
import io
import json

import boto3
import pandas
from rich import progress


def get_csv_column_datatypes(dtypes):
"""helper for generating column type for dashboard API"""
column_dict = {}
for column in dtypes.index:
if column.endswith("year"):
column_dict[column] = "year"
elif column.endswith("month"):
column_dict[column] = "month"
elif column.endswith("week"):
column_dict[column] = "week"
elif column.endswith("day") or str(dtypes[column]) == "datetime64":
column_dict[column] = "day"
elif "cnt" in column or str(dtypes[column]) in (
"Int8",
"Int16",
"Int32",
"Int64",
"UInt8",
"UInt16",
"UInt32",
"UInt64",
):
column_dict[column] = "integer"
elif str(dtypes[column]) in ("Float32", "Float64"):
column_dict[column] = "float"
elif str(dtypes[column]) == "boolean":
column_dict[column] = "float"
else:
column_dict[column] = "string"
return column_dict


def _put_s3_data(key: str, bucket_name: str, client, data: dict) -> None:
"""Convenience class for writing a dict to S3"""
b_data = io.BytesIO(json.dumps(data).encode())
client.upload_fileobj(Bucket=bucket_name, Key=key, Fileobj=b_data)


def create_column_type_metadata(bucket: str):
"""creates a new metadata dict for column types.
By design, this will replace an existing column type dict if one already exists.
"""
client = boto3.client("s3")
res = client.list_objects_v2(Bucket=bucket, Prefix="aggregates/")
contents = res["Contents"]
output = {}
for resource in progress.track(contents):
dirs = resource["Key"].split("/")
study = dirs[1]
subscription = dirs[2].split("__")[1]
version = dirs[3]
bytes_buffer = io.BytesIO()
client.download_fileobj(
Bucket=bucket, Key=resource["Key"], Fileobj=bytes_buffer
)
df = pandas.read_parquet(bytes_buffer)
type_dict = get_csv_column_datatypes(df.dtypes)
filename = f"{resource['Key'].split('/')[-1].split('.')[0]}.csv"
output.setdefault(study, {})
output[study].setdefault(subscription, {})
output[study][subscription].setdefault(version, {})
output[study][subscription][version]["columns"] = type_dict
output[study][subscription][version]["filename"] = filename
# print(json.dumps(output, indent=2))
_put_s3_data("metadata/column_types.json", bucket, client, output)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""Creates column types for existing aggregates. """
)
parser.add_argument("-b", "--bucket", help="bucket name")
args = parser.parse_args()
create_column_type_metadata(args.bucket)
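
A quick way to sanity-check the dtype mapping, assuming get_csv_column_datatypes is in scope and using a made-up DataFrame (not part of the migration):

import pandas

df = pandas.DataFrame(
    {"cnt_subject": [10, 12], "visit_year": ["2020", "2021"], "site_note": ["a", "b"]}
).convert_dtypes()  # yields Int64 / string extension dtypes
print(get_csv_column_datatypes(df.dtypes))
# expected: {'cnt_subject': 'integer', 'visit_year': 'year', 'site_note': 'string'}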
5 changes: 1 addition & 4 deletions src/handlers/dashboard/filter_config.py
@@ -73,10 +73,7 @@ def _parse_filter_req(filter_req):
if "," in filter_req:
return " AND ".join(_parse_filter_req(x) for x in filter_req.split(","))
filter_req_split = filter_req.split(":")
if (
filter_req_split[1]
in _FILTER_MAP_ONE_PARAM.keys() # pylint: disable=consider-iterating-dictionary
):
if filter_req_split[1] in _FILTER_MAP_ONE_PARAM:
return _FILTER_MAP_ONE_PARAM[filter_req_split[1]] % filter_req_split[0]
return _FILTER_MAP_TWO_PARAM[filter_req_split[1]] % (
filter_req_split[0],
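
The simplification above relies on dict membership already testing keys; a one-line check with a hypothetical filter-map entry (not the real _FILTER_MAP_ONE_PARAM):

_example_map = {"gte": "%s >= %s"}  # hypothetical entry for illustration
assert ("gte" in _example_map) == ("gte" in _example_map.keys())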
15 changes: 8 additions & 7 deletions src/handlers/dashboard/get_chart_data.py
@@ -7,13 +7,13 @@
import boto3
import pandas

from ..dashboard.filter_config import get_filter_string
from ..shared.decorators import generic_error_handler
from ..shared.enums import BucketPath
from ..shared.functions import get_latest_data_package_version, http_response
from src.handlers.dashboard.filter_config import get_filter_string
from src.handlers.shared.decorators import generic_error_handler
from src.handlers.shared.enums import BucketPath
from src.handlers.shared.functions import get_latest_data_package_version, http_response


def _get_table_cols(table_name: str, version: str = None) -> list:
def _get_table_cols(table_name: str, version: str | None = None) -> list:
"""Returns the columns associated with a table.
Since running an athena query takes a decent amount of time due to queueing
@@ -29,7 +29,8 @@ def _get_table_cols(table_name: str, version: str = None) -> list:
s3_key = f"{prefix}/{version}/{table_name}__aggregate.csv"
s3_client = boto3.client("s3")
s3_iter = s3_client.get_object(
Bucket=s3_bucket_name, Key=s3_key # type: ignore[arg-type]
Bucket=s3_bucket_name,
Key=s3_key,
)["Body"].iter_lines()
return next(s3_iter).decode().split(",")

@@ -41,7 +42,7 @@ def _build_query(query_params: dict, filters: list, path_params: dict) -> str:
filter_str = get_filter_string(filters)
if filter_str != "":
filter_str = f"AND {filter_str}"
count_col = [c for c in columns if c.startswith("cnt")][0]
count_col = next(c for c in columns if c.startswith("cnt"))
columns.remove(count_col)
select_str = f"{query_params['column']}, sum({count_col}) as {count_col}"
group_str = f"{query_params['column']}"
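
The count_col change above swaps a full list comprehension plus [0] indexing for next() over a generator; a small sketch of the behavioural difference, with hypothetical column names:

columns = ["gender", "cnt_subject", "age"]  # hypothetical aggregate columns
count_col = next(c for c in columns if c.startswith("cnt"))  # "cnt_subject"
# next() stops at the first match instead of building the whole list, and raises
# StopIteration (rather than IndexError) if no "cnt" column is present.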
111 changes: 111 additions & 0 deletions src/handlers/dashboard/get_csv.py
@@ -0,0 +1,111 @@
import os

import boto3
import botocore

from src.handlers.shared import decorators, enums, functions


def _format_key(
s3_client,
s3_bucket_name: str,
study: str,
subscription: str,
version: str,
filename: str,
site: str | None = None,
):
"""Creates S3 key from url params"""
if site is not None:
key = f"last_valid/{study}/{study}__{subscription}/{site}/{version}/{filename}"
else:
key = f"csv_aggregates/{study}/{study}__{subscription}/{version}/{filename}"
s3_client.list_objects_v2(Bucket=s3_bucket_name)
try:
s3_client.head_object(Bucket=s3_bucket_name, Key=key)
return key
except botocore.exceptions.ClientError as e:
raise OSError(f"No object found at key {key}") from e


def _get_column_types(
s3_client,
s3_bucket_name: str,
study: str,
subscription: str,
version: str,
**kwargs,
) -> dict:
"""Gets column types from the metadata store for a given subscription"""
types_metadata = functions.read_metadata(
s3_client,
s3_bucket_name,
meta_type=enums.JsonFilename.COLUMN_TYPES.value,
)
try:
return types_metadata[study][subscription][version][
enums.ColumnTypesKeys.COLUMNS.value
]
except KeyError:
return {}


@decorators.generic_error_handler(msg="Error retrieving chart data")
def get_csv_handler(event, context):
"""manages event from dashboard api call and creates a temporary URL"""
del context
s3_bucket_name = os.environ.get("BUCKET_NAME")
s3_client = boto3.client("s3")
key = _format_key(s3_client, s3_bucket_name, **event["pathParameters"])
types = _get_column_types(s3_client, s3_bucket_name, **event["pathParameters"])
presign_url = s3_client.generate_presigned_url(
"get_object",
Params={
"Bucket": s3_bucket_name,
"Key": key,
"ResponseContentType": "text/csv",
},
ExpiresIn=600,
)
extra_headers = {
"Location": presign_url,
"x-column-names": ",".join(key for key in types.keys()),
"x-column-types": ",".join(key for key in types.values()),
# TODO: add x-column-descriptions once a source for column descriptions
# has been established
}
res = functions.http_response(302, "", extra_headers=extra_headers)
return res


@decorators.generic_error_handler(msg="Error retrieving csv data")
def get_csv_list_handler(event, context):
"""manages event from dashboard api call and creates a temporary URL"""
del context
s3_bucket_name = os.environ.get("BUCKET_NAME")
s3_client = boto3.client("s3")
if event["path"].startswith("/last_valid"):
key_prefix = "last_valid"
url_prefix = "last_valid"
elif event["path"].startswith("/aggregates"):
key_prefix = "csv_aggregates"
url_prefix = "aggregates"
else:
raise Exception("Unexpected url encountered")
s3_objs = s3_client.list_objects_v2(Bucket=s3_bucket_name, Prefix=key_prefix)
urls = []
if s3_objs["KeyCount"] == 0:
return functions.http_response(200, urls)
for obj in s3_objs["Contents"]:
key_parts = obj["Key"].split("/")
study = key_parts[1]
subscription = key_parts[2].split("__")[1]
version = key_parts[-2]
filename = key_parts[-1]
site = key_parts[3] if url_prefix == "last_valid" else None
url_parts = [url_prefix, study, subscription, version, filename]
if url_prefix == "last_valid":
url_parts.insert(3, site)
urls.append("/".join(url_parts))
res = functions.http_response(200, urls)
return res
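
A worked example of the key-to-URL mapping in get_csv_list_handler, using a made-up last_valid key:

key = "last_valid/core/core__count_encounter/site_a/099/core__count_encounter__aggregate.csv"
parts = key.split("/")
study, subscription = parts[1], parts[2].split("__")[1]
site, version, filename = parts[3], parts[-2], parts[-1]
url_parts = ["last_valid", study, subscription, version, filename]
url_parts.insert(3, site)  # site is only present for last_valid paths
print("/".join(url_parts))
# last_valid/core/count_encounter/site_a/099/core__count_encounter__aggregate.csv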