Ruff, dependency upgrades (#112)
* Ruff

* PR feedback, removed unused pylint excepts
dogversioning committed Feb 22, 2024
1 parent 2dea9d6 commit d06c1af
Showing 36 changed files with 981 additions and 432 deletions.
24 changes: 9 additions & 15 deletions .github/workflows/ci.yaml
@@ -10,7 +10,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9
python-version: '3.10'

- name: Install dependencies
run: |
@@ -23,23 +23,17 @@ jobs:
lint:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Install linters
run: |
python -m pip install --upgrade pip
pip install ".[dev]"
- name: Run pycodestyle
run: |
pycodestyle scripts src tests --max-line-length=88
- name: Run pylint
if: success() || failure() # still run pylint if above checks fail
run: |
pylint scripts src tests
- name: Run bandit
if: success() || failure() # still run bandit if above checks fail
run: |
bandit -r scripts src
- name: Run black
- name: Run ruff
if: success() || failure() # still run ruff if the above checks fail
run: |
black --check --verbose .
ruff check
ruff format --check
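
For local parity with the updated lint job, a minimal sketch of a hypothetical helper script (not part of this commit) that runs the same two ruff commands the job now uses:

import subprocess
import sys


def lint() -> int:
    """Mirror the CI lint job locally: ruff check, then ruff format --check."""
    for cmd in (["ruff", "check"], ["ruff", "format", "--check"]):
        result = subprocess.run(cmd, check=False)  # let ruff print its own findings
        if result.returncode != 0:
            return result.returncode
    return 0


if __name__ == "__main__":
    sys.exit(lint())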
23 changes: 8 additions & 15 deletions .pre-commit-config.yaml
@@ -1,17 +1,10 @@
default_install_hook_types: [pre-commit, pre-push]
repos:
- repo: https://github.com/psf/black
#this version is synced with the black mentioned in .github/workflows/ci.yml
rev: 22.12.0
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.2.1
hooks:
- id: black
entry: bash -c 'black "$@"; git add -u' --
# It is recommended to specify the latest version of Python
# supported by your project here, or alternatively use
# pre-commit's default_language_version, see
# https://pre-commit.com/#top_level-default_language_version
language_version: python3.9
- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
args: ["--profile", "black", "--filter-files"]
- name: Ruff formatting
id: ruff-format
- name: Ruff linting
id: ruff
stages: [pre-push]
46 changes: 25 additions & 21 deletions pyproject.toml
@@ -1,15 +1,16 @@
[project]
name = "aggregator"
requires-python = ">= 3.9"
version = "0.1.3"
requires-python = ">= 3.10"
version = "0.3.0"
# This project is designed to run on the AWS serverless application framework (SAM).
# The project dependencies are handled via AWS layers. These are only required for
# local development.
dependencies= [
"arrow >=1.2.3",
"awswrangler >=2.19.0, <3",
"awswrangler >=3.5, <4",
"boto3",
"pandas >=1.5.0, <2"
"pandas >=2, <3",
"rich"
]
authors = [
{ name="Matt Garber", email="matthew.garber@childrens.harvard.edu" },
@@ -45,23 +46,26 @@ test = [
"pytest-mock"
]
dev = [
"bandit",
"black==22.12.0",
"isort==5.12.0",
"ruff == 0.2.1",
"pre-commit",
"pylint",
"pycodestyle"
]
[tool.ruff]
target-version = "py310"

[tool.coverage.run]
command_line="-m pytest"
source=["./src/"]

[tool.coverage.report]
show_missing=true

[tool.isort]
profile = "black"
src_paths = ["src", "tests"]
skip_glob = [".aws_sam"]

[tool.ruff.lint]
select = [
"A", # prevent using keywords that clobber python builtins
"B", # bugbear: security warnings
"E", # pycodestyle
"F", # pyflakes
"I", # isort
"ISC", # implicit string concatenation
"PLE", # pylint errors
"RUF", # the ruff developer's own rules
"UP", # alert you when better syntax is available in your python version
]
ignore = [
# Recommended ignore from `ruff format` due to in-project conflicts with check.
# It's expected that this will be fixed in the coming months.
"ISC001"
]
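
As a small, hypothetical illustration of what the selected "UP" (pyupgrade) rules push toward under target-version py310: Optional[...] annotations are rewritten as PEP 604 unions, the same str | None spelling used in the _get_table_cols signature change later in this diff.

# Older spelling the "UP" rules would flag on Python 3.10:
#     from typing import Optional
#     def latest_version(name: str, default: Optional[str] = None) -> Optional[str]: ...
# Preferred spelling (hypothetical function, for illustration only):
def latest_version(name: str, default: str | None = None) -> str | None:
    """Return the pinned version for a package, or the default when none is set."""
    return default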
2 changes: 1 addition & 1 deletion scripts/cumulus_upload_data.py
@@ -107,7 +107,7 @@ def upload_file(cli_args):
if args["test"]:
args_dict["user"] = os.environ.get("CUMULUS_TEST_UPLOAD_USER", "general")
args_dict["file"] = (
f"{str(Path(__file__).resolve().parents[1])}"
f"{Path(__file__).resolve().parents[1]!s}"
f"/tests/test_data/count_synthea_patient.parquet"
)
args_dict["auth"] = os.environ.get("CUMULUS_TEST_UPLOAD_AUTH", "secretval")
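
The change above is most likely ruff's RUF010 rewrite: the !s conversion flag is the f-string spelling of an explicit str() call. A minimal equivalence check (the path below is hypothetical):

from pathlib import Path

p = Path("/tmp/example.parquet")  # hypothetical path
assert f"{p!s}" == f"{str(p)}" == str(p)  # all three produce the same string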
86 changes: 86 additions & 0 deletions scripts/migrations/migration.002.column_types.py
@@ -0,0 +1,86 @@
""" Adds a new metadata type, column_types """

import argparse
import io
import json

import boto3
import pandas
from rich import progress


def get_csv_column_datatypes(dtypes):
"""helper for generating column type for dashboard API"""
column_dict = {}
for column in dtypes.index:
if column.endswith("year"):
column_dict[column] = "year"
elif column.endswith("month"):
column_dict[column] = "month"
elif column.endswith("week"):
column_dict[column] = "week"
elif column.endswith("day") or str(dtypes[column]) == "datetime64":
column_dict[column] = "day"
elif "cnt" in column or str(dtypes[column]) in (
"Int8",
"Int16",
"Int32",
"Int64",
"UInt8",
"UInt16",
"UInt32",
"UInt64",
):
column_dict[column] = "integer"
elif str(dtypes[column]) in ("Float32", "Float64"):
column_dict[column] = "float"
elif str(dtypes[column]) == "boolean":
column_dict[column] = "float"
else:
column_dict[column] = "string"
return column_dict


def _put_s3_data(key: str, bucket_name: str, client, data: dict) -> None:
"""Convenience class for writing a dict to S3"""
b_data = io.BytesIO(json.dumps(data).encode())
client.upload_fileobj(Bucket=bucket_name, Key=key, Fileobj=b_data)


def create_column_type_metadata(bucket: str):
"""creates a new metadata dict for column types.
By design, this will replace an existing column type dict if one already exists.
"""
client = boto3.client("s3")
res = client.list_objects_v2(Bucket=bucket, Prefix="aggregates/")
contents = res["Contents"]
output = {}
for resource in progress.track(contents):
dirs = resource["Key"].split("/")
study = dirs[1]
subscription = dirs[2].split("__")[1]
version = dirs[3]
bytes_buffer = io.BytesIO()
client.download_fileobj(
Bucket=bucket, Key=resource["Key"], Fileobj=bytes_buffer
)
df = pandas.read_parquet(bytes_buffer)
type_dict = get_csv_column_datatypes(df.dtypes)
filename = f"{resource['Key'].split('/')[-1].split('.')[0]}.csv"
output.setdefault(study, {})
output[study].setdefault(subscription, {})
output[study][subscription].setdefault(version, {})
output[study][subscription][version]["columns"] = type_dict
output[study][subscription][version]["filename"] = filename
# print(json.dumps(output, indent=2))
_put_s3_data("metadata/column_types.json", bucket, client, output)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""Creates column types for existing aggregates. """
)
parser.add_argument("-b", "--bucket", help="bucket name")
args = parser.parse_args()
create_column_type_metadata(args.bucket)
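
A quick way to sanity-check the dtype mapping, assuming get_csv_column_datatypes is in scope and using a made-up DataFrame (not part of the migration):

import pandas

df = pandas.DataFrame(
    {"cnt_subject": [10, 12], "visit_year": ["2020", "2021"], "site_note": ["a", "b"]}
).convert_dtypes()  # yields Int64 / string extension dtypes
print(get_csv_column_datatypes(df.dtypes))
# expected: {'cnt_subject': 'integer', 'visit_year': 'year', 'site_note': 'string'}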
5 changes: 1 addition & 4 deletions src/handlers/dashboard/filter_config.py
@@ -73,10 +73,7 @@ def _parse_filter_req(filter_req):
if "," in filter_req:
return " AND ".join(_parse_filter_req(x) for x in filter_req.split(","))
filter_req_split = filter_req.split(":")
if (
filter_req_split[1]
in _FILTER_MAP_ONE_PARAM.keys() # pylint: disable=consider-iterating-dictionary
):
if filter_req_split[1] in _FILTER_MAP_ONE_PARAM:
return _FILTER_MAP_ONE_PARAM[filter_req_split[1]] % filter_req_split[0]
return _FILTER_MAP_TWO_PARAM[filter_req_split[1]] % (
filter_req_split[0],
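
The simplification above relies on dict membership already testing keys; a one-line check with a hypothetical filter-map entry (not the real _FILTER_MAP_ONE_PARAM):

_example_map = {"gte": "%s >= %s"}  # hypothetical entry for illustration
assert ("gte" in _example_map) == ("gte" in _example_map.keys())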
15 changes: 8 additions & 7 deletions src/handlers/dashboard/get_chart_data.py
@@ -7,13 +7,13 @@
import boto3
import pandas

from ..dashboard.filter_config import get_filter_string
from ..shared.decorators import generic_error_handler
from ..shared.enums import BucketPath
from ..shared.functions import get_latest_data_package_version, http_response
from src.handlers.dashboard.filter_config import get_filter_string
from src.handlers.shared.decorators import generic_error_handler
from src.handlers.shared.enums import BucketPath
from src.handlers.shared.functions import get_latest_data_package_version, http_response


def _get_table_cols(table_name: str, version: str = None) -> list:
def _get_table_cols(table_name: str, version: str | None = None) -> list:
"""Returns the columns associated with a table.
Since running an athena query takes a decent amount of time due to queueing
@@ -29,7 +29,8 @@ def _get_table_cols(table_name: str, version: str = None) -> list:
s3_key = f"{prefix}/{version}/{table_name}__aggregate.csv"
s3_client = boto3.client("s3")
s3_iter = s3_client.get_object(
Bucket=s3_bucket_name, Key=s3_key # type: ignore[arg-type]
Bucket=s3_bucket_name,
Key=s3_key,
)["Body"].iter_lines()
return next(s3_iter).decode().split(",")

@@ -41,7 +42,7 @@ def _build_query(query_params: dict, filters: list, path_params: dict) -> str:
filter_str = get_filter_string(filters)
if filter_str != "":
filter_str = f"AND {filter_str}"
count_col = [c for c in columns if c.startswith("cnt")][0]
count_col = next(c for c in columns if c.startswith("cnt"))
columns.remove(count_col)
select_str = f"{query_params['column']}, sum({count_col}) as {count_col}"
group_str = f"{query_params['column']}"
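
The count_col change above swaps a full list comprehension plus [0] indexing for next() over a generator; a small sketch of the behavioural difference, with hypothetical column names:

columns = ["gender", "cnt_subject", "age"]  # hypothetical aggregate columns
count_col = next(c for c in columns if c.startswith("cnt"))  # "cnt_subject"
# next() stops at the first match instead of building the whole list, and raises
# StopIteration (rather than IndexError) if no "cnt" column is present.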
111 changes: 111 additions & 0 deletions src/handlers/dashboard/get_csv.py
@@ -0,0 +1,111 @@
import os

import boto3
import botocore

from src.handlers.shared import decorators, enums, functions


def _format_key(
s3_client,
s3_bucket_name: str,
study: str,
subscription: str,
version: str,
filename: str,
site: str | None = None,
):
"""Creates S3 key from url params"""
if site is not None:
key = f"last_valid/{study}/{study}__{subscription}/{site}/{version}/{filename}"
else:
key = f"csv_aggregates/{study}/{study}__{subscription}/{version}/{filename}"
s3_client.list_objects_v2(Bucket=s3_bucket_name)
try:
s3_client.head_object(Bucket=s3_bucket_name, Key=key)
return key
except botocore.exceptions.ClientError as e:
raise OSError(f"No object found at key {key}") from e


def _get_column_types(
s3_client,
s3_bucket_name: str,
study: str,
subscription: str,
version: str,
**kwargs,
) -> dict:
"""Gets column types from the metadata store for a given subscription"""
types_metadata = functions.read_metadata(
s3_client,
s3_bucket_name,
meta_type=enums.JsonFilename.COLUMN_TYPES.value,
)
try:
return types_metadata[study][subscription][version][
enums.ColumnTypesKeys.COLUMNS.value
]
except KeyError:
return {}


@decorators.generic_error_handler(msg="Error retrieving chart data")
def get_csv_handler(event, context):
"""manages event from dashboard api call and creates a temporary URL"""
del context
s3_bucket_name = os.environ.get("BUCKET_NAME")
s3_client = boto3.client("s3")
key = _format_key(s3_client, s3_bucket_name, **event["pathParameters"])
types = _get_column_types(s3_client, s3_bucket_name, **event["pathParameters"])
presign_url = s3_client.generate_presigned_url(
"get_object",
Params={
"Bucket": s3_bucket_name,
"Key": key,
"ResponseContentType": "text/csv",
},
ExpiresIn=600,
)
extra_headers = {
"Location": presign_url,
"x-column-names": ",".join(key for key in types.keys()),
"x-column-types": ",".join(key for key in types.values()),
# TODO: add x-column-descriptions once a source for column descriptions
# has been established
}
res = functions.http_response(302, "", extra_headers=extra_headers)
return res


@decorators.generic_error_handler(msg="Error retrieving csv data")
def get_csv_list_handler(event, context):
"""manages event from dashboard api call and creates a temporary URL"""
del context
s3_bucket_name = os.environ.get("BUCKET_NAME")
s3_client = boto3.client("s3")
if event["path"].startswith("/last_valid"):
key_prefix = "last_valid"
url_prefix = "last_valid"
elif event["path"].startswith("/aggregates"):
key_prefix = "csv_aggregates"
url_prefix = "aggregates"
else:
raise Exception("Unexpected url encountered")
s3_objs = s3_client.list_objects_v2(Bucket=s3_bucket_name, Prefix=key_prefix)
urls = []
if s3_objs["KeyCount"] == 0:
return functions.http_response(200, urls)
for obj in s3_objs["Contents"]:
key_parts = obj["Key"].split("/")
study = key_parts[1]
subscription = key_parts[2].split("__")[1]
version = key_parts[-2]
filename = key_parts[-1]
site = key_parts[3] if url_prefix == "last_valid" else None
url_parts = [url_prefix, study, subscription, version, filename]
if url_prefix == "last_valid":
url_parts.insert(3, site)
urls.append("/".join(url_parts))
res = functions.http_response(200, urls)
return res
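
A worked example of the key-to-URL mapping in get_csv_list_handler, using a made-up last_valid key:

key = "last_valid/core/core__count_encounter/site_a/099/core__count_encounter__aggregate.csv"
parts = key.split("/")
study, subscription = parts[1], parts[2].split("__")[1]
site, version, filename = parts[3], parts[-2], parts[-1]
url_parts = ["last_valid", study, subscription, version, filename]
url_parts.insert(3, site)  # site is only present for last_valid paths
print("/".join(url_parts))
# last_valid/core/count_encounter/site_a/099/core__count_encounter__aggregate.csv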