Skip to content

Commit

Permalink
Merge pull request #294 from 4dn-dcic/smaht-ingestion-20231201
Browse files Browse the repository at this point in the history
SMaHT ingestion work. Moved structured_data.py from smaht-portal to here. (Tests still in smaht-portal though for now).
  • Loading branch information
dmichaels-harvard authored Dec 11, 2023
2 parents 9474417 + 3dd5c24 commit 26ec9c0
Show file tree
Hide file tree
Showing 8 changed files with 1,159 additions and 103 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ dcicutils
Change Log
----------

8.5.0
=====
* Moved structured_data.py from smaht-portal to here; new portal_utils and data_readers modules.
* Strip sheet name in data_readers.Excel; respect (ignore) hidden sheets.


8.4.0
=====
* More work related to SMaHT ingestion (bundle/sheet_utils, data_readers, etc).
Expand Down
36 changes: 24 additions & 12 deletions dcicutils/data_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,30 @@
import csv
import openpyxl
from typing import Any, Generator, Iterator, List, Optional, Type, Tuple, Union
from dcicutils.misc_utils import right_trim
from dcicutils.misc_utils import create_dict, right_trim

# Forward type references for type hints.
Excel = Type["Excel"]


class RowReader(abc.ABC):

def __init__(self):
self.header = None
self.location = 0
self.row_number = 0
self._warning_empty_headers = False
self._warning_extra_values = [] # Line numbers.
self.open()

def __iter__(self) -> Iterator:
for row in self.rows:
self.location += 1
self.row_number += 1
if self.is_comment_row(row):
continue
if self.is_terminating_row(row):
break
if len(self.header) < len(row): # Row values beyond what there are headers for are ignored.
self._warning_extra_values.append(self.location)
self._warning_extra_values.append(self.row_number)
yield {column: self.cell_value(value) for column, value in zip(self.header, row)}

def _define_header(self, header: List[Optional[Any]]) -> None:
Expand All @@ -49,13 +52,20 @@ def open(self) -> None:
pass

@property
def issues(self) -> Optional[List[str]]:
issues = []
def file(self) -> Optional[str]:
return self._file if hasattr(self, "_file") else None

@property
def warnings(self) -> List[str]:
warnings = []
if self._warning_empty_headers:
issues.append("Empty header column encountered; ignoring it and all subsequent columns.")
warnings.append({"src": create_dict(file=self.file),
"warning": "Empty header column encountered; ignoring it and all subsequent columns."})
if self._warning_extra_values:
issues.extend([f"Extra column values on row [{row_number}]" for row_number in self._warning_extra_values])
return issues if issues else None
for row_number in self._warning_extra_values:
warnings.append({"src": create_dict(file=self.file, row=row_number),
"warning": f"Extra row column values."})
return warnings


class ListReader(RowReader):
Expand Down Expand Up @@ -101,9 +111,10 @@ def __del__(self) -> None:

class ExcelSheetReader(RowReader):

def __init__(self, sheet_name: str, workbook: openpyxl.workbook.workbook.Workbook) -> None:
def __init__(self, excel: Excel, sheet_name: str, workbook: openpyxl.workbook.workbook.Workbook) -> None:
self.sheet_name = sheet_name or "Sheet1"
self._workbook = workbook
self._file = excel._file
self._rows = None
super().__init__()

Expand Down Expand Up @@ -134,12 +145,13 @@ def __init__(self, file: str, reader_class: Optional[Type] = None) -> None:
self.open()

def sheet_reader(self, sheet_name: str) -> ExcelSheetReader:
return self._reader_class(sheet_name=sheet_name, workbook=self._workbook)
return self._reader_class(self, sheet_name=sheet_name, workbook=self._workbook)

def open(self) -> None:
if self._workbook is None:
self._workbook = openpyxl.load_workbook(self._file, data_only=True)
self.sheet_names = self._workbook.sheetnames or []
self.sheet_names = [sheet_name for sheet_name in self._workbook.sheetnames
if self._workbook[sheet_name].sheet_state != "hidden"]

def __del__(self) -> None:
if (workbook := self._workbook) is not None:
Expand Down
8 changes: 8 additions & 0 deletions dcicutils/misc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1501,6 +1501,14 @@ def right_trim(list_or_tuple: Union[List[Any], Tuple[Any]],
return list_or_tuple[:i + 1]


def create_dict(**kwargs) -> dict:
    """
    Returns a dictionary built from the given keyword arguments, leaving out any
    argument whose value is falsy (e.g. None, an empty string/list, zero, or False).
    The order of the retained keys is the order in which they were passed.
    """
    return {key: value for key, value in kwargs.items() if value}


def is_c4_arn(arn: str) -> bool:
"""
Returns True iff the given (presumed) AWS ARN string value looks like it
Expand Down
278 changes: 278 additions & 0 deletions dcicutils/portal_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
from collections import deque
from pyramid.paster import get_app
from pyramid.router import Router
import re
import requests
from requests.models import Response as RequestResponse
from typing import Optional, Type, Union
from webtest.app import TestApp, TestResponse
from dcicutils.common import OrchestratedApp, APP_CGAP, APP_FOURFRONT, APP_SMAHT, ORCHESTRATED_APPS
from dcicutils.creds_utils import CGAPKeyManager, FourfrontKeyManager, SMaHTKeyManager
from dcicutils.ff_utils import get_metadata, get_schema, patch_metadata, post_metadata
from dcicutils.misc_utils import to_camel_case, VirtualApp
from dcicutils.zip_utils import temporary_file

# Placeholder binding so that "Portal" can be used in type hints within the class
# body before the class object itself exists.
Portal = Type["Portal"]  # Forward type reference for type hints.
# Name of the schema type whose sub-types Portal.is_file_schema treats as file types.
FILE_SCHEMA_NAME = "File"


class Portal:
    """
    This is meant to be an uber wrapper for Portal access. It can be created in a variety of ways:
    1. From a (Portal) .ini file (e.g. development.ini)
    2. From a key dictionary, containing "key" and "secret" property values.
    3. From a key tuple, containing (in order) a key and secret values.
    4. From a keys file assumed to reside in ~/.{app}-keys.json where the given "app" value is either "smaht", "cgap",
       or "fourfront"; and where this file is assumed to contain a dictionary with a key equal to the given "env"
       value (e.g. smaht-localhost) and with a dictionary value containing "key" and "secret" property values; if
       an "app" value is not specified but the given "env" value begins with one of the app values then that value
       will be used, i.e. e.g. if env is "smaht-localhost" and app is unspecified then it is assumed to be "smaht".
    5. From a keys file as described above (#4) but rather than be identified by the given "env" value it
       is looked up by the given "server" name and the "server" key dictionary value in the key file;
       this requires that an "app" value be given, as it cannot be inferred from a server name.
    6. From a given "vapp" value (which is assumed to be a TestApp or VirtualApp).
    7. From another Portal object.
    8. From a pyramid Router object.
    """
    def __init__(self,
                 arg: Optional[Union[VirtualApp, TestApp, Router, Portal, dict, tuple, str]] = None,
                 env: Optional[str] = None, app: Optional[OrchestratedApp] = None, server: Optional[str] = None,
                 key: Optional[Union[dict, tuple]] = None,
                 vapp: Optional[Union[VirtualApp, TestApp, Router, Portal, str]] = None,
                 portal: Optional[Union[VirtualApp, TestApp, Router, Portal, str]] = None) -> None:
        # The vapp and portal arguments are synonyms; portal takes precedence if both are given.
        if vapp and not portal:
            portal = vapp
        # Interpret the catch-all positional argument according to its type.
        if ((isinstance(arg, (VirtualApp, TestApp, Router, Portal)) or
             isinstance(arg, str) and arg.endswith(".ini")) and not portal):
            portal = arg
        elif isinstance(arg, str) and not env:
            env = arg
        elif (isinstance(arg, dict) or isinstance(arg, tuple)) and not key:
            key = arg
        # Infer the app from the env name prefix if the app was not explicitly given.
        if not app and env:
            if env.startswith(APP_SMAHT):
                app = APP_SMAHT
            elif env.startswith(APP_CGAP):
                app = APP_CGAP
            elif env.startswith(APP_FOURFRONT):
                app = APP_FOURFRONT
        if isinstance(portal, Portal):
            # Copy construction; share the state of the given Portal object.
            self._vapp = portal._vapp
            self._env = portal._env
            self._app = portal._app
            self._server = portal._server
            self._key = portal._key
            self._key_pair = portal._key_pair
            self._key_file = portal._key_file
            return
        self._vapp = None
        self._env = env
        self._app = app
        self._server = server
        self._key = None
        self._key_pair = None
        self._key_file = None
        if isinstance(portal, (VirtualApp, TestApp)):
            self._vapp = portal
        elif isinstance(portal, (Router, str)):
            self._vapp = Portal._create_vapp(portal)
        elif isinstance(key, dict):
            self._key = key
            self._key_pair = (key.get("key"), key.get("secret")) if key else None
            if key_server := key.get("server"):
                self._server = key_server
        elif isinstance(key, tuple) and len(key) >= 2:
            self._key = {"key": key[0], "secret": key[1]}
            self._key_pair = key
        elif isinstance(env, str) or (isinstance(self._server, str) and self._app):
            # Key lookup via the keys file, by env name or (if no env given) by server name.
            # The server lookup was previously unreachable because this branch required env to be a
            # string; it is only attempted when an app is given so that a Portal created with just
            # a server (and no credentials at all) still works as before.
            key_managers = {APP_CGAP: CGAPKeyManager, APP_FOURFRONT: FourfrontKeyManager, APP_SMAHT: SMaHTKeyManager}
            if not (key_manager := key_managers.get(self._app)) or not (key_manager := key_manager()):
                raise Exception(f"Invalid app name: {self._app} (valid: {', '.join(ORCHESTRATED_APPS)}).")
            if isinstance(env, str):
                self._key = key_manager.get_keydict_for_env(env)
                # Guard against a missing key dictionary rather than failing with an AttributeError.
                if self._key and (key_server := self._key.get("server")):
                    self._server = key_server
            else:
                self._key = key_manager.get_keydict_for_server(self._server)
            self._key_pair = key_manager.keydict_to_keypair(self._key) if self._key else None
            self._key_file = key_manager.keys_file

    @property
    def env(self):
        return self._env

    @property
    def app(self):
        return self._app

    @property
    def server(self):
        return self._server

    @property
    def key(self):
        return self._key

    @property
    def key_pair(self):
        return self._key_pair

    @property
    def key_file(self):
        return self._key_file

    @property
    def vapp(self):
        return self._vapp

    def get_metadata(self, object_id: str) -> Optional[dict]:
        """Returns the metadata for the given object ID, via ff_utils.get_metadata."""
        return get_metadata(obj_id=object_id, vapp=self._vapp, key=self._key)

    def patch_metadata(self, object_id: str, data: str) -> Optional[dict]:
        """Patches the given object with the given data; via ff_utils if we have a key, else via self.patch."""
        if self._key:
            return patch_metadata(obj_id=object_id, patch_item=data, key=self._key)
        return self.patch(f"/{object_id}", data)

    def post_metadata(self, object_type: str, data: str) -> Optional[dict]:
        """Posts the given data as a new object of the given type; via ff_utils if we have a key, else self.post."""
        if self._key:
            return post_metadata(schema_name=object_type, post_item=data, key=self._key)
        return self.post(f"/{object_type}", data)

    def get(self, uri: str, follow: bool = True, **kwargs) -> Optional[Union[RequestResponse, TestResponse]]:
        """
        Does a GET of the given URI, via the vapp if we have one, otherwise via the requests library.
        If follow is True then a redirect response is followed.
        """
        if isinstance(self._vapp, (VirtualApp, TestApp)):
            response = self._vapp.get(self._uri(uri), **self._kwargs(**kwargs))
            if response and response.status_code in [301, 302, 303, 307, 308] and follow:
                response = response.follow()
            return self._response(response)
        return requests.get(self._uri(uri), allow_redirects=follow, **self._kwargs(**kwargs))

    def patch(self, uri: str, data: Optional[dict] = None,
              json: Optional[dict] = None, **kwargs) -> Optional[Union[RequestResponse, TestResponse]]:
        """Does a PATCH to the given URI of the given JSON (json takes precedence over data)."""
        if isinstance(self._vapp, (VirtualApp, TestApp)):
            return self._vapp.patch_json(self._uri(uri), json or data, **self._kwargs(**kwargs))
        return requests.patch(self._uri(uri), json=json or data, **self._kwargs(**kwargs))

    def post(self, uri: str, data: Optional[dict] = None, json: Optional[dict] = None,
             files: Optional[dict] = None, **kwargs) -> Optional[Union[RequestResponse, TestResponse]]:
        """Does a POST to the given URI of the given JSON (json takes precedence over data) and/or files."""
        if isinstance(self._vapp, (VirtualApp, TestApp)):
            if files:
                return self._vapp.post(self._uri(uri), json or data, upload_files=files, **self._kwargs(**kwargs))
            else:
                return self._vapp.post_json(self._uri(uri), json or data, upload_files=files, **self._kwargs(**kwargs))
        return requests.post(self._uri(uri), json=json or data, files=files, **self._kwargs(**kwargs))

    def get_schema(self, schema_name: str) -> Optional[dict]:
        """Returns the schema for the given schema name, which is first normalized via schema_name."""
        return get_schema(self.schema_name(schema_name), portal_vapp=self._vapp, key=self._key)

    def get_schemas(self) -> dict:
        """Returns the JSON from a GET of /profiles/."""
        return self.get("/profiles/").json()

    @staticmethod
    def schema_name(name: str) -> str:
        """Returns the given name converted via to_camel_case."""
        return to_camel_case(name)

    def is_file_schema(self, schema_name: str) -> bool:
        """Returns True iff the given schema name is a (direct or indirect) sub-type of FILE_SCHEMA_NAME."""
        if super_type_map := self.get_schemas_super_type_map():
            if file_super_type := super_type_map.get(FILE_SCHEMA_NAME):
                return self.schema_name(schema_name) in file_super_type
        return False

    def get_schemas_super_type_map(self) -> dict:
        """
        Returns the "super type map" for all of the known schemas (via /profiles).
        This is a dictionary of all types which have (one or more) sub-types whose value is
        an array of all of those sub-types (direct and all descendents), in breadth first order.
        """
        def breadth_first(super_type_map: dict, super_type_name: str) -> list:
            result = []
            queue = deque(super_type_map.get(super_type_name, []))
            while queue:
                result.append(sub_type_name := queue.popleft())
                if sub_type_name in super_type_map:
                    queue.extend(super_type_map[sub_type_name])
            return result
        if not (schemas := self.get_schemas()):
            return {}
        # First pass: map each super-type name to the list of its direct sub-types.
        super_type_map = {}
        for type_name in schemas:
            if super_type_name := schemas[type_name].get("rdfs:subClassOf"):
                super_type_name = super_type_name.replace("/profiles/", "").replace(".json", "")
                if super_type_name != "Item":
                    if not super_type_map.get(super_type_name):
                        super_type_map[super_type_name] = [type_name]
                    elif type_name not in super_type_map[super_type_name]:
                        super_type_map[super_type_name].append(type_name)
        # Second pass: expand each entry to include all descendents.
        super_type_map_flattened = {}
        for super_type_name in super_type_map:
            super_type_map_flattened[super_type_name] = breadth_first(super_type_map, super_type_name)
        return super_type_map_flattened

    def _uri(self, uri: str) -> str:
        """
        Returns the full URI for the given (path) URI. An absolute http/https URI is returned as-is;
        otherwise runs of slashes are collapsed and, if we have a server, the result is the server
        joined to the path by exactly one slash; an empty or non-string value yields "/".
        """
        if not isinstance(uri, str) or not uri:
            return "/"
        if uri.lower().startswith("http://") or uri.lower().startswith("https://"):
            return uri
        uri = re.sub(r"/+", "/", uri)
        if not self._server:
            return uri
        # Join with exactly one slash; previously the check was inverted, which produced
        # "server//path" for a path with a leading slash and "serverpath" for one without.
        return self._server.rstrip("/") + ("" if uri.startswith("/") else "/") + uri

    def _kwargs(self, **kwargs) -> dict:
        """
        Returns the keyword arguments to pass on to the underlying request call; only headers
        (defaulting to JSON content-type/accept), auth (if we have a key pair), and an integer
        timeout are passed through.
        """
        result_kwargs = {"headers":
                         kwargs.get("headers", {"Content-type": "application/json", "Accept": "application/json"})}
        if self._key_pair:
            result_kwargs["auth"] = self._key_pair
        if isinstance(timeout := kwargs.get("timeout"), int):
            result_kwargs["timeout"] = timeout
        return result_kwargs

    def _response(self, response) -> Optional[RequestResponse]:
        """
        Returns the given response, wrapped if its class exposes json as a property so that json
        can be called as a method instead, as on a requests response; otherwise returned as-is.
        """
        # Default of None so a response class with no json attribute at all is passed through.
        if response and isinstance(getattr(response.__class__, "json", None), property):
            class RequestResponseWrapper:  # For consistency change json property to method.

                def __init__(self, response):
                    self._response = response

                def __getattr__(self, attr):
                    return getattr(self._response, attr)

                def json(self):
                    return self._response.json

            response = RequestResponseWrapper(response)
        return response

    @staticmethod
    def create_for_testing(ini_file: Optional[str] = None) -> Portal:
        """Returns a Portal for unit testing using the given .ini file, or a minimal temporary one if not given."""
        if isinstance(ini_file, str) and ini_file:
            return Portal(Portal._create_vapp(ini_file))
        minimal_ini_for_unit_testing = "[app:app]\nuse = egg:encoded\nsqlalchemy.url = postgresql://dummy\n"
        with temporary_file(content=minimal_ini_for_unit_testing, suffix=".ini") as ini_file:
            return Portal(Portal._create_vapp(ini_file))

    @staticmethod
    def create_for_testing_local(ini_file: Optional[str] = None) -> Portal:
        """Returns a Portal for local testing using the given .ini file, or a minimal temporary one if not given."""
        if isinstance(ini_file, str) and ini_file:
            return Portal(Portal._create_vapp(ini_file))
        minimal_ini_for_testing_local = "\n".join([
            "[app:app]\nuse = egg:encoded\nfile_upload_bucket = dummy",
            "sqlalchemy.url = postgresql://postgres@localhost:5441/postgres?host=/tmp/snovault/pgdata",
            "multiauth.groupfinder = encoded.authorization.smaht_groupfinder",
            "multiauth.policies = auth0 session remoteuser accesskey",
            "multiauth.policy.session.namespace = mailto",
            "multiauth.policy.session.use = encoded.authentication.NamespacedAuthenticationPolicy",
            "multiauth.policy.session.base = pyramid.authentication.SessionAuthenticationPolicy",
            "multiauth.policy.remoteuser.namespace = remoteuser",
            "multiauth.policy.remoteuser.use = encoded.authentication.NamespacedAuthenticationPolicy",
            "multiauth.policy.remoteuser.base = pyramid.authentication.RemoteUserAuthenticationPolicy",
            "multiauth.policy.accesskey.namespace = accesskey",
            "multiauth.policy.accesskey.use = encoded.authentication.NamespacedAuthenticationPolicy",
            "multiauth.policy.accesskey.base = encoded.authentication.BasicAuthAuthenticationPolicy",
            "multiauth.policy.accesskey.check = encoded.authentication.basic_auth_check",
            "multiauth.policy.auth0.use = encoded.authentication.NamespacedAuthenticationPolicy",
            "multiauth.policy.auth0.namespace = auth0",
            "multiauth.policy.auth0.base = encoded.authentication.Auth0AuthenticationPolicy"
        ])
        with temporary_file(content=minimal_ini_for_testing_local, suffix=".ini") as minimal_ini_file:
            return Portal(Portal._create_vapp(minimal_ini_file))

    @staticmethod
    def _create_vapp(value: Union[str, Router, TestApp] = "development.ini", app_name: str = "app") -> TestApp:
        """Returns a TestApp for the given value, which may be a TestApp, a Router, or a path to an .ini file."""
        if isinstance(value, TestApp):
            return value
        app = value if isinstance(value, Router) else get_app(value, app_name)
        return TestApp(app, {"HTTP_ACCEPT": "application/json", "REMOTE_USER": "TEST"})
Loading

0 comments on commit 26ec9c0

Please sign in to comment.