From 2ba1e0c126e76c6c2e1d618e5ad01b41ae516f71 Mon Sep 17 00:00:00 2001 From: rhysrevans3 Date: Tue, 9 Jan 2024 09:35:28 +0000 Subject: [PATCH] Fixing extraction method bugs. --- setup.py | 3 + stac_generator/core/baker.py | 8 +- stac_generator/core/extraction_method.py | 3 + .../extraction_methods/asset_aggregator.py | 10 +- .../plugins/extraction_methods/bbox.py | 16 ++- .../datetime_bound_to_centroid.py | 78 ++++++++++++ .../plugins/extraction_methods/default.py | 6 - .../elasticsearch_aggregation.py | 103 +++++++++------ .../elasticsearch_assets.py | 2 +- .../extraction_methods/geometry_polygon.py | 2 +- .../extraction_methods/geometry_to_bbox.py | 68 ++++++++++ .../plugins/extraction_methods/iso_date.py | 26 ++-- .../plugins/extraction_methods/lambda.py | 47 ++++--- .../plugins/extraction_methods/netcdf.py | 6 +- .../plugins/extraction_methods/open_zip.py | 117 ++++++++++++++++++ .../plugins/extraction_methods/regex.py | 1 - .../plugins/extraction_methods/remove.py | 3 +- .../plugins/extraction_methods/stac_bbox.py | 20 ++- .../plugins/extraction_methods/xml.py | 39 ++++-- stac_generator/plugins/mappings/stac.py | 57 +++++++-- .../plugins/outputs/stac_fastapi.py | 24 +++- stac_generator/scripts/recipe_keys.py | 81 ++++++++++++ stac_generator/scripts/stac_generator.py | 13 -- 23 files changed, 605 insertions(+), 128 deletions(-) create mode 100644 stac_generator/plugins/extraction_methods/datetime_bound_to_centroid.py create mode 100644 stac_generator/plugins/extraction_methods/geometry_to_bbox.py create mode 100644 stac_generator/plugins/extraction_methods/open_zip.py create mode 100644 stac_generator/scripts/recipe_keys.py diff --git a/setup.py b/setup.py index 72e6b35..7d740ef 100644 --- a/setup.py +++ b/setup.py @@ -65,6 +65,8 @@ "header = stac_generator.plugins.extraction_methods.header.header:HeaderExtract", "iso19115 = stac_generator.plugins.extraction_methods.iso19115:ISO19115Extract", "xml = stac_generator.plugins.extraction_methods.xml:XMLExtract", + "open_zip = stac_generator.plugins.extraction_methods.open_zip:ZipExtract", + "datetime_bound_to_centroid = stac_generator.plugins.extraction_methods.datetime_bound_to_centroid:DatetimeBoundToCentroidExtract", "elasticsearch_aggregation = stac_generator.plugins.extraction_methods.elasticsearch_aggregation:ElasticsearchAggregationExtract", "json_file = stac_generator.plugins.extraction_methods.json_file:JsonFileExtract", "path_parts = stac_generator.plugins.extraction_methods.path_parts:PathPartsExtract", @@ -84,6 +86,7 @@ "geometry_line = stac_generator.plugins.extraction_methods.geometry_line:GeometryLineExtract", "geometry_point = stac_generator.plugins.extraction_methods.geometry_point:GeometryPointExtract", "geometry_polygon = stac_generator.plugins.extraction_methods.geometry_polygon:GeometryPolygonExtract", + "geometry_to_bbox = stac_generator.plugins.extraction_methods.geometry_to_bbox:GeometryToBboxExtract", "string_join = stac_generator.plugins.extraction_methods.string_join:StringJoinExtract", "string_template = stac_generator.plugins.extraction_methods.string_template:StringTemplateExtract", "facet_prefix = stac_generator.plugins.extraction_methods.facet_prefix:FacetPrefixExtract", diff --git a/stac_generator/core/baker.py b/stac_generator/core/baker.py index 41c48d6..0486c7f 100644 --- a/stac_generator/core/baker.py +++ b/stac_generator/core/baker.py @@ -105,9 +105,6 @@ def __init__(self, root_path: str): for file_path in Path(root_path).rglob("*.y*ml"): _ = self._load_data(file_path) - # only used during the - del self.location_map - def _load_data(self, file: Path) -> Recipe: """ Loads the yaml files from the root path and builds the recipe dictionary and map of @@ -159,7 +156,7 @@ def get(self, path: str, stac_type: str) -> Recipe: :param path: Path for which to retrieve the recipe """ if path in self.recipes[stac_type]: - return self.load_recipe(path) + return self.load_recipe(path, stac_type) for parent in chain([path], Path(path).parents): if parent in self.paths_map[stac_type]: @@ -168,3 +165,6 @@ def get(self, path: str, stac_type: str) -> Recipe: return self.load_recipe(key, stac_type) raise ValueError(f"No Recipe found for path: {path}") + + def get_maps(self): + return self.paths_map, self.location_map \ No newline at end of file diff --git a/stac_generator/core/extraction_method.py b/stac_generator/core/extraction_method.py index 3af6d5e..b56e38c 100644 --- a/stac_generator/core/extraction_method.py +++ b/stac_generator/core/extraction_method.py @@ -29,6 +29,9 @@ def __init__(self, **kwargs): # Override with specific processor settings self._set_attrs(kwargs) + if not hasattr(self, "exists_key"): + self.exists_key = "$" + def _set_attrs(self, conf: dict) -> None: """ Set instance attributes diff --git a/stac_generator/plugins/extraction_methods/asset_aggregator.py b/stac_generator/plugins/extraction_methods/asset_aggregator.py index ef9f2da..4820965 100644 --- a/stac_generator/plugins/extraction_methods/asset_aggregator.py +++ b/stac_generator/plugins/extraction_methods/asset_aggregator.py @@ -124,20 +124,22 @@ def run(self, body: dict, **kwargs) -> dict: for asset in body["assets"].values(): for list_term in self.list_terms: - body[list_term["name"]].append(asset[list_term["key"]]) + if list_term["key"] in asset: + body[list_term["name"]].append(asset[list_term["key"]]) for sum_term in self.sum_terms: - body[sum_term["name"]] += asset[sum_term["key"]] + if sum_term["key"] in asset: + body[sum_term["name"]] += asset[sum_term["key"]] for avg_term in self.avg_terms: body[avg_term["name"]] /= len(body["assets"]) for min_term in self.min_terms: - if asset[min_term["key"]] < body[min_term["name"]]: + if min_term["key"] in asset and asset[min_term["key"]] < body[min_term["name"]]: body[min_term["name"]] = asset[min_term["key"]] for max_term in self.max_terms: - if asset[max_term["key"]] < body[max_term["name"]]: + if max_term["key"] in asset and asset[max_term["key"]] < body[max_term["name"]]: body[max_term["name"]] = asset[max_term["key"]] return body diff --git a/stac_generator/plugins/extraction_methods/bbox.py b/stac_generator/plugins/extraction_methods/bbox.py index 1422325..1f02b51 100644 --- a/stac_generator/plugins/extraction_methods/bbox.py +++ b/stac_generator/plugins/extraction_methods/bbox.py @@ -41,14 +41,19 @@ class BboxExtract(BaseExtractionMethod): def run(self, body: dict, **kwargs): try: + west = body[self.coordinate_keys[0]] + south = body[self.coordinate_keys[1]] + east = body[self.coordinate_keys[2]] + north = body[self.coordinate_keys[3]] + coordinates = [ [ - float(body[self.coordinate_keys[0]]), - float(body[self.coordinate_keys[1]]), + float(west) if west is not None else west, + float(south) if south is not None else south, ], [ - float(body[self.coordinate_keys[2]]), - float(body[self.coordinate_keys[3]]), + float(east) if east is not None else east, + float(north) if north is not None else north, ], ] @@ -57,6 +62,9 @@ def run(self, body: dict, **kwargs): "coordinates": coordinates, } + except TypeError: + LOGGER.warning("Unable to convert bbox.", exc_info=True) + except KeyError: LOGGER.warning("Unable to convert bbox.", exc_info=True) diff --git a/stac_generator/plugins/extraction_methods/datetime_bound_to_centroid.py b/stac_generator/plugins/extraction_methods/datetime_bound_to_centroid.py new file mode 100644 index 0000000..0a73fb0 --- /dev/null +++ b/stac_generator/plugins/extraction_methods/datetime_bound_to_centroid.py @@ -0,0 +1,78 @@ +__author__ = "Richard Smith" +__date__ = "28 May 2021" +__copyright__ = "Copyright 2018 United Kingdom Research and Innovation" +__license__ = "BSD - see LICENSE file in top-level package directory" +__contact__ = "richard.d.smith@stfc.ac.uk" + + +import logging + +from datetime import datetime + +# Package imports +from stac_generator.core.extraction_method import BaseExtractionMethod + +LOGGER = logging.getLogger(__name__) + + +class DatetimeBoundToCentroidExtract(BaseExtractionMethod): + """ + + Processor Name: ``datetime_bound_to_centroid`` + + Description: + Accepts a dictionary of coordinate values and converts to `RFC 7946, section 5 `_ + formatted bbox. + + Configuration Options: + - ``coordinate_keys``: ``REQUIRED`` list of keys to convert to bbox array. Ordering is respected. + + Example Configuration: + + .. code-block:: yaml + + - method: stac_bbox + inputs: + output_term: polygon + + """ + def __init__(self, **kwargs): + super().__init__(**kwargs) + + if not hasattr(self, "start_term"): + self.start_term = {"name": "start_datetime", "format": "%Y-%m-%dT%H:%M:%S"} + + if "name" not in self.start_term: + self.start_term["name"] = "start_datetime" + + if "format" not in self.start_term: + self.start_term["format"] = "%Y-%m-%dT%H:%M:%S" + + if not hasattr(self, "end_term"): + self.end_term = {"name": "end_datetime", "format": "%Y-%m-%dT%H:%M:%S"} + + if "name" not in self.end_term: + self.end_term["name"] = "start_datetime" + + if "format" not in self.end_term: + self.end_term["format"] = "%Y-%m-%dT%H:%M:%S" + + if not hasattr(self, "output_term"): + self.output_term = {"name": "datetime", "format": "%Y-%m-%dT%H:%M:%S"} + + if "name" not in self.output_term: + self.output_term["name"] = "datetime" + + if "format" not in self.output_term: + self.output_term["format"] = "%Y-%m-%dT%H:%M:%S" + + def run(self, body: dict, **kwargs): + + start_datetime = datetime.strptime(body[self.start_term["name"]], self.start_term["format"]) + end_datetime = datetime.strptime(body[self.end_term["name"]], self.end_term["format"]) + + centroid_datetime = start_datetime + (end_datetime - start_datetime) / 2 + + body[self.output_term["name"]] = centroid_datetime.strftime(self.output_term["format"]) + + return body diff --git a/stac_generator/plugins/extraction_methods/default.py b/stac_generator/plugins/extraction_methods/default.py index cc639a9..3f5807d 100644 --- a/stac_generator/plugins/extraction_methods/default.py +++ b/stac_generator/plugins/extraction_methods/default.py @@ -45,12 +45,6 @@ class DefaultExtract(BaseExtractionMethod): """ - def __init__(self, **kwargs): - super().__init__(**kwargs) - - if not hasattr(self, "exists_key"): - self.exists_key = "$" - def run(self, body: dict, **kwargs) -> dict: defaults = {} for default_key, default_value in self.defaults.items(): diff --git a/stac_generator/plugins/extraction_methods/elasticsearch_aggregation.py b/stac_generator/plugins/extraction_methods/elasticsearch_aggregation.py index eb436c7..39cf9bd 100644 --- a/stac_generator/plugins/extraction_methods/elasticsearch_aggregation.py +++ b/stac_generator/plugins/extraction_methods/elasticsearch_aggregation.py @@ -15,6 +15,7 @@ # Third party imports from elasticsearch import Elasticsearch +from collections import defaultdict from stac_generator.core.extraction_method import BaseExtractionMethod @@ -68,24 +69,44 @@ def __init__(self, **kwargs): if not hasattr(self, "request_tiemout"): self.request_tiemout = 15 + for position, geo_bound_term in enumerate(self.geo_bounds): + if "name" not in geo_bound_term: + self.geo_bounds[position]["name"] = geo_bound_term["key"] + + for position, min_term in enumerate(self.min): + if "name" not in min_term: + self.min[position]["name"] = min_term["key"] + + for position, max_term in enumerate(self.max): + if "name" not in max_term: + self.max[position]["name"] = max_term["key"] + + for position, sum_term in enumerate(self.sum): + if "name" not in sum_term: + self.sum[position]["name"] = sum_term["key"] + + for position, list_term in enumerate(self.list): + if "name" not in list_term: + self.list[position]["name"] = list_term["key"] + @staticmethod - def bbox_query(facet: str) -> dict: + def geo_bounds_query(facet_key: str, facet_name: str) -> dict: """ Query to retrieve the BBOX from items """ - return {"bbox": {"geo_bounds": {"field": facet, "wrap_longitude": True}}} + return {facet_name: {"geo_bounds": {"field": facet_key}}} @staticmethod - def facet_composite_query(facet: str) -> dict: + def facet_composite_query(facet_key: str, facet_name: str) -> dict: """ Generate the composite aggregation for the facet :param facet: Facet to aggregate on """ return { - facet: { + facet_name: { "composite": { "sources": [ - {facet: {"terms": {"field": f"properties.{facet}.keyword"}}} + {facet_name: {"terms": {"field": facet_key}}} ], "size": 100, } @@ -93,39 +114,43 @@ def facet_composite_query(facet: str) -> dict: } @staticmethod - def min_query(facet: str) -> dict: + def min_query(facet_key: str, facet_name: str) -> dict: """ Query to retrieve the minimum value from docs """ - return {facet: {"min": {"field": f"properties.{facet}"}}} + return {facet_name: {"min": {"field": facet_key}}} @staticmethod - def max_query(facet: str) -> dict: + def max_query(facet_key: str, facet_name: str) -> dict: """ Query to retrieve the maximum value from docs """ - return {facet: {"max": {"field": f"properties.{facet}"}}} + return {facet_name: {"max": {"field": facet_key}}} @staticmethod - def sum_query(facet: str) -> dict: + def sum_query(facet_key: str, facet_name: str) -> dict: """ Query to retrieve the sum of the values from docs """ - return {facet: {"sum": {"field": f"properties.{facet}"}}} + return {facet_name: {"sum": {"field": facet_key}}} def extract_facet(self, facets: list): """ Function to extract the given facets from the aggregation """ for facet in facets: - if facet in self.aggregations.keys(): - if "value_as_string" in self.aggregations[facet].keys(): - value = self.aggregations[facet]["value_as_string"] + if facet["name"] in self.aggregations.keys(): + + if "value_as_string" in self.aggregations[facet["name"]].keys(): + value = self.aggregations[facet["name"]]["value_as_string"] + elif "bounds" in self.aggregations[facet["name"]].keys(): + value = self.aggregations[facet["name"]]["bounds"] + else: - value = self.aggregations[facet]["value"] + value = self.aggregations[facet["name"]]["value"] - self.metadata[facet] = value + self.metadata[facet["name"]] = value def extract_first_facet(self, facets: list): """ @@ -134,29 +159,29 @@ def extract_first_facet(self, facets: list): properties = self.hits[0]["_source"]["properties"] for facet in facets: - if facet in properties.keys(): - self.metadata[facet] = properties[facet] + if facet["key"] in properties.keys(): + self.metadata[facet["name"]] = properties[facet["key"]] def extract_facet_list(self, facets: list): """ Function to extract the lists of given facets from the aggregation """ next_query = self.base_query - items = self.aggregations + current_aggregations = self.aggregations while True: for facet in facets: - if facet in items.keys(): - aggregation = items[facet] + if facet["name"] in current_aggregations.keys(): + aggregation = current_aggregations[facet["name"]] - self.metadata[facet].extend( - [bucket["key"][facet] for bucket in aggregation["buckets"]] + self.metadata[facet["name"]].extend( + [bucket["key"][facet["name"]] for bucket in aggregation["buckets"]] ) if hasattr(aggregation, "after_key"): - next_query["aggs"] |= self.query["aggs"][facet] - next_query["aggs"][facet]["composite"]["sources"]["after"] = { - facet: aggregation["after_key"][facet] + next_query["aggs"] |= self.query["aggs"][facet["name"]] + next_query["aggs"][facet["name"]]["composite"]["sources"]["after"] = { + facet["name"]: aggregation["after_key"][facet["name"]] } if next_query == self.base_query: @@ -164,7 +189,7 @@ def extract_facet_list(self, facets: list): else: result = self.es.search(index=self.index, body=next_query) - items = result["aggregations"].items() + current_aggregations = result["aggregations"].items() def construct_base_query(self, key: str, uri: str) -> dict: """ @@ -176,7 +201,7 @@ def construct_base_query(self, key: str, uri: str) -> dict: "query": { "bool": { "must_not": [{"term": {"categories.keyword": {"value": "hidden"}}}], - "must": [{"term": {f"{key}.keyword": {"value": uri}}}], + "must": [{"term": {f"{key}": {"value": uri}}}], } }, "aggs": {}, @@ -188,25 +213,25 @@ def construct_query(self): """ self.query = self.base_query - if hasattr(self, "bbox"): - for bbox_term in self.bbox: - self.query["aggs"].update(self.bbox_query(bbox_term)) + if hasattr(self, "geo_bounds"): + for geo_term in self.geo_bounds: + self.query["aggs"].update(self.geo_bounds_query(geo_term["key"], geo_term["name"])) if hasattr(self, "min"): for min_term in self.min: - self.query["aggs"].update(self.min_query(min_term)) + self.query["aggs"].update(self.min_query(min_term["key"], min_term["name"])) if hasattr(self, "max"): for max_term in self.max: - self.query["aggs"].update(self.max_query(max_term)) + self.query["aggs"].update(self.max_query(max_term["key"], max_term["name"])) if hasattr(self, "sum"): for sum_term in self.sum: - self.query["aggs"].update(self.sum_query(sum_term)) + self.query["aggs"].update(self.sum_query(sum_term["key"], sum_term["name"])) if hasattr(self, "list"): for list_term in self.list: - self.query["aggs"].update(self.facet_composite_query(list_term)) + self.query["aggs"].update(self.facet_composite_query(list_term["key"], list_term["name"])) def extract_metadata(self): """ @@ -215,8 +240,8 @@ def extract_metadata(self): if hasattr(self, "first"): self.extract_first_facet(self.first) - if hasattr(self, "bbox"): - self.extract_facet(self.bbox) + if hasattr(self, "geo_bounds"): + self.extract_facet(self.geo_bounds) if hasattr(self, "min"): self.extract_facet(self.min) @@ -231,7 +256,7 @@ def extract_metadata(self): self.extract_facet_list(self.list) def run(self, body: dict, **kwargs) -> dict: - self.metadata = body + self.metadata = defaultdict(list) self.construct_base_query(self.id_term, body["uri"]) @@ -251,4 +276,4 @@ def run(self, body: dict, **kwargs) -> dict: # Extract metadata self.extract_metadata() - return self.metadata + return body | self.metadata diff --git a/stac_generator/plugins/extraction_methods/elasticsearch_assets.py b/stac_generator/plugins/extraction_methods/elasticsearch_assets.py index bbfedad..3fadf5d 100644 --- a/stac_generator/plugins/extraction_methods/elasticsearch_assets.py +++ b/stac_generator/plugins/extraction_methods/elasticsearch_assets.py @@ -125,7 +125,7 @@ def run(self, body: dict, **kwargs) -> dict: for extraction_method in self.extraction_methods: asset = extraction_method.run(asset) - assets[Path(path).stem] = asset + assets[Path(path).name] = asset body["assets"] = assets diff --git a/stac_generator/plugins/extraction_methods/geometry_polygon.py b/stac_generator/plugins/extraction_methods/geometry_polygon.py index f678363..95119e9 100644 --- a/stac_generator/plugins/extraction_methods/geometry_polygon.py +++ b/stac_generator/plugins/extraction_methods/geometry_polygon.py @@ -66,7 +66,7 @@ def run(self, body: dict, **kwargs): body["geometry"] = { "type": "Polygon", - "coordinates": coordinates, + "coordinates": [coordinates], } except KeyError: diff --git a/stac_generator/plugins/extraction_methods/geometry_to_bbox.py b/stac_generator/plugins/extraction_methods/geometry_to_bbox.py new file mode 100644 index 0000000..927f35e --- /dev/null +++ b/stac_generator/plugins/extraction_methods/geometry_to_bbox.py @@ -0,0 +1,68 @@ +__author__ = "Richard Smith" +__date__ = "28 May 2021" +__copyright__ = "Copyright 2018 United Kingdom Research and Innovation" +__license__ = "BSD - see LICENSE file in top-level package directory" +__contact__ = "richard.d.smith@stfc.ac.uk" + + +import logging + +# Package imports +from stac_generator.core.extraction_method import BaseExtractionMethod + +LOGGER = logging.getLogger(__name__) + + +class GeometryToBboxExtract(BaseExtractionMethod): + """ + + Processor Name: ``bbox`` + + Description: + Accepts a dictionary of coordinate values and converts to `RFC 7946, section 5 `_ + formatted bbox. + + Configuration Options: + - ``coordinate_keys``: ``REQUIRED`` list of keys to convert to bbox array. Ordering is respected. + + Example Configuration: + + .. code-block:: yaml + + - method: stac_bbox + inputs: + type: polygon + + """ + def __init__(self, **kwargs): + super().__init__(**kwargs) + + if not hasattr(self, "input_term"): + self.input_term = "geometry" + + if not hasattr(self, "output_term"): + self.output_term = "bbox" + + def run(self, body: dict, **kwargs): + + coordinates = body[self.input_term]["coordinates"][0] + bbox = [coordinates[0][0], coordinates[0][1], coordinates[0][0], coordinates[0][1]] + + if self.type in ["polygon", "line"]: + for coordinate in coordinates: + + if coordinate[0] < bbox[0]: + bbox[0] = coordinate[0] + + elif coordinate[0] > bbox[2]: + bbox[2] = coordinate[0] + + if coordinate[1] < bbox[1]: + bbox[1] = coordinate[1] + + elif coordinate[1] > bbox[3]: + bbox[3] = coordinate[1] + + body[self.output_term] = bbox + + return body diff --git a/stac_generator/plugins/extraction_methods/iso_date.py b/stac_generator/plugins/extraction_methods/iso_date.py index 4281ec5..4be0577 100644 --- a/stac_generator/plugins/extraction_methods/iso_date.py +++ b/stac_generator/plugins/extraction_methods/iso_date.py @@ -61,21 +61,23 @@ def run(self, body: dict, **kwargs) -> dict: date_iso = None if not date_str: - LOGGER.error(f"{date_key} not present in body for {body['uri']}") - - if hasattr(self, "formats"): + LOGGER.error(f"{date_key} not present in body for {body.get('uri', body.get('href'))}") - for date_format in self.formats: - try: - date_iso = datetime.strptime(date_str, date_format).isoformat() + else: - except ValueError: - pass + if hasattr(self, "formats"): + + for date_format in self.formats: + try: + date_iso = datetime.strptime(date_str, date_format).isoformat() - else: - date_iso = datetime.strptime(date_str).isoformat() + except ValueError: + pass + + else: + date_iso = datetime.strptime(date_str).isoformat() - if date_iso: - body[date_key] = date_iso + if date_iso: + body[date_key] = date_iso return body diff --git a/stac_generator/plugins/extraction_methods/lambda.py b/stac_generator/plugins/extraction_methods/lambda.py index ad58f85..49ba492 100644 --- a/stac_generator/plugins/extraction_methods/lambda.py +++ b/stac_generator/plugins/extraction_methods/lambda.py @@ -9,6 +9,7 @@ import logging import re import ast +import traceback # Package imports @@ -70,28 +71,40 @@ def __init__(self, **kwargs): def run(self, body: dict, **kwargs): output_body = body.copy() - function = eval(self.function) + try: + function = eval(self.function) - function_args = [] - for input_arg in self.input_args: - if isinstance(input_arg, str) and input_arg[0] == self.exists_key: - input_arg = body[input_arg[1:]] + function_args = [] + for input_arg in self.input_args: + if isinstance(input_arg, str) and input_arg[0] == self.exists_key: + if input_arg[1:] in body: + input_arg = body[input_arg[1:]] + else: + return body - function_args.append(input_arg) + function_args.append(input_arg) - function_kwargs = {} - for input_kwarg_key, input_kwarg_value in self.input_kwargs.items(): - if isinstance(input_kwarg_value, str) and input_kwarg_value[0] == self.exists_key: - input_kwarg_value = body[input_kwarg_value[1:]] + function_kwargs = {} + for input_kwarg_key, input_kwarg_value in self.input_kwargs.items(): + if isinstance(input_kwarg_value, str) and input_kwarg_value[0] == self.exists_key: + if input_kwarg_value[1:] in body: + input_kwarg_value = body[input_kwarg_value[1:]] + else: + return body - function_kwargs[input_kwarg_key] = input_kwarg_value + function_kwargs[input_kwarg_key] = input_kwarg_value - result = function(*function_args, **function_kwargs) + result = function(*function_args, **function_kwargs) - if self.output_key: - output_body[self.output_key] = result + if self.output_key: + output_body[self.output_key] = result - elif isinstance(result, dict): - output_body |= result + elif isinstance(result, dict): + output_body |= result - return output_body + return output_body + + except Exception as e: + LOGGER.warning(f"Lamda function: {self.function} failed.") + + return output_body \ No newline at end of file diff --git a/stac_generator/plugins/extraction_methods/netcdf.py b/stac_generator/plugins/extraction_methods/netcdf.py index 048ff37..abc92a6 100644 --- a/stac_generator/plugins/extraction_methods/netcdf.py +++ b/stac_generator/plugins/extraction_methods/netcdf.py @@ -134,6 +134,10 @@ def run(self, body: dict, **kwargs) -> dict: name = cf_term.get("name") key = cf_term.get("key", name) - body[name] = cf_attributes[key] + try: + body[name] = cf_attributes[key].strip() + + except KeyError: + body[name] = None return body diff --git a/stac_generator/plugins/extraction_methods/open_zip.py b/stac_generator/plugins/extraction_methods/open_zip.py new file mode 100644 index 0000000..b4bfcfa --- /dev/null +++ b/stac_generator/plugins/extraction_methods/open_zip.py @@ -0,0 +1,117 @@ +# encoding: utf-8 +""" +.. _xml-extract: + +XML Extract +------------ +""" +__author__ = "Richard Smith" +__date__ = "19 Aug 2021" +__copyright__ = "Copyright 2018 United Kingdom Research and Innovation" +__license__ = "BSD - see LICENSE file in top-level package directory" +__contact__ = "richard.d.smith@stfc.ac.uk" + +# Python imports +from collections import defaultdict +import logging +import zipfile +import tempfile +from pathlib import Path +from xml.etree import ElementTree +from xml.etree.ElementTree import ParseError + +from stac_generator.core.extraction_method import BaseExtractionMethod + +# Package imports + + +LOGGER = logging.getLogger(__name__) + + +class ZipExtract(BaseExtractionMethod): + """ + .. list-table:: + + * - Processor Name + - ``xml`` + + Description: + Processes XML documents to extract metadata + + Configuration Options: + - ``extraction_keys``: List of keys to retrieve from the document. + - ``filter_expr``: Regex to match against files to limit the attempts to known files + - ``namespaces``: Map of namespaces + + Extraction Keys: + Extraction keys should be a map. + + .. list-table:: + + * - Name + - Description + * - ``name`` + - Name of the outputted attribute + * - ``key`` + - Access key to extract the required data. Passed to + `xml.etree.ElementTree.find() `_ + and also supports `xpath formatted `_ accessors + * - ``attribute`` + - Allows you to select from the element attribute. In the absence of this value, the default behaviour is to access the text value of the key. + In some cases, you might want to access and attribute of the element. + + Example: + .. code-block:: yaml + + - method: start_datetime + key: './/gml:beginPosition' + + Example configuration: + .. code-block:: yaml + + - method: xml + inputs: + filter_expr: '\.manifest$' + extraction_keys: + - name: start_datetime + key: './/gml:beginPosition' + attribute: start + + # noqa: W605 + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + if not hasattr(self, "output_key"): + self.output_key = "zip_file" + + if not hasattr(self, "inner_file"): + self.inner_file = "" + + if not hasattr(self, "zip_file"): + self.zip_file = "uri" + + def run(self, body: dict, **kwargs) -> dict: + # Extract the keys + self.body = body + try: + if self.zip_file[0] == self.exists_key: + self.zip_file = body[self.zip_file[1:]] + + if self.inner_file[0] == self.exists_key: + self.inner_file = body[self.inner_file[1:]] + + with zipfile.ZipFile(self.zip_file) as z: + if self.inner_file: + file_obj = z.read(self.inner_file) + else: + file_obj = z.read() + + except FileNotFoundError: + # return body + file_obj = tempfile.TemporaryFile() + + body[self.output_key] = file_obj + + return body diff --git a/stac_generator/plugins/extraction_methods/regex.py b/stac_generator/plugins/extraction_methods/regex.py index 415e114..8a6dc5f 100644 --- a/stac_generator/plugins/extraction_methods/regex.py +++ b/stac_generator/plugins/extraction_methods/regex.py @@ -59,7 +59,6 @@ def run(self, body: dict, **kwargs) -> dict: result = re.search(self.regex, body[self.input_term]) if result: - LOGGER.info("Found matches for regex extract") body = body | result.groupdict() else: diff --git a/stac_generator/plugins/extraction_methods/remove.py b/stac_generator/plugins/extraction_methods/remove.py index 5ca8ffe..a2e5b1d 100644 --- a/stac_generator/plugins/extraction_methods/remove.py +++ b/stac_generator/plugins/extraction_methods/remove.py @@ -42,6 +42,7 @@ class RemoveExtract(BaseExtractionMethod): def run(self, body: dict, **kwargs): for key in self.keys: - del body[key] + if key in body: + del body[key] return body diff --git a/stac_generator/plugins/extraction_methods/stac_bbox.py b/stac_generator/plugins/extraction_methods/stac_bbox.py index 117dfb4..d2aea2f 100644 --- a/stac_generator/plugins/extraction_methods/stac_bbox.py +++ b/stac_generator/plugins/extraction_methods/stac_bbox.py @@ -38,14 +38,24 @@ class STACBboxExtract(BaseExtractionMethod): - north """ + def __init__(self, **kwargs): + super().__init__(**kwargs) + + if not hasattr(self, "output_term"): + self.output_term = "bbox" def run(self, body: dict, **kwargs): try: - body["bbox"] = [ - float(body[self.coordinate_keys[0]]), - float(body[self.coordinate_keys[1]]), - float(body[self.coordinate_keys[2]]), - float(body[self.coordinate_keys[3]]), + west = body[self.coordinate_keys[0]] + south = body[self.coordinate_keys[1]] + east = body[self.coordinate_keys[2]] + north = body[self.coordinate_keys[3]] + + body[self.output_term] = [ + float(west) if west is not None else west, + float(south) if south is not None else south, + float(east) if east is not None else east, + float(north) if north is not None else north, ] except KeyError: diff --git a/stac_generator/plugins/extraction_methods/xml.py b/stac_generator/plugins/extraction_methods/xml.py index acf5abc..400259c 100644 --- a/stac_generator/plugins/extraction_methods/xml.py +++ b/stac_generator/plugins/extraction_methods/xml.py @@ -12,6 +12,7 @@ __contact__ = "richard.d.smith@stfc.ac.uk" # Python imports +from collections import defaultdict import logging import os from pathlib import Path @@ -87,23 +88,41 @@ def __init__(self, **kwargs): def run(self, body: dict, **kwargs) -> dict: # Extract the keys try: - xml_file = ElementTree.parse(body[self.input_term]) + if isinstance(body[self.input_term], str): + xml_file = ElementTree.parse(body[self.input_term]) - except ParseError: + else: + xml_file = ElementTree.XML(body[self.input_term]) + + except (ParseError, FileNotFoundError, TypeError): return body + output = defaultdict(list) + for key in self.extraction_keys: - value = xml_file.find(key["key"], self.namespaces) + values = xml_file.findall(key["key"], self.namespaces) + + for value in values: + + if value is not None: + attribute = key.get("attribute") + + if attribute: + v = value.get(attribute, "") + + else: + v = value.text + + if v and v not in output[key["name"]]: + output[key["name"]].append(v.strip()) - if value is not None: - attribute = key.get("attribute") + if output[key["name"]] and len(output[key["name"]]) == 1: + output[key["name"]] = output[key["name"]][0] - if attribute: - v = value.get(attribute, "") + if not output[key["name"]]: + output[key["name"]] = None - else: - v = value.text - body[key["name"]] = v + body |= output return body diff --git a/stac_generator/plugins/mappings/stac.py b/stac_generator/plugins/mappings/stac.py index 69266b1..bb86071 100644 --- a/stac_generator/plugins/mappings/stac.py +++ b/stac_generator/plugins/mappings/stac.py @@ -32,17 +32,12 @@ class STACMapping(BaseMapping): """ - def run( - self, - body: dict, - recipe: Recipe, - **kwargs, - ) -> dict: + def item(self, body:dict) -> dict: output = { "type": "Feature", "stac_version": self.stac_version, "stac_extensions": self.stac_extensions, - "id": body.pop(f"{kwargs['TYPE'].value}_id"), + "id": body.pop("item_id"), "geometry": None, "assets": {}, "properties": { @@ -78,3 +73,51 @@ def run( output["properties"] |= body return output + + def collection(self, body: dict) -> dict: + output = { + "type": "Collection", + "stac_version": self.stac_version, + "stac_extensions": self.stac_extensions, + "id": body.pop("collection_id"), + "description": body.pop("description"), + "extent": { + "temporal": { + "interval": None, + }, + "spatial": { + "bbox": None, + } + }, + "summaries": {}, + "assets": {}, + } + + extent = {} + if "interval" in body: + output["extent"]["temporal"]["interval"] = body.pop("interval") + + if "bbox" in body: + output["extent"]["spatial"]["bbox"] = body.pop("bbox") + + if "member_of_recipes" in body: + output["member_of_recipes"] = body.pop("member_of_recipes") + + output["summaries"] |= body + + return output + + + def run( + self, + body: dict, + recipe: Recipe, + **kwargs, + ) -> dict: + if kwargs['TYPE'].value == "item": + return self.item(body) + + elif kwargs['TYPE'].value == "collection": + return self.collection(body) + + return body \ No newline at end of file diff --git a/stac_generator/plugins/outputs/stac_fastapi.py b/stac_generator/plugins/outputs/stac_fastapi.py index ea73701..af56795 100644 --- a/stac_generator/plugins/outputs/stac_fastapi.py +++ b/stac_generator/plugins/outputs/stac_fastapi.py @@ -45,6 +45,7 @@ from stac_generator.core.output import BaseOutput +LOGGER = logging.getLogger(__name__) class STACFastAPIOutput(BaseOutput): """ @@ -58,8 +59,7 @@ def __init__(self, **kwargs): if not hasattr(self, "verify"): self.verify = True - def export(self, data: dict, **kwargs) -> None: - + def item(self, data: dict) -> None: collections = data["collection"] if isinstance(data["collection"], str): @@ -67,6 +67,8 @@ def export(self, data: dict, **kwargs) -> None: for collection in collections: + collection = data["collection"] = collection.lower() + response = requests.post( urljoin(self.api_url, f"collections/{collection}/items"), json=data, verify=self.verify ) @@ -87,3 +89,21 @@ def export(self, data: dict, **kwargs) -> None: response = requests.post( urljoin(self.api_url, f"collections/{collection}/items"), json=data, verify=self.verify ) + + elif response.status_code != 200: + LOGGER.warning(f"FastAPI Output failed with status code: {response.status_code} and response text: {response.text}") + + def collection(self, data: dict) -> None: + + response = requests.update( + urljoin(self.api_url, f"collections"), json=data, verify=self.verify + ) + + + def export(self, data: dict, **kwargs) -> None: + + if kwargs['TYPE'].value == "item": + self.item(data) + + elif kwargs['TYPE'].value == "collection": + self.collection(data) diff --git a/stac_generator/scripts/recipe_keys.py b/stac_generator/scripts/recipe_keys.py new file mode 100644 index 0000000..5282fd4 --- /dev/null +++ b/stac_generator/scripts/recipe_keys.py @@ -0,0 +1,81 @@ +# encoding: utf-8 +""" + +""" +__author__ = "Richard Smith" +__date__ = "08 Jun 2021" +__copyright__ = "Copyright 2018 United Kingdom Research and Innovation" +__license__ = "BSD - see LICENSE file in top-level package directory" +__contact__ = "richard.d.smith@stfc.ac.uk" + +import argparse +import cProfile +import logging + +import click +import pkg_resources +import yaml + +from stac_generator.core.exceptions import NoPluginsError +from stac_generator.core.generator import BaseGenerator +from stac_generator.core.utils import load_plugins + +def load_config(path): + with open(path) as reader: + conf = yaml.safe_load(reader) + return conf + + +def load_generator(conf: dict) -> BaseGenerator: + """ + Load the generator. + + Looks for generator defined in the configuration in preference + and falls back to the first defined entry point + at ``stac_generator.generators`` + + :param conf: Configuration dict + :return: Generator + """ + + generator = None + + if conf.get("generator"): + entry_points = pkg_resources.iter_entry_points( + "stac_generator.generators", conf.get("generator") + ) + + for entry_point in entry_points: + generator = entry_point.load() + + # Only load the first one + break + + if not generator: + raise NoPluginsError("No extraction plugins have been loaded") + + return generator(conf) + + +@click.command() +@click.option( + "--conf", + "-c", + "conf", + required=True, + help="Path for generator configuration.", +) +def main(conf): + + conf = load_config(conf) + + generator = load_generator(conf) + + paths_map, location_map = generator.recipes.get_maps() + + print("Path map", path_map) + print("Location map", location_map) + + +if __name__ == "__main__": + main() diff --git a/stac_generator/scripts/stac_generator.py b/stac_generator/scripts/stac_generator.py index 27dba53..3a35ee9 100644 --- a/stac_generator/scripts/stac_generator.py +++ b/stac_generator/scripts/stac_generator.py @@ -20,19 +20,6 @@ from stac_generator.core.generator import BaseGenerator from stac_generator.core.utils import load_plugins - -def command_args(): - """ - Sets the command line arguments and handles their parsing - :return: command line options - """ - parser = argparse.ArgumentParser(description="Run the asset scanner as configured") - parser.add_argument("conf", help="Path to a yaml configuration file") - args = parser.parse_args() - - return args - - def setup_logging(conf): config = conf.get("logging", {}) if not config or (config and not config.get("format")):