diff --git a/docs/clay_over_aoi.ipynb b/docs/clay_over_aoi.ipynb
new file mode 100644
index 00000000..51dcbb74
--- /dev/null
+++ b/docs/clay_over_aoi.ipynb
@@ -0,0 +1,807 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "c67aed17-49b0-4cc3-9bcb-6110c05ff809",
+ "metadata": {},
+ "source": [
+ "# How to run clay over custom AOIs\n",
+ "\n",
+ "This script shows in a few simple steps how the clay model can be run for custom AOIs and over custom date ranges.\n",
+ "\n",
+ "## Download and open global list of MGRS tiles"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "b11e3a6a-3fa9-4c36-9f02-8991f7b4ec8e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from pathlib import Path\n",
+ "\n",
+ "# The repo home is our working directory.\n",
+ "# NOTE: this moves one level up from the current directory, so the kernel is\n",
+ "# expected to start in the folder holding this notebook. Re-running the cell\n",
+ "# without restarting the kernel moves up one more level.\n",
+ "wd = Path.cwd().parent\n",
+ "os.chdir(wd)\n",
+ "# Ensure data directories exist. `parents=True` also creates the top-level\n",
+ "# `data` folder, which may be missing on a fresh checkout.\n",
+ "for subdir in (\"mgrs\", \"chips\", \"checkpoints\", \"embeddings\"):\n",
+ "    (Path(\"data\") / subdir).mkdir(parents=True, exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b65dbec9-725d-4032-bc3e-13e0dbebfcf0",
+ "metadata": {},
+ "source": [
+ "A full list of MGRS tiles has been created as part of the landcover based sampling strategy. The file is [sourced from a complete MGRS tile list](https://github.com/Clay-foundation/model/blob/main/scripts/landcover.sh#L7), and then [intersected with the WorldCover landcover](https://github.com/Clay-foundation/model/blob/main/scripts/landcover.py) layer, outputting the `mgrs_full.fgb` file that is used below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "6643b779-5f87-42ce-87f6-d4247f60db12",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "--2024-01-15 12:40:58-- https://clay-mgrs-samples.s3.amazonaws.com/mgrs_full.fgb\n",
+ "Resolving clay-mgrs-samples.s3.amazonaws.com (clay-mgrs-samples.s3.amazonaws.com)... 3.5.28.235, 54.231.165.153, 52.217.89.132, ...\n",
+ "Connecting to clay-mgrs-samples.s3.amazonaws.com (clay-mgrs-samples.s3.amazonaws.com)|3.5.28.235|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 13787464 (13M) [binary/octet-stream]\n",
+ "Saving to: ‘data/mgrs/mgrs_full.fgb’\n",
+ "\n",
+ "data/mgrs/mgrs_full 100%[===================>] 13.15M 4.30MB/s in 3.1s \n",
+ "\n",
+ "2024-01-15 12:41:02 (4.30 MB/s) - ‘data/mgrs/mgrs_full.fgb’ saved [13787464/13787464]\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "import geopandas as gpd\n",
+ "\n",
+ "! wget https://clay-mgrs-samples.s3.amazonaws.com/mgrs_full.fgb -O data/mgrs/mgrs_full.fgb"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "beb8d805-38c6-4b08-bff8-fdb59f598535",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
Name
\n",
+ "
geometry
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
59CNK
\n",
+ "
MULTIPOLYGON (((170.99885 -81.06088, 177.30316...
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
59CNJ
\n",
+ "
MULTIPOLYGON (((170.99872 -81.95638, 177.99281...
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
58CEN
\n",
+ "
MULTIPOLYGON (((164.99856 -82.85238, 172.85358...
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
58CEP
\n",
+ "
MULTIPOLYGON (((164.99872 -81.95638, 171.99281...
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
60CVQ
\n",
+ "
MULTIPOLYGON (((171.25220 -81.01644, 177.56261...
\n",
+ "
\n",
+ "
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
\n",
+ "
\n",
+ "
56681
\n",
+ "
02CMR
\n",
+ "
MULTIPOLYGON (((-176.23170 -80.12446, -170.488...
\n",
+ "
\n",
+ "
\n",
+ "
56682
\n",
+ "
02CMQ
\n",
+ "
MULTIPOLYGON (((-176.74780 -81.01644, -170.437...
\n",
+ "
\n",
+ "
\n",
+ "
56683
\n",
+ "
01CEK
\n",
+ "
MULTIPOLYGON (((-177.00115 -81.06088, -170.696...
\n",
+ "
\n",
+ "
\n",
+ "
56684
\n",
+ "
01CEJ
\n",
+ "
MULTIPOLYGON (((-177.00128 -81.95638, -170.007...
\n",
+ "
\n",
+ "
\n",
+ "
56685
\n",
+ "
01CEH
\n",
+ "
MULTIPOLYGON (((-177.00144 -82.85238, -169.146...
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
56686 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name geometry\n",
+ "0 59CNK MULTIPOLYGON (((170.99885 -81.06088, 177.30316...\n",
+ "1 59CNJ MULTIPOLYGON (((170.99872 -81.95638, 177.99281...\n",
+ "2 58CEN MULTIPOLYGON (((164.99856 -82.85238, 172.85358...\n",
+ "3 58CEP MULTIPOLYGON (((164.99872 -81.95638, 171.99281...\n",
+ "4 60CVQ MULTIPOLYGON (((171.25220 -81.01644, 177.56261...\n",
+ "... ... ...\n",
+ "56681 02CMR MULTIPOLYGON (((-176.23170 -80.12446, -170.488...\n",
+ "56682 02CMQ MULTIPOLYGON (((-176.74780 -81.01644, -170.437...\n",
+ "56683 01CEK MULTIPOLYGON (((-177.00115 -81.06088, -170.696...\n",
+ "56684 01CEJ MULTIPOLYGON (((-177.00128 -81.95638, -170.007...\n",
+ "56685 01CEH MULTIPOLYGON (((-177.00144 -82.85238, -169.146...\n",
+ "\n",
+ "[56686 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mgrs = gpd.read_file(\"data/mgrs/mgrs_full.fgb\")\n",
+ "mgrs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "05bd8621-f4d0-40e3-8a4c-ac279f377ec0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\n",
+ "Name: WGS 84\n",
+ "Axis Info [ellipsoidal]:\n",
+ "- Lat[north]: Geodetic latitude (degree)\n",
+ "- Lon[east]: Geodetic longitude (degree)\n",
+ "Area of Use:\n",
+ "- name: World.\n",
+ "- bounds: (-180.0, -90.0, 180.0, 90.0)\n",
+ "Datum: World Geodetic System 1984 ensemble\n",
+ "- Ellipsoid: WGS 84\n",
+ "- Prime Meridian: Greenwich"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mgrs.crs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a5211112-d169-40b4-83af-1a7cc40c2439",
+ "metadata": {},
+ "source": [
+ "## Create a Geopandas dataframe with AOI\n",
+ "\n",
+ "This example uses a bounding box over the area around Puri, India."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "8a5fb076-e19f-4b34-b938-02b06829fef7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import geopandas as gpd\n",
+ "import pandas as pd\n",
+ "from shapely import box\n",
+ "\n",
+ "# Bounding box around Puri, given as (xmin, ymin, xmax, ymax) in EPSG:4326.\n",
+ "puri_bbox = box(85.0503, 19.4949, 86.1042, 20.5642)\n",
+ "\n",
+ "# Single-row GeoDataFrame holding the AOI polygon and its region label.\n",
+ "aoi = gpd.GeoDataFrame(\n",
+ "    pd.DataFrame({\"Region\": [\"Puri\"]}),\n",
+ "    geometry=[puri_bbox],\n",
+ "    crs=\"EPSG:4326\",\n",
+ ")\n",
+ "aoi.geometry.iloc[0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b1f838fa-4c18-4934-b6cc-dff4702145c9",
+ "metadata": {},
+ "source": [
+ "## Intersect the AOI with the MGRS tile layer\n",
+ "\n",
+ "This will select the MGRS tiles that intersect with your AOI. The processing will then happen for each of the MGRS tiles. This will most likely provide slightly more data than the AOI itself, as the whole tile data will be downloaded for each matched MGRS tile.\n",
+ "\n",
+ "Store the intersected tiles in a file; it will be used by the `datacube.py` script."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "7ba0c7c2-9937-461a-8c48-d9c607e93bd0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Intersect the MGRS tile layer with the AOI, then lowercase the `Name` column\n",
+ "# so that the datacube script picks up the MGRS tile name.\n",
+ "mgrs_aoi = mgrs.overlay(aoi, how=\"intersection\").rename(\n",
+ "    columns={\"Name\": \"name\"}\n",
+ ")\n",
+ "mgrs_aoi.geometry[2]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "045a0e36-025a-4381-b633-85ab236b1450",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
name
\n",
+ "
Region
\n",
+ "
geometry
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
45QTC
\n",
+ "
Puri
\n",
+ "
POLYGON ((85.18408 19.79882, 85.05030 19.79707...
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
45QUC
\n",
+ "
Puri
\n",
+ "
POLYGON ((85.09075 19.79784, 85.08133 20.56420...
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
45QVC
\n",
+ "
Puri
\n",
+ "
POLYGON ((86.04484 19.80549, 86.04018 20.56420...
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
45QVB
\n",
+ "
Puri
\n",
+ "
POLYGON ((86.10420 19.89401, 86.10420 19.49490...
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
45QTB
\n",
+ "
Puri
\n",
+ "
POLYGON ((85.18307 19.88715, 85.18741 19.49490...
\n",
+ "
\n",
+ "
\n",
+ "
5
\n",
+ "
45QUB
\n",
+ "
Puri
\n",
+ "
POLYGON ((86.10420 19.89408, 86.10420 19.49490...
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name Region geometry\n",
+ "0 45QTC Puri POLYGON ((85.18408 19.79882, 85.05030 19.79707...\n",
+ "1 45QUC Puri POLYGON ((85.09075 19.79784, 85.08133 20.56420...\n",
+ "2 45QVC Puri POLYGON ((86.04484 19.80549, 86.04018 20.56420...\n",
+ "3 45QVB Puri POLYGON ((86.10420 19.89401, 86.10420 19.49490...\n",
+ "4 45QTB Puri POLYGON ((85.18307 19.88715, 85.18741 19.49490...\n",
+ "5 45QUB Puri POLYGON ((86.10420 19.89408, 86.10420 19.49490..."
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mgrs_aoi.to_file(\"data/mgrs/mgrs_aoi.fgb\")\n",
+ "mgrs_aoi"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bd5a5329-3ffd-4de8-9e36-c3b7a5f47e60",
+ "metadata": {},
+ "source": [
+ "## Use the datacube.py script to download imagery\n",
+ "\n",
+ "Each run of the datacube script will take an index as input, which is the index of the MGRS tile within the input file. This is why we need to download the data in a loop.\n",
+ "\n",
+ "A list of date ranges can be specified. The script will look for the least cloudy Sentinel-2 scene for each date range, and match Sentinel-1 dates near the identified Sentinel-2 dates.\n",
+ "\n",
+ "The output folder can be specified as a local folder, or a bucket can be specified to upload the data to S3.\n",
+ "\n",
+ "Note that for the script to run, a Microsoft Planetary Computer token needs to be set up. Consult the [Planetary Computer SDK](https://github.com/microsoft/planetary-computer-sdk-for-python) documentation on how to set up the token."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "a06149e2-116a-4a49-92a8-43aaa053bb43",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Usage: datacube.py [OPTIONS]\n",
+ "\n",
+ "Options:\n",
+ " --sample TEXT Location of MGRS tile sample\n",
+ " --index INTEGER Index of MGRS tile from sample file that should be\n",
+ " processed\n",
+ " --bucket TEXT Specify the bucket for where to write the data.\n",
+ " --subset TEXT For debugging, subset x and y to this pixel window as a\n",
+ " commaseparated string of 4 integers.\n",
+ " --localpath TEXT If specified, this path will be used to write the tiles\n",
+ " locallyOtherwise a temp dir will be used.\n",
+ " --dateranges TEXT Comma separated list of date ranges, each provided as yy-\n",
+ " mm-dd/yy-mm-dd.\n",
+ " --help Show this message and exit.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the help of the script to get a sense of the input parameters.\n",
+ "! python scripts/datacube.py --help"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7b95c426-d82a-4973-94c0-c761affe9b9e",
+ "metadata": {},
+ "source": [
+ "By default, the datacube script will download all the data available for each MGRS tile it processes. So the output might include imagery chips that are outside of the AOI specified.\n",
+ "\n",
+ "To speed up processing in the example below, we use the subset argument to reduce each MGRS tile to a small pixel window. When subsetting, the script will only download a fraction of each MGRS tile. This will lead to discontinuous datasets and should not be used in a real use case. Remove the subset argument when using the script for a real world application, where all the data should be downloaded for each MGRS tile."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "6e176358-1c66-47be-8357-47f27e4359a1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0 name 45QTC\n",
+ "Region Puri\n",
+ "geometry POLYGON ((85.1840761005 19.798817712, 85.0503 ...\n",
+ "Name: 0, dtype: object\n",
+ "Starting algorithm for MGRS tile 45QVB with index 0\n",
+ "Processing data for date range 2020-01-01/2020-04-01\n",
+ "Found 29 Sentinel-2 items\n",
+ "EPSG code based on Sentinel-2 item: 32645\n",
+ "Searching S1 in date range 2020-03-27/2020-04-02\n",
+ "Found 1 Sentinel-1 items\n",
+ "Most overlapped orbit: descending\n",
+ "Found 1 DEM items\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "Subsetting to [1500, 1500, 2524, 2524]\n",
+ "Writing tempfiles to data/chips\n",
+ "Too much no-data in vv\n",
+ "Too much no-data in vv\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "Too much no-data in vv\n",
+ "Too much no-data in vv\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "Processing data for date range 2021-06-01/2021-09-15\n",
+ "Found 14 Sentinel-2 items\n",
+ "EPSG code based on Sentinel-2 item: 32645\n",
+ "Searching S1 in date range 2021-08-22/2021-08-28\n",
+ "Found 4 Sentinel-1 items\n",
+ "Most overlapped orbit: descending\n",
+ "Found 2 DEM items\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "Subsetting to [1500, 1500, 2524, 2524]\n",
+ "Writing tempfiles to data/chips\n",
+ "Too much cloud coverage\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "1 name 45QUC\n",
+ "Region Puri\n",
+ "geometry POLYGON ((85.0907496721 19.7978447286, 85.0813...\n",
+ "Name: 1, dtype: object\n",
+ "Starting algorithm for MGRS tile 45QUB with index 1\n",
+ "Processing data for date range 2020-01-01/2020-04-01\n",
+ "Found 14 Sentinel-2 items\n",
+ "EPSG code based on Sentinel-2 item: 32645\n",
+ "Searching S1 in date range 2020-02-09/2020-02-15\n",
+ "Found 1 Sentinel-1 items\n",
+ "Most overlapped orbit: descending\n",
+ "Found 2 DEM items\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "Subsetting to [1500, 1500, 2524, 2524]\n",
+ "Writing tempfiles to data/chips\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "Processing data for date range 2021-06-01/2021-09-15\n",
+ "Found 7 Sentinel-2 items\n",
+ "EPSG code based on Sentinel-2 item: 32645\n",
+ "Searching S1 in date range 2021-08-22/2021-08-28\n",
+ "Found 4 Sentinel-1 items\n",
+ "Most overlapped orbit: descending\n",
+ "Found 2 DEM items\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "Subsetting to [1500, 1500, 2524, 2524]\n",
+ "Writing tempfiles to data/chips\n",
+ "Too much cloud coverage\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "2 name 45QVC\n",
+ "Region Puri\n",
+ "geometry POLYGON ((86.0448445217 19.8054934333, 86.0401...\n",
+ "Name: 2, dtype: object\n",
+ "Starting algorithm for MGRS tile 45QVC with index 2\n",
+ "Processing data for date range 2020-01-01/2020-04-01\n",
+ "Found 27 Sentinel-2 items\n",
+ "EPSG code based on Sentinel-2 item: 32645\n",
+ "Searching S1 in date range 2020-02-11/2020-02-17\n",
+ "Found 1 Sentinel-1 items\n",
+ "Most overlapped orbit: descending\n",
+ "Found 3 DEM items\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "Subsetting to [1500, 1500, 2524, 2524]\n",
+ "Writing tempfiles to data/chips\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "Processing data for date range 2021-06-01/2021-09-15\n",
+ "Found 13 Sentinel-2 items\n",
+ "EPSG code based on Sentinel-2 item: 32645\n",
+ "Searching S1 in date range 2021-06-03/2021-06-09\n",
+ "Found 2 Sentinel-1 items\n",
+ "Most overlapped orbit: descending\n",
+ "Found 4 DEM items\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "Subsetting to [1500, 1500, 2524, 2524]\n",
+ "Writing tempfiles to data/chips\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "Too much cloud coverage\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "3 name 45QVB\n",
+ "Region Puri\n",
+ "geometry POLYGON ((86.1042 19.894006473585993, 86.1042 ...\n",
+ "Name: 3, dtype: object\n",
+ "Starting algorithm for MGRS tile 45QUC with index 3\n",
+ "Processing data for date range 2020-01-01/2020-04-01\n",
+ "Found 16 Sentinel-2 items\n",
+ "EPSG code based on Sentinel-2 item: 32645\n",
+ "Searching S1 in date range 2020-02-09/2020-02-15\n",
+ "Found 2 Sentinel-1 items\n",
+ "Most overlapped orbit: descending\n",
+ "Found 4 DEM items\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "Subsetting to [1500, 1500, 2524, 2524]\n",
+ "Writing tempfiles to data/chips\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "Processing data for date range 2021-06-01/2021-09-15\n",
+ "Found 7 Sentinel-2 items\n",
+ "EPSG code based on Sentinel-2 item: 32645\n",
+ "Searching S1 in date range 2021-06-03/2021-06-09\n",
+ "Found 2 Sentinel-1 items\n",
+ "Most overlapped orbit: descending\n",
+ "Found 4 DEM items\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "Subsetting to [1500, 1500, 2524, 2524]\n",
+ "Writing tempfiles to data/chips\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "Too much cloud coverage\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "4 name 45QTB\n",
+ "Region Puri\n",
+ "geometry POLYGON ((85.183071142 19.8871534399, 85.18741...\n",
+ "Name: 4, dtype: object\n",
+ "Starting algorithm for MGRS tile 45QTC with index 4\n",
+ "Processing data for date range 2020-01-01/2020-04-01\n",
+ "Found 30 Sentinel-2 items\n",
+ "EPSG code based on Sentinel-2 item: 32645\n",
+ "Searching S1 in date range 2020-01-10/2020-01-16\n",
+ "Found 2 Sentinel-1 items\n",
+ "Most overlapped orbit: descending\n",
+ "Found 4 DEM items\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "Subsetting to [1500, 1500, 2524, 2524]\n",
+ "Writing tempfiles to data/chips\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "Processing data for date range 2021-06-01/2021-09-15\n",
+ "Found 12 Sentinel-2 items\n",
+ "EPSG code based on Sentinel-2 item: 32645\n",
+ "Searching S1 in date range 2021-06-03/2021-06-09\n",
+ "Found 2 Sentinel-1 items\n",
+ "Most overlapped orbit: descending\n",
+ "Found 4 DEM items\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "Subsetting to [1500, 1500, 2524, 2524]\n",
+ "Writing tempfiles to data/chips\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "Too much cloud coverage\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "5 name 45QUB\n",
+ "Region Puri\n",
+ "geometry POLYGON ((86.1042 19.894075652432416, 86.1042 ...\n",
+ "Name: 5, dtype: object\n",
+ "Starting algorithm for MGRS tile 45QTB with index 5\n",
+ "Processing data for date range 2020-01-01/2020-04-01\n",
+ "Found 29 Sentinel-2 items\n",
+ "EPSG code based on Sentinel-2 item: 32645\n",
+ "Searching S1 in date range 2020-02-09/2020-02-15\n",
+ "Found 1 Sentinel-1 items\n",
+ "Most overlapped orbit: descending\n",
+ "Found 2 DEM items\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "Subsetting to [1500, 1500, 2524, 2524]\n",
+ "Writing tempfiles to data/chips\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "Processing data for date range 2021-06-01/2021-09-15\n",
+ "Found 12 Sentinel-2 items\n",
+ "EPSG code based on Sentinel-2 item: 32645\n",
+ "Searching S1 in date range 2021-06-03/2021-06-09\n",
+ "Found 3 Sentinel-1 items\n",
+ "Most overlapped orbit: descending\n",
+ "Found 3 DEM items\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "/home/tam/apps/miniforge3/envs/claymodel/lib/python3.11/site-packages/stackstac/prepare.py:408: UserWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.\n",
+ " times = pd.to_datetime(\n",
+ "Subsetting to [1500, 1500, 2524, 2524]\n",
+ "Writing tempfiles to data/chips\n",
+ "Too much cloud coverage\n",
+ "Too much cloud coverage\n",
+ "No bucket specified, skipping S3 sync.\n",
+ "Too much cloud coverage\n",
+ "Too much cloud coverage\n",
+ "No bucket specified, skipping S3 sync.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# datacube.py picks the tile by row position in the --sample file (tiles.iloc[index]), so loop over positions.\n",
+ "for index in range(len(mgrs_aoi)):\n",
+ "    ! python scripts/datacube.py --sample data/mgrs/mgrs_aoi.fgb --subset 1500,1500,2524,2524 --localpath data/chips --index {index} --dateranges 2020-01-01/2020-04-01,2021-06-01/2021-09-15"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1f640c20-87b3-445b-9de8-e91670562df1",
+ "metadata": {},
+ "source": [
+ "## Create the embeddings for each training chip\n",
+ "\n",
+ "The checkpoints can be accessed directly from huggingface at https://huggingface.co/made-with-clay/Clay.\n",
+ "\n",
+ "The following command will automatically download and cache the model weights and run the model to create the embeddings."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "91f194ad-781c-4fa3-bdd8-6d1af29b23b5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "W&B disabled.\n",
+ "Seed set to 42\n",
+ "Using 16bit Automatic Mixed Precision (AMP)\n",
+ "GPU available: True (cuda), used: True\n",
+ "TPU available: False, using: 0 TPU cores\n",
+ "IPU available: False, using: 0 IPUs\n",
+ "HPU available: False, using: 0 HPUs\n",
+ "[rank: 0] Seed set to 42\n",
+ "Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n",
+ "----------------------------------------------------------------------------------------------------\n",
+ "distributed_backend=nccl\n",
+ "All distributed processes registered. Starting with 1 processes\n",
+ "----------------------------------------------------------------------------------------------------\n",
+ "\n",
+ "Total number of chips: 28\n",
+ "Restoring states from the checkpoint path at https://huggingface.co/made-with-clay/Clay/resolve/main/Clay_v0.1_epoch-24_val-loss-0.46.ckpt\n",
+ "Downloading: \"https://huggingface.co/made-with-clay/Clay/resolve/main/Clay_v0.1_epoch-24_val-loss-0.46.ckpt\" to /home/tam/.cache/torch/hub/checkpoints/Clay_v0.1_epoch-24_val-loss-0.46.ckpt\n",
+ "100%|████████████████████████████████████████| 487M/487M [00:18<00:00, 27.8MB/s]\n",
+ "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n",
+ "Loaded model weights from the checkpoint at https://huggingface.co/made-with-clay/Clay/resolve/main/Clay_v0.1_epoch-24_val-loss-0.46.ckpt\n",
+ "Predicting DataLoader 0: 100%|██████████████████| 14/14 [00:02<00:00, 5.97it/s]Saved 7 rows of embeddings of shape (768,) to /home/tam/Documents/repos/model/data/embeddings/45QUB_20200328_20210825_v001.gpq\n",
+ "Saved 8 rows of embeddings of shape (768,) to /home/tam/Documents/repos/model/data/embeddings/45QTB_20200328_20210815_v001.gpq\n",
+ "Saved 8 rows of embeddings of shape (768,) to /home/tam/Documents/repos/model/data/embeddings/44QRG_20200328_20210815_v001.gpq\n",
+ "Saved 1 rows of embeddings of shape (768,) to /home/tam/Documents/repos/model/data/embeddings/45QVC_20210822_20210822_v001.gpq\n",
+ "Saved 2 rows of embeddings of shape (768,) to /home/tam/Documents/repos/model/data/embeddings/44QRH_20200328_20200328_v001.gpq\n",
+ "Saved 2 rows of embeddings of shape (768,) to /home/tam/Documents/repos/model/data/embeddings/45QTC_20200328_20200328_v001.gpq\n",
+ "Predicting DataLoader 0: 100%|██████████████████| 14/14 [00:02<00:00, 5.84it/s]\n",
+ "Done!\n"
+ ]
+ }
+ ],
+ "source": [
+ "! wandb disabled\n",
+ "! python trainer.py predict --ckpt_path=https://huggingface.co/made-with-clay/Clay/resolve/main/Clay_v0.1_epoch-24_val-loss-0.46.ckpt \\\n",
+ "    --trainer.precision=16-mixed \\\n",
+ "    --data.data_dir=data/chips \\\n",
+ "    --data.batch_size=2 \\\n",
+ "    --data.num_workers=8"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "claymodel",
+ "language": "python",
+ "name": "claymodel"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/pyproject.toml b/ruff.toml
similarity index 79%
rename from pyproject.toml
rename to ruff.toml
index c2b3a2b7..82f49e2d 100644
--- a/pyproject.toml
+++ b/ruff.toml
@@ -1,8 +1,11 @@
-[tool.ruff.format]
+[lint.per-file-ignores]
+"docs/clay_over_aoi.ipynb" = ["E501"]  # notebook cells contain long shell command lines
+
+[format]
# https://docs.astral.sh/ruff/settings/#format
line-ending = "lf" # Use UNIX `\n` line endings for all files
-[tool.ruff.lint]
+[lint]
# https://docs.astral.sh/ruff/rules/
select = [
"E", # pycodestyle errors
@@ -14,5 +17,5 @@ select = [
"W", # pycodestyle warnings
]
-[tool.ruff.lint.pylint]
+[lint.pylint]
max-args = 6
diff --git a/scripts/datacube.py b/scripts/datacube.py
index 7345cc27..ad4d470f 100755
--- a/scripts/datacube.py
+++ b/scripts/datacube.py
@@ -316,20 +316,20 @@ def make_datasets(s2_items, s1_items, dem_items, resolution):
def process(
aoi,
- year,
+ date_range,
resolution,
cloud_cover_percentage,
nodata_pixel_percentage,
):
"""
Process Sentinel-2, Sentinel-1, and Copernicus DEM data for a specified
- year, area of interest (AOI), resolution, EPSG code, cloud cover
+ date_range, area of interest (AOI), resolution, EPSG code, cloud cover
percentage, and nodata pixel percentage.
Parameters:
- aoi (shapely.geometry.base.BaseGeometry): Geometry object for an Area of
Interest (AOI).
- - year (int): The year for finding imagery.
+ - date_range (str): Date range string to pass to the catalog search.
- resolution (int): Spatial resolution.
- cloud_cover_percentage (int): Maximum acceptable cloud cover percentage
for Sentinel-2 images.
@@ -339,7 +339,6 @@ def process(
Returns:
- xr.Dataset: Merged xarray Dataset containing processed data.
"""
- date_range = f"{year}-01-01/{year}-12-31"
catalog = pystac_client.Client.open(STAC_API, modifier=pc.sign_inplace)
for i in range(S1_MATCH_ATTEMPTS):
@@ -364,8 +363,8 @@ def process(
if i == S1_MATCH_ATTEMPTS - 1:
print(
- "No match for S1 scenes found for year "
- f"{year} after {S1_MATCH_ATTEMPTS} attempts."
+ "No match for S1 scenes found for date range "
+ f"{date_range} after {S1_MATCH_ATTEMPTS} attempts."
)
return None, None
@@ -423,10 +422,26 @@ def convert_attrs_and_coords_objects_to_str(data):
@click.option(
"--subset",
required=False,
+ default="",
+    help="For debugging, subset the MGRS tile data to this pixel window. "
+    "Expects a comma separated string of 4 integers.",
+ type=str,
+)
+@click.option(
+ "--localpath",
+ required=False,
default=None,
- help="For debugging, subset x and y to this pixel window.",
+    help="If specified, this path will be used to write the tiles locally. "
+    "Otherwise a temp dir will be used.",
+)
+@click.option(
+ "--dateranges",
+ required=False,
+ default="",
+ type=str,
+ help="Comma separated list of date ranges, each provided as YYYY-MM-DD/YYYY-MM-DD.",
)
-def main(sample, index, subset, bucket):
+def main(sample, index, subset, bucket, localpath, dateranges):
index = int(index)
tiles = gpd.read_file(sample)
tile = tiles.iloc[index]
@@ -434,17 +449,27 @@ def main(sample, index, subset, bucket):
print(f"Starting algorithm for MGRS tile {tile['name']} with index {index}")
- # Shuffle years, use index as seed for reproducibility but no
- # to have the same shuffle every time.
- years = [2017, 2018, 2019, 2020, 2021, 2022, 2023]
- random.seed(index)
- random.shuffle(years)
+ if subset:
+ subset = [int(dat) for dat in subset.split(",")]
+
+ if dateranges:
+ date_ranges = dateranges.split(",")
+ else:
+        # Shuffle years. The tile index is used as the seed so the order is
+        # reproducible per tile without being identical for every tile.
+ date_ranges = [
+ f"{year}-01-01/{year}-12-31"
+ for year in (2017, 2018, 2019, 2020, 2021, 2022, 2023)
+ ]
+ random.seed(index)
+ random.shuffle(date_ranges)
+
match_count = 0
- for year in years:
- print(f"Processing data for year {year}")
+ for date_range in date_ranges:
+ print(f"Processing data for date range {date_range}")
date, pixels = process(
tile.geometry,
- year,
+ date_range,
SPATIAL_RESOLUTION,
CLOUD_COVER_PERCENTAGE,
NODATA_PIXEL_PERCENTAGE,
@@ -455,7 +480,6 @@ def main(sample, index, subset, bucket):
match_count += 1
if subset:
- subset = [int(dat) for dat in subset.split(",")]
print(f"Subsetting to {subset}")
pixels = [
part[:, subset[1] : subset[3], subset[0] : subset[2]] for part in pixels
@@ -463,7 +487,7 @@ def main(sample, index, subset, bucket):
pixels = [part.compute() for part in pixels]
- tiler(pixels, date, mgrs, bucket)
+ tiler(pixels, date, mgrs, bucket, localpath)
if match_count == DATES_PER_LOCATION:
break
diff --git a/scripts/tile.py b/scripts/tile.py
index e7e12a4b..41a9056f 100644
--- a/scripts/tile.py
+++ b/scripts/tile.py
@@ -55,7 +55,7 @@ def filter_clouds_nodata(tile):
return True # If both conditions pass
-def tiler(stack, date, mgrs, bucket):
+def tile_to_dir(stack, date, mgrs, bucket, dir):
"""
Function to tile a multi-dimensional imagery stack while filtering out
tiles with high cloud coverage or no-data pixels.
@@ -66,67 +66,77 @@ def tiler(stack, date, mgrs, bucket):
- mgrs (str): MGRS Tile id
- bucket(str): AWS S3 bucket to write tiles to
"""
+ print("Writing tempfiles to ", dir)
+
# Calculate the number of full tiles in x and y directions
num_x_tiles = stack[0].x.size // TILE_SIZE
num_y_tiles = stack[0].y.size // TILE_SIZE
counter = 0
- with tempfile.TemporaryDirectory() as dir:
- print("Writing tempfiles to ", dir)
- # Iterate through each chunk of x and y dimensions and create tiles
- for y_idx in range(num_y_tiles):
- for x_idx in range(num_x_tiles):
- # Calculate the start and end indices for x and y dimensions
- # for the current tile
- x_start = x_idx * TILE_SIZE
- y_start = y_idx * TILE_SIZE
- x_end = x_start + TILE_SIZE
- y_end = y_start + TILE_SIZE
-
- # Select the subset of data for the current tile
- parts = [part[:, y_start:y_end, x_start:x_end] for part in stack]
-
- # Only concat here to save memory, it converts S2 data to float
- tile = xr.concat(parts, dim="band").rename("tile")
-
- counter += 1
- if counter % 100 == 0:
- print(f"Counted {counter} tiles")
-
- if not filter_clouds_nodata(tile):
- continue
-
- tile = tile.drop_sel(band="SCL")
-
- # Track band names and color interpretation
- tile.attrs["long_name"] = [str(x.values) for x in tile.band]
- color = [ColorInterp.blue, ColorInterp.green, ColorInterp.red] + [
- ColorInterp.gray
- ] * (len(tile.band) - 3)
-
- # Write tile to tempdir
- name = "{dir}/claytile_{mgrs}_{date}_v{version}_{counter}.tif".format(
- dir=dir,
- mgrs=mgrs,
- date=date.replace("-", ""),
- version=VERSION,
- counter=str(counter).zfill(4),
- )
- tile.rio.to_raster(name, compress="deflate")
-
- with rasterio.open(name, "r+") as rst:
- rst.colorinterp = color
- rst.update_tags(date=date)
-
- print(f"Syncing {dir} with s3://{bucket}/{VERSION}/{mgrs}/{date}")
- subprocess.run(
- [
- "aws",
- "s3",
- "sync",
- dir,
- f"s3://{bucket}/{VERSION}/{mgrs}/{date}",
- "--no-progress",
- ],
- check=True,
- )
+ # Iterate through each chunk of x and y dimensions and create tiles
+ for y_idx in range(num_y_tiles):
+ for x_idx in range(num_x_tiles):
+ # Calculate the start and end indices for x and y dimensions
+ # for the current tile
+ x_start = x_idx * TILE_SIZE
+ y_start = y_idx * TILE_SIZE
+ x_end = x_start + TILE_SIZE
+ y_end = y_start + TILE_SIZE
+
+ # Select the subset of data for the current tile
+ parts = [part[:, y_start:y_end, x_start:x_end] for part in stack]
+
+ # Only concat here to save memory, it converts S2 data to float
+ tile = xr.concat(parts, dim="band").rename("tile")
+
+ counter += 1
+ if counter % 100 == 0:
+ print(f"Counted {counter} tiles")
+
+ if not filter_clouds_nodata(tile):
+ continue
+
+ tile = tile.drop_sel(band="SCL")
+
+ # Track band names and color interpretation
+ tile.attrs["long_name"] = [str(x.values) for x in tile.band]
+ color = [ColorInterp.blue, ColorInterp.green, ColorInterp.red] + [
+ ColorInterp.gray
+ ] * (len(tile.band) - 3)
+
+ # Write tile to tempdir
+ name = "{dir}/claytile_{mgrs}_{date}_v{version}_{counter}.tif".format(
+ dir=dir,
+ mgrs=mgrs,
+ date=date.replace("-", ""),
+ version=VERSION,
+ counter=str(counter).zfill(4),
+ )
+ tile.rio.to_raster(name, compress="deflate")
+
+ with rasterio.open(name, "r+") as rst:
+ rst.colorinterp = color
+ rst.update_tags(date=date)
+ if bucket:
+ print(f"Syncing {dir} with s3://{bucket}/{VERSION}/{mgrs}/{date}")
+ subprocess.run(
+ [
+ "aws",
+ "s3",
+ "sync",
+ dir,
+ f"s3://{bucket}/{VERSION}/{mgrs}/{date}",
+ "--no-progress",
+ ],
+ check=True,
+ )
+ else:
+ print("No bucket specified, skipping S3 sync.")
+
+
+def tiler(stack, date, mgrs, bucket, dir):
+ if dir:
+ tile_to_dir(stack, date, mgrs, bucket, dir)
+ else:
+ with tempfile.TemporaryDirectory() as tmpdir:
+ tile_to_dir(stack, date, mgrs, bucket, tmpdir)