From 399d421311e6e8629a403ee4e193c7f2991ba875 Mon Sep 17 00:00:00 2001 From: andrewmoorman Date: Wed, 11 Sep 2024 21:46:47 -0400 Subject: [PATCH 001/156] Cleanup --- docs/notebooks/segger_tutorial.ipynb | 1587 +------------------------- environment-rapids.yml | 40 - src/segger/data/io.py | 14 - 3 files changed, 6 insertions(+), 1635 deletions(-) delete mode 100644 environment-rapids.yml diff --git a/docs/notebooks/segger_tutorial.ipynb b/docs/notebooks/segger_tutorial.ipynb index bbe8f33..96ccb70 100644 --- a/docs/notebooks/segger_tutorial.ipynb +++ b/docs/notebooks/segger_tutorial.ipynb @@ -27,25 +27,6 @@ "This tutorial will guide you through each step of the process, ensuring you can train and apply Segger for your own data." ] }, - { - "cell_type": "code", - "execution_count": 2, - "id": "0d7448cb-5bb9-4100-b3c3-ed4edfe43fcd", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-12T00:48:58.720916Z", - "iopub.status.busy": "2024-09-12T00:48:58.720207Z", - "iopub.status.idle": "2024-09-12T00:49:00.931992Z", - "shell.execute_reply": "2024-09-12T00:49:00.931572Z", - "shell.execute_reply.started": "2024-09-12T00:48:58.720895Z" - } - }, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, { "cell_type": "code", "execution_count": 1, @@ -73,45 +54,6 @@ "import seaborn as sns" ] }, - { - "cell_type": "code", - "execution_count": 3, - "id": "53724a12-4d87-43f9-937e-9b3754e20016", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-12T00:49:00.932740Z", - "iopub.status.busy": "2024-09-12T00:49:00.932580Z", - "iopub.status.idle": "2024-09-12T00:49:02.219477Z", - "shell.execute_reply": "2024-09-12T00:49:02.219072Z", - "shell.execute_reply.started": "2024-09-12T00:49:00.932725Z" - } - }, - "outputs": [], - "source": [ - "from pytorch_lightning.plugins.environments import SLURMEnvironment\n", - "SLURMEnvironment.detect = lambda: False" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "2d91db91-a71b-4832-8861-7c0240347654", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-12T00:49:02.220725Z", - "iopub.status.busy": "2024-09-12T00:49:02.220525Z", - "iopub.status.idle": "2024-09-12T00:49:03.130996Z", - "shell.execute_reply": "2024-09-12T00:49:03.130618Z", - "shell.execute_reply.started": "2024-09-12T00:49:02.220709Z" - } - }, - "outputs": [], - "source": [ - "# Plotting styles used throughout\n", - "stylesheet = Path('../../dev/tutorial/assets/default.mplstyle')\n", - "plt.style.use(stylesheet)" - ] - }, { "cell_type": "markdown", "id": "db009015-c379-4f50-97ed-81dca9df28ac", @@ -158,8 +100,8 @@ "outputs": [], "source": [ "# Paths to Xenium sample data and where to store Segger data\n", - "xenium_data_dir = Path('../../dev/xenium_data/jose_run_1_region_1_small')\n", - "segger_data_dir = Path('../../dev/tutorial/segger_data/')\n", + "xenium_data_dir = Path('path/to/tutorial/xenium_data')\n", + "segger_data_dir = Path('path/to/tutorial/segger_data/')\n", "\n", "# Setup Xenium sample to create dataset\n", "xs = XeniumSample(verbose=False)\n", @@ -322,7 +264,7 @@ ], "source": [ "# Base directory to store Pytorch Lightning models\n", - "models_dir = Path('../../dev/tutorial/models/')\n", + "models_dir = Path('path/to/tutorial/models/')\n", "\n", "# Initialize the Lightning model\n", "metadata = ([\"tx\", \"bd\"], [(\"tx\", \"belongs\", \"bd\"), (\"tx\", \"neighbors\", \"tx\")])\n", @@ -624,7 +566,7 @@ }, "outputs": [], "source": [ - "tuning_dir = Path('../../dev/tutorial/tuning/')\n", + "tuning_dir = 
Path('path/to/tutorial/tuning/')\n", "sampling_rate = 0.125" ] }, @@ -737,7 +679,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": null, "id": "ba2dcc9a-3a06-4b84-a487-59a768eed5d5", "metadata": { "execution": { @@ -749,1524 +691,7 @@ }, "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using 16bit Automatic Mixed Precision (AMP)\n", - "GPU available: True (cuda), used: True\n", - "TPU available: False, using: 0 TPU cores\n", - "HPU available: False, using: 0 HPUs\n", - "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [MIG-9b1ee058-0b73-52d6-b909-b056af809b4b,MIG-e8861a3f-0f14-562d-ad70-c9f56ba1db06,MIG-e9c80ed6-e922-5f99-84ba-fd31a06f7f82,MIG-ddac4c2a-2ccf-5d86-be01-c9e27dd3cd76]\n", - "\n", - " | Name | Type | Params | Mode \n", - "--------------------------------------------------------\n", - "0 | model | GraphModule | 9.7 K | train\n", - "1 | criterion | BCEWithLogitsLoss | 0 | train\n", - "--------------------------------------------------------\n", - "9.7 K Trainable params\n", - "0 Non-trainable params\n", - "9.7 K Total params\n", - "0.039 Total estimated model params size (MB)\n", - "45 Modules in train mode\n", - "0 Modules in eval mode\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Sanity Checking: | | 0/? [00:00 79\u001b[0m segmentation \u001b[38;5;241m=\u001b[39m \u001b[43mtrainable\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 80\u001b[0m trial \u001b[38;5;241m=\u001b[39m evaluate(segmentation, predict_kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mscore_cut\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m 81\u001b[0m trial \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mconcat([pd\u001b[38;5;241m.\u001b[39mSeries(config), trial])\n", - "Cell \u001b[0;32mIn[78], line 32\u001b[0m, in \u001b[0;36mtrainable\u001b[0;34m(config)\u001b[0m\n\u001b[1;32m 25\u001b[0m trainer \u001b[38;5;241m=\u001b[39m Trainer( \n\u001b[1;32m 26\u001b[0m default_root_dir\u001b[38;5;241m=\u001b[39mconfig[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodel_dir\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m 27\u001b[0m logger\u001b[38;5;241m=\u001b[39mCSVLogger(config[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodel_dir\u001b[39m\u001b[38;5;124m'\u001b[39m]),\n\u001b[1;32m 28\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mtrainer_kwargs,\n\u001b[1;32m 29\u001b[0m )\n\u001b[1;32m 30\u001b[0m trainer\u001b[38;5;241m.\u001b[39mfit(model\u001b[38;5;241m=\u001b[39mls, datamodule\u001b[38;5;241m=\u001b[39mdm)\n\u001b[0;32m---> 32\u001b[0m segmentation \u001b[38;5;241m=\u001b[39m \u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 33\u001b[0m \u001b[43m \u001b[49m\u001b[43mload_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmodel_dir\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m/\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mlightning_logs/version_0/checkpoints\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 34\u001b[0m \u001b[43m \u001b[49m\u001b[43mdm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_dataloader\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mreceptive_field\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreceptive_field\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 36\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mpredict_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 39\u001b[0m metrics \u001b[38;5;241m=\u001b[39m evaluate(segmentation)\n", - "File \u001b[0;32m/lilac/data/peer/moormana/GitHub/dpeerlab/segger_dev/src/segger/prediction/predict.py:259\u001b[0m, in \u001b[0;36mpredict\u001b[0;34m(lit_segger, data_loader, score_cut, receptive_field, use_cc)\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[38;5;66;03m# Assign transcripts from each batch to nuclei\u001b[39;00m\n\u001b[1;32m 257\u001b[0m \u001b[38;5;66;03m# TODO: parallelize this step\u001b[39;00m\n\u001b[1;32m 258\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m batch \u001b[38;5;129;01min\u001b[39;00m tqdm(data_loader):\n\u001b[0;32m--> 259\u001b[0m batch_assignments \u001b[38;5;241m=\u001b[39m \u001b[43mpredict_batch\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 260\u001b[0m \u001b[43m \u001b[49m\u001b[43mlit_segger\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscore_cut\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreceptive_field\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muse_cc\u001b[49m\n\u001b[1;32m 261\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 262\u001b[0m assignments\u001b[38;5;241m.\u001b[39mappend(batch_assignments)\n\u001b[1;32m 264\u001b[0m \u001b[38;5;66;03m# Join across batches and handle duplicates between batches\u001b[39;00m\n", - "File \u001b[0;32m/lilac/data/peer/moormana/GitHub/dpeerlab/segger_dev/src/segger/prediction/predict.py:209\u001b[0m, in \u001b[0;36mpredict_batch\u001b[0;34m(lit_segger, batch, score_cut, receptive_field, use_cc)\u001b[0m\n\u001b[1;32m 204\u001b[0m edge_index \u001b[38;5;241m=\u001b[39m batch[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtx\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mneighbors\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtx\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39medge_index\n\u001b[1;32m 205\u001b[0m batch[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtx\u001b[39m\u001b[38;5;124m'\u001b[39m][\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtx_field\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m coo_to_dense_adj(\n\u001b[1;32m 206\u001b[0m edge_index,\n\u001b[1;32m 207\u001b[0m num_nodes\u001b[38;5;241m=\u001b[39mbatch[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtx\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mid\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m],\n\u001b[1;32m 208\u001b[0m )\n\u001b[0;32m--> 209\u001b[0m scores \u001b[38;5;241m=\u001b[39m \u001b[43mget_similarity_scores\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlit_segger\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtx\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtx\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 210\u001b[0m 
scores \u001b[38;5;241m=\u001b[39m scores\u001b[38;5;241m.\u001b[39mfill_diagonal_(\u001b[38;5;241m0\u001b[39m) \u001b[38;5;66;03m# ignore self-similarity\u001b[39;00m\n\u001b[1;32m 212\u001b[0m \u001b[38;5;66;03m# 2. Assign remainder using connected components\u001b[39;00m\n", - "File \u001b[0;32m/lilac/data/peer/moormana/GitHub/dpeerlab/segger_dev/src/segger/prediction/predict.py:135\u001b[0m, in \u001b[0;36mget_similarity_scores\u001b[0;34m(model, batch, from_type, to_type)\u001b[0m\n\u001b[1;32m 132\u001b[0m sparse_sim \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39msparse_coo_tensor(indices, values, shape)\n\u001b[1;32m 134\u001b[0m \u001b[38;5;66;03m# Return in dense format for backwards compatibility\u001b[39;00m\n\u001b[0;32m--> 135\u001b[0m scores \u001b[38;5;241m=\u001b[39m \u001b[43msparse_sim\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_dense\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mcpu()\n\u001b[1;32m 137\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m scores\n", - "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 24.97 GiB. GPU 0 has a total capacty of 19.50 GiB of which 19.15 GiB is free. Including non-PyTorch memory, this process has 324.00 MiB memory in use. Of the allocated memory 65.54 MiB is allocated by PyTorch, and 84.46 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF" - ] - } - ], + "outputs": [], "source": [ "param_space = {\n", " \"k_bd\": [3, 5, 10],\n", diff --git a/environment-rapids.yml b/environment-rapids.yml deleted file mode 100644 index 14d503c..0000000 --- a/environment-rapids.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: segger-env -channels: - - conda-forge - - defaults - - pyg - - nvidia - - rapidsai - - pytorch -dependencies: - - python=3.10 - - pytorch>=2.0.0 - - numpy>=1.21.0 - - pandas>=1.3.0 - - scipy>=1.7.0 - - matplotlib>=3.4.0 - - seaborn>=0.11.0 - - tqdm>=4.61.0 - - torchvision>=0.10.0 - - lightning>=1.9.0 - - torchmetrics>=0.5.0 - - scanpy>=1.9.3 - - squidpy>=1.2.0 - - adjustText>=0.8 - - scikit-learn>=0.24.0 - - geopandas>=0.9.0 - - shapely>=1.7.0 - - path>=17.0.0 - - pyarrow>=17.0.0 - - cudf>=21.08 - - cuml>=21.08 - - cugraph>=21.08 - - cuspatial>=21.08 - - faiss-cpu>=1.7.0 - - faiss-gpu>=1.7.0 - - pip - - pip: - - torch-scatter>=2.1.2 - - torch-sparse>=0.6.18 - - torch-cluster>=1.6.3 - - torch-geometric>=2.2.0 diff --git a/src/segger/data/io.py b/src/segger/data/io.py index b33d5fd..2face2b 100644 --- a/src/segger/data/io.py +++ b/src/segger/data/io.py @@ -89,20 +89,6 @@ def set_file_paths(self, transcripts_path: Path, boundaries_path: Path) -> None: """ self.transcripts_path = transcripts_path self.boundaries_path = boundaries_path - tx_extents = get_xy_extents( - self.transcripts_path, - self.keys.TRANSCRIPTS_X.value, - self.keys.TRANSCRIPTS_Y.value, - ) - bd_extents = get_xy_extents( - self.boundaries_path, - self.keys.BOUNDARIES_VERTEX_X.value, - self.keys.BOUNDARIES_VERTEX_Y.value, - ) - self.x_min = min(tx_extents[0], bd_extents[0]) - self.y_min = min(tx_extents[1], bd_extents[1]) - self.x_max = max(tx_extents[2], bd_extents[2]) - self.y_max = max(tx_extents[3], bd_extents[3]) if self.verbose: print(f"Set transcripts file path to {transcripts_path}") if self.verbose: print(f"Set boundaries file path to {boundaries_path}") From 
3178241a83c6598b770be7c6e4b841e9b45ff508 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 04:32:53 +0200 Subject: [PATCH 002/156] Update static.yml --- .github/workflows/static.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index 4f86f55..23ebb52 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -34,7 +34,6 @@ jobs: # Step 3: Install your package from the repository - name: Install package run: | - export PYTHONPATH='./src' export PYTHONPATH='./src/segger' # Step 4: Install MkDocs and dependencies From cd4f185f6b148706713c423965792759223b3cec Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 04:34:58 +0200 Subject: [PATCH 003/156] Update static.yml --- .github/workflows/static.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index 23ebb52..5bd3844 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -34,7 +34,7 @@ jobs: # Step 3: Install your package from the repository - name: Install package run: | - export PYTHONPATH='./src/segger' + pip install . # Step 4: Install MkDocs and dependencies - name: Install dependencies From d682a790491df8ce4833601eaa096aa152ba3c84 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 04:35:37 +0200 Subject: [PATCH 004/156] Update static.yml --- .github/workflows/static.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index 5bd3844..0eb9d5b 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -29,7 +29,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.x' + python-version: '3.10' # Step 3: Install your package from the repository - name: Install package From 976252d77ad6bd0f102020f08e1e293c015bc4a1 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 04:41:06 +0200 Subject: [PATCH 005/156] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8ca1513..0e6b605 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "geopandas>=0.9.0", "shapely>=1.7.0", "path>=17.0.0", - "pyarrow>=17.0.0" + "pyarrow>=17.0.0", "torch-geometric>=2.2.0" ] From 6e91e9ac5acae3bc0fd4f3393e9543e4caa34e64 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 04:42:38 +0200 Subject: [PATCH 006/156] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0e6b605..033a2db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ dependencies = [ "lightning>=1.9.0", "torchmetrics>=0.5.0", "scanpy>=1.9.3", - "squidpy=1.2.0", + "squidpy==1.2.0", "adjustText>=0.8", "scikit-learn>=0.24.0", "geopandas>=0.9.0", From 9dbaf896c2562426ba1f03ef990cdde0139a96af Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 04:51:59 +0200 Subject: [PATCH 007/156] Update static.yml --- .github/workflows/static.yml | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) 
diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index 0eb9d5b..53badd9 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -2,9 +2,8 @@ name: Deploy MkDocs to GitHub Pages on: push: - branches: ["main"] # Change "main" to your default branch if necessary + branches: ["main"] - # Allows you to run this workflow manually from the Actions tab workflow_dispatch: permissions: @@ -31,27 +30,34 @@ jobs: with: python-version: '3.10' - # Step 3: Install your package from the repository - - name: Install package + # Step 3: Install package and set PYTHONPATH + - name: Install package and documentation dependencies run: | - pip install . - - # Step 4: Install MkDocs and dependencies - - name: Install dependencies + export PYTHONPATH=$PYTHONPATH:$(pwd)/src + # pip install -e . + + # Step 4: Install MkDocs and required plugins + - name: Install MkDocs and plugins run: | pip install mkdocs mkdocs-material mkdocs-autorefs mkdocstrings[python] mkdocs-jupyter pymdown-extensions termynal mkdocs-minify-plugin - # Step 5: Build the MkDocs site + # Step 5: Debug environment to check installed packages + - name: Debug environment + run: | + python -m pip freeze + python -m mkdocs --version + + # Step 6: Build the MkDocs site with verbose output - name: Build MkDocs site - run: mkdocs build --verbose + run: mkdocs build --verbose --strict - # Step 6: Upload the generated site directory as an artifact for GitHub Pages + # Step 7: Upload the generated site directory as an artifact for GitHub Pages - name: Upload artifact uses: actions/upload-pages-artifact@v3 with: - path: './site' # Only upload the generated `site` folder + path: './site' - # Step 7: Deploy to GitHub Pages + # Step 8: Deploy to GitHub Pages - name: Deploy to GitHub Pages id: deployment uses: actions/deploy-pages@v4 From 0c34076aa06c4aa42373ed2b8bfade909baadb43 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 05:04:20 +0200 Subject: [PATCH 008/156] Update static.yml --- .github/workflows/static.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index 53badd9..d44e889 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -46,10 +46,12 @@ jobs: run: | python -m pip freeze python -m mkdocs --version + # Step 6: Build the MkDocs site with verbose output - name: Build MkDocs site - run: mkdocs build --verbose --strict + run: poetry run mkdocs gh-deploy --force + # Step 7: Upload the generated site directory as an artifact for GitHub Pages - name: Upload artifact From f7956387f541d8eb6ecb05bf2f0fbe68f9c0621b Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 05:06:09 +0200 Subject: [PATCH 009/156] Update static.yml --- .github/workflows/static.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index d44e889..ecee006 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -30,6 +30,21 @@ jobs: with: python-version: '3.10' + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v3 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ 
hashFiles('**/poetry.lock') }} + + # Step 3: Install package and set PYTHONPATH - name: Install package and documentation dependencies run: | From 8a8b63a900d77c56e75342c11079af3372b45ccd Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 05:09:04 +0200 Subject: [PATCH 010/156] Update static.yml --- .github/workflows/static.yml | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index ecee006..aa3ab3d 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -30,25 +30,11 @@ jobs: with: python-version: '3.10' - - name: Install Poetry - uses: snok/install-poetry@v1 - with: - virtualenvs-create: true - virtualenvs-in-project: true - installer-parallel: true - - - name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@v3 - with: - path: .venv - key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} - # Step 3: Install package and set PYTHONPATH - name: Install package and documentation dependencies run: | - export PYTHONPATH=$PYTHONPATH:$(pwd)/src + export PYTHONPATH=$PYTHONPATH:$(pwd)/src/segger # pip install -e . # Step 4: Install MkDocs and required plugins @@ -65,7 +51,7 @@ jobs: # Step 6: Build the MkDocs site with verbose output - name: Build MkDocs site - run: poetry run mkdocs gh-deploy --force + run: mkdocs gh-deploy --force # Step 7: Upload the generated site directory as an artifact for GitHub Pages From 6ede5c7c8189244611d59c6c96b78a81d09e50de Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 05:12:04 +0200 Subject: [PATCH 011/156] Update static.yml --- .github/workflows/static.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index aa3ab3d..80bd5eb 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -34,8 +34,8 @@ jobs: # Step 3: Install package and set PYTHONPATH - name: Install package and documentation dependencies run: | - export PYTHONPATH=$PYTHONPATH:$(pwd)/src/segger - # pip install -e . + # export PYTHONPATH=$PYTHONPATH:$(pwd) + pip install -e . 
# Step 4: Install MkDocs and required plugins - name: Install MkDocs and plugins From 65972324836adbb22bffa02d7f722d12f2114d0b Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 05:43:30 +0200 Subject: [PATCH 012/156] Update static.yml --- .github/workflows/static.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index 80bd5eb..b531e88 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -51,7 +51,7 @@ jobs: # Step 6: Build the MkDocs site with verbose output - name: Build MkDocs site - run: mkdocs gh-deploy --force + run: mkdocs gh-deploy # Step 7: Upload the generated site directory as an artifact for GitHub Pages From f9d99dd5185c8455275b043102d9930ab24845c3 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 05:57:53 +0200 Subject: [PATCH 013/156] Update static.yml --- .github/workflows/static.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index b531e88..c5d7297 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -64,3 +64,5 @@ jobs: - name: Deploy to GitHub Pages id: deployment uses: actions/deploy-pages@v4 + publish: + From 0de71ca2ad167def92d727a0990ef48fcd680029 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 06:02:36 +0200 Subject: [PATCH 014/156] Update static.yml --- .github/workflows/static.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index c5d7297..69cb4cc 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -51,7 +51,7 @@ jobs: # Step 6: Build the MkDocs site with verbose output - name: Build MkDocs site - run: mkdocs gh-deploy + run: mkdocs gh-deploy --force # Step 7: Upload the generated site directory as an artifact for GitHub Pages @@ -60,9 +60,4 @@ jobs: with: path: './site' - # Step 8: Deploy to GitHub Pages - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v4 - publish: From ab55d2125fc5f669902284cc3e1379e067e5b9d7 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Thu, 12 Sep 2024 06:17:40 +0200 Subject: [PATCH 015/156] cleaned up rusty files --- docs/notebooks/benchmark_bc.ipynb | 108 ------------------ docs/notebooks/create_dataset.ipynb | 19 ---- docs/notebooks/train_model.ipynb | 164 ---------------------------- scripts/create_dataset.py | 98 ----------------- scripts/create_dataset.sh | 38 ------- scripts/create_dataset_2.py | 66 ----------- scripts/create_tile_speed.py | 140 ------------------------ scripts/create_tile_speed_iter.py | 137 ----------------------- 8 files changed, 770 deletions(-) delete mode 100644 docs/notebooks/benchmark_bc.ipynb delete mode 100644 docs/notebooks/create_dataset.ipynb delete mode 100644 docs/notebooks/train_model.ipynb delete mode 100644 scripts/create_dataset.py delete mode 100644 scripts/create_dataset.sh delete mode 100644 scripts/create_dataset_2.py delete mode 100644 scripts/create_tile_speed.py delete mode 100644 scripts/create_tile_speed_iter.py diff --git a/docs/notebooks/benchmark_bc.ipynb b/docs/notebooks/benchmark_bc.ipynb deleted file mode 100644 index 7da6499..0000000 --- a/docs/notebooks/benchmark_bc.ipynb +++ /dev/null @@ -1,108 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Segmentation Method Benchmarking: General Statistics and Beyond\n", - "\n", - "This notebook provides a step-by-step approach to calculating various metrics for benchmarking segmentation methods in single-cell transcriptomics. We will start with general statistics and proceed to advanced metrics, including F1 purity, neighborhood entropy, MECR, and contamination.\n", - "\n", - "## 0. Setup: Import Required Packages and Define Paths\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import scanpy as sc\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from pathlib import Path\n", - "from itertools import combinations\n", - "from matplotlib.backends.backend_pdf import PdfPages\n", - "\n", - "# Define paths and segmentation methods\n", - "benchmarks_path = Path('/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc')\n", - "output_path = benchmarks_path / 'resutls'\n", - "output_path.mkdir(parents = True, exist_ok=True)\n", - "segmentation_methods = ['segger', 'segger_n0', 'segger_n1', 'Baysor', '10X', '10X-nucleus']\n", - "\n", - "# Load the AnnData objects for each segmentation method\n", - "adata_segger = sc.read(benchmarks_path / 'adata_segger.h5ad')\n", - "adata_baysor = sc.read(benchmarks_path / 'adata_baysor.h5ad')\n", - "adata_10X = sc.read(benchmarks_path / 'adata_10X.h5ad')\n", - "adata_10X_nuc = sc.read(benchmarks_path / 'adata_10X_nuc.h5ad')\n", - "\n", - "cells_n1 = [i for i in adata_segger.obs_names if not i.endswith('-nx')]\n", - "cells_n0 = [i for i in adata_segger.obs_names if i.endswith('-nx')]\n", - "adata_segger_n1 = adata_segger[cells_n1,:]\n", - "adata_segger_n0 = adata_segger[cells_n0,:]\n", - "\n", - "scRNAseq = sc.read_h5ad(Path('data_tidy') / \"BC_atlas_xe.h5ad\")\n", - "genes = scRNAseq.var_names\n", - "\n", - "segmentations = [adata_segger, adata_segger_n0, adata_segger_n1, adata_baysor, adata_10X, adata_10X_nuc]\n", - "segmentations_dict = dict(zip(segmentation_methods, segmentations))\n", - "\n", - "max_area = adata_10X.obs.cell_area.max()\n", - "min_area = adata_10X_nuc.obs.cell_area.min()\n", - "min_transcripts = adata_10X_nuc.obs.transcripts.min()\n", - "\n", - "segmentations = [x[(x.obs.cell_area > min_area) & (x.obs.cell_area < max_area) & (x.obs.transcripts > min_transcripts)] for x in segmentations]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. 
General Statistics\n", - "\n", - "### Number of cells" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "language_info": { - "name": "python" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/notebooks/create_dataset.ipynb b/docs/notebooks/create_dataset.ipynb deleted file mode 100644 index a610084..0000000 --- a/docs/notebooks/create_dataset.ipynb +++ /dev/null @@ -1,19 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "language_info": { - "name": "python" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/notebooks/train_model.ipynb b/docs/notebooks/train_model.ipynb deleted file mode 100644 index 63ebe41..0000000 --- a/docs/notebooks/train_model.ipynb +++ /dev/null @@ -1,164 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "{\n", - " \"cells\": [\n", - " {\n", - " \"cell_type\": \"markdown\",\n", - " \"metadata\": {},\n", - " \"source\": [\n", - " \"# Segger Model Training\\n\",\n", - " \"\\n\",\n", - " \"This notebook demonstrates how to train the Segger model on spatial transcriptomics data.\"\n", - " ]\n", - " },\n", - " {\n", - " \"cell_type\": \"markdown\",\n", - " \"metadata\": {},\n", - " \"source\": [\n", - " \"## Step 1: Setup and Environment\\n\",\n", - " \"\\n\",\n", - " \"First, we set up the environment by importing necessary libraries and ensuring that required directories exist.\"\n", - " ]\n", - " },\n", - " {\n", - " \"cell_type\": \"code\",\n", - " \"execution_count\": null,\n", - " \"metadata\": {},\n", - " \"outputs\": [],\n", - " \"source\": [\n", - " \"import os\\n\",\n", - " \"import sys\\n\",\n", - " \"from pathlib import Path\\n\",\n", - " \"import torch\\n\",\n", - " \"import lightning as L\\n\",\n", - " \"from torch_geometric.loader import DataLoader\\n\",\n", - " \"from segger.data.utils import XeniumDataset\\n\",\n", - " \"from segger.models.segger_model import Segger\\n\",\n", - " \"from segger.training.train import LitSegger\\n\",\n", - " \"\\n\",\n", - " \"# Ensure PyGEOS is not used\\n\",\n", - " \"os.environ[\\\"USE_PYGEOS\\\"] = \\\"0\\\"\\n\",\n", - " \"os.environ[\\\"PYTORCH_USE_CUDA_DSA\\\"] = \\\"1\\\"\\n\",\n", - " \"os.environ[\\\"CUDA_LAUNCH_BLOCKING\\\"] = \\\"1\\\"\\n\",\n", - " \"\\n\",\n", - " \"# Add the src directory to the Python path\\n\",\n", - " \"sys.path.insert(0, os.path.abspath('../../src'))\\n\",\n", - " \"\\n\",\n", - " \"# Define the data directory paths\\n\",\n", - " \"TRAIN_DIR = Path('data_tidy/pyg_datasets/MNG_N173116IA/train_tiles/processed')\\n\",\n", - " \"VAL_DIR = Path('data_tidy/pyg_datasets/MNG_N173116IA/val_tiles/processed')\\n\",\n", - " \"\\n\",\n", - " \"# Data params\\n\",\n", - " \"DATA_CHUNK_SIZE = 20\\n\",\n", - " \"BATCH_SIZE_TRAIN = 4\\n\",\n", - " \"BATCH_SIZE_VAL = 4\\n\",\n", - " \"\\n\",\n", - " \"# Trainer params\\n\",\n", - " \"EPOCHS = 100\\n\",\n", - " \"ACCELERATOR = \\\"cuda\\\"\\n\",\n", - " \"STRATEGY = 'auto'\\n\",\n", - " \"PRECISION = \\\"16-mixed\\\"\\n\",\n", - " \"DEVICES = 4\\n\",\n", - " 
\"DEFAULT_ROOT_DIR = \\\"./models/MNG_big\\\"\\n\",\n", - " \"\\n\",\n", - " \"# Create model directory\\n\",\n", - " \"os.makedirs(DEFAULT_ROOT_DIR, exist_ok=True)\"\n", - " ]\n", - " },\n", - " {\n", - " \"cell_type\": \"markdown\",\n", - " \"metadata\": {},\n", - " \"source\": [\n", - " \"## Step 2: Load and Process Data\\n\",\n", - " \"\\n\",\n", - " \"Load the datasets for training and validation.\"\n", - " ]\n", - " },\n", - " {\n", - " \"cell_type\": \"code\",\n", - " \"execution_count\": null,\n", - " \"metadata\": {},\n", - " \"outputs\": [],\n", - " \"source\": [\n", - " \"# Load datasets\\n\",\n", - " \"xe_train_ds = XeniumDataset(root=TRAIN_DIR)\\n\",\n", - " \"xe_val_ds = XeniumDataset(root=VAL_DIR)\"\n", - " ]\n", - " },\n", - " {\n", - " \"cell_type\": \"markdown\",\n", - " \"metadata\": {},\n", - " \"source\": [\n", - " \"## Step 3: Initialize and Train the Model\\n\",\n", - " \"\\n\",\n", - " \"Initialize the Segger model and the Lightning trainer, then train the model.\"\n", - " ]\n", - " },\n", - " {\n", - " \"cell_type\": \"code\",\n", - " \"execution_count\": null,\n", - " \"metadata\": {},\n", - " \"outputs\": [],\n", - " \"source\": [\n", - " \"# Initialize model and trainer\\n\",\n", - " \"model = Segger(init_emb=8, hidden_channels=64, out_channels=16, heads=4)\\n\",\n", - " \"model = to_hetero(model, (['tx', 'nc'], [('tx', 'belongs', 'nc'), ('tx', 'neighbors', 'tx')]), aggr='sum')\\n\",\n", - " \"\\n\",\n", - " \"litsegger = LitSegger(model)\\n\",\n", - " \"trainer = L.Trainer(\\n\",\n", - " \" accelerator=ACCELERATOR,\\n\",\n", - " \" strategy=STRATEGY,\\n\",\n", - " \" precision=PRECISION,\\n\",\n", - " \" devices=DEVICES,\\n\",\n", - " \" max_epochs=EPOCHS,\\n\",\n", - " \" default_root_dir=DEFAULT_ROOT_DIR,\\n\",\n", - " \" # callbacks=[EarlyStopping(monitor=\\\"train_loss\\\", mode=\\\"min\\\")]\\n\",\n", - " \")\\n\",\n", - " \"\\n\",\n", - " \"# Train model\\n\",\n", - " \"train_loader = DataLoader(xe_train_ds, batch_size=BATCH_SIZE_TRAIN, num_workers=0, pin_memory=True, shuffle=True)\\n\",\n", - " \"val_loader = DataLoader(xe_val_ds, batch_size=BATCH_SIZE_VAL, num_workers=0, pin_memory=True, shuffle=True)\\n\",\n", - " \"trainer.fit(litsegger, train_loader, val_loader)\"\n", - " ]\n", - " }\n", - " ],\n", - " \"metadata\": {\n", - " \"kernelspec\": {\n", - " \"display_name\": \"Python 3\",\n", - " \"language\": \"python\",\n", - " \"name\": \"python3\"\n", - " },\n", - " \"language_info\": {\n", - " \"codemirror_mode\": {\n", - " \"name\": \"ipython\",\n", - " \"version\": 3\n", - " },\n", - " \"file_extension\": \".py\",\n", - " \"mimetype\": \"text/x-python\",\n", - " \"name\": \"python\",\n", - " \"nbconvert_exporter\": \"python\",\n", - " \"pygments_lexer\": \"ipython3\",\n", - " \"version\": \"3.8.5\"\n", - " }\n", - " },\n", - " \"nbformat\": 4,\n", - " \"nbformat_minor\": 5\n", - "}\n" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/scripts/create_dataset.py b/scripts/create_dataset.py deleted file mode 100644 index 0232251..0000000 --- a/scripts/create_dataset.py +++ /dev/null @@ -1,98 +0,0 @@ -import argparse -import os -from pathlib import Path -from urllib import request -from segger.data.utils import XeniumSample - -def download_file(url, dest): - if not dest.exists(): - print(f"Downloading {url} to {dest}...") - request.urlretrieve(url, dest) - print("Download completed.") - -def main(args): - os.environ["USE_PYGEOS"] = "0" - - 
raw_data_dir = Path(args.raw_data_dir) - processed_data_dir = Path(args.processed_data_dir) - - raw_data_dir.mkdir(parents=True, exist_ok=True) - processed_data_dir.mkdir(parents=True, exist_ok=True) - - transcripts_url = args.transcripts_url - nuclei_url = args.nuclei_url - - transcripts_path = raw_data_dir / "transcripts.csv.gz" - nuclei_path = raw_data_dir / "nucleus_boundaries.csv.gz" - - download_file(transcripts_url, transcripts_path) - download_file(nuclei_url, nuclei_path) - - xs = XeniumSample().load_transcripts(path=transcripts_path, min_qv=args.min_qv) - xs.load_nuclei(path=nuclei_path) - - if args.parallel: - xs.save_dataset_for_segger_parallel( - processed_data_dir, - d_x=args.d_x, d_y=args.d_y, x_size=args.x_size, y_size=args.y_size, - margin_x=args.margin_x, margin_y=args.margin_y, - r_tx=args.r_tx, - val_prob=args.val_prob, - test_prob=args.test_prob, - compute_labels=args.compute_labels, - sampling_rate=args.sampling_rate, - num_workers=args.num_workers, - receptive_field={ - "k_nc": args.k_nc, - "dist_nc": args.dist_nc, - "k_tx": args.k_tx, - "dist_tx": args.dist_tx - } - ) - else: - xs.save_dataset_for_segger( - processed_data_dir, - d_x=args.d_x, d_y=args.d_y, x_size=args.x_size, y_size=args.y_size, - margin_x=args.margin_x, margin_y=args.margin_y, - r_tx=args.r_tx, - val_prob=args.val_prob, - test_prob=args.test_prob, - compute_labels=args.compute_labels, - sampling_rate=args.sampling_rate, - receptive_field={ - "k_nc": args.k_nc, - "dist_nc": args.dist_nc, - "k_tx": args.k_tx, - "dist_tx": args.dist_tx - } - ) - - print("Dataset creation completed.") - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Create dataset from Xenium Human Pancreatic data.") - parser.add_argument("--raw_data_dir", type=str, required=True, help="Directory to store raw data.") - parser.add_argument("--processed_data_dir", type=str, required=True, help="Directory to store processed data.") - parser.add_argument("--transcripts_url", type=str, required=True, help="URL for transcripts data.") - parser.add_argument("--nuclei_url", type=str, required=True, help="URL for nuclei data.") - parser.add_argument("--min_qv", type=int, default=30, help="Minimum quality value for filtering transcripts.") - parser.add_argument("--d_x", type=int, default=180, help="Step size in x direction for tiles.") - parser.add_argument("--d_y", type=int, default=180, help="Step size in y direction for tiles.") - parser.add_argument("--x_size", type=int, default=200, help="Width of each tile.") - parser.add_argument("--y_size", type=int, default=200, help="Height of each tile.") - parser.add_argument("--margin_x", type=int, default=None, help="Margin in x direction.") - parser.add_argument("--margin_y", type=int, default=None, help="Margin in y direction.") - parser.add_argument("--r_tx", type=int, default=3, help="Radius for building the graph.") - parser.add_argument("--val_prob", type=float, default=0.1, help="Probability of assigning a tile to the validation set.") - parser.add_argument("--test_prob", type=float, default=0.1, help="Probability of assigning a tile to the test set.") - parser.add_argument("--k_nc", type=int, default=3, help="Number of nearest neighbors for nuclei.") - parser.add_argument("--dist_nc", type=int, default=10, help="Distance threshold for nuclei.") - parser.add_argument("--k_tx", type=int, default=5, help="Number of nearest neighbors for transcripts.") - parser.add_argument("--dist_tx", type=int, default=3, help="Distance threshold for transcripts.") - 
parser.add_argument("--compute_labels", type=bool, default=True, help="Whether to compute edge labels.") - parser.add_argument("--sampling_rate", type=float, default=1, help="Rate of sampling tiles.") - parser.add_argument("--parallel", action='store_true', help="Use parallel processing.") - parser.add_argument("--num_workers", type=int, default=4, help="Number of workers for parallel processing.") - - args = parser.parse_args() - main(args) diff --git a/scripts/create_dataset.sh b/scripts/create_dataset.sh deleted file mode 100644 index 4c81f23..0000000 --- a/scripts/create_dataset.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -# Ensure the script exits if a command fails -set -e - -# Define the data directory paths -RAW_DATA_DIR="data_raw/pancreatic" -PROCESSED_DATA_DIR="data_tidy/pyg_datasets/pancreatic" - -# Define the data URLs -TRANSCRIPTS_URL="https://cf.10xgenomics.com/samples/xenium/1.3.0/xenium_human_pancreas/analysis/transcripts.csv.gz" -NUCLEI_URL="https://cf.10xgenomics.com/samples/xenium/1.3.0/xenium_human_pancreas/analysis/nucleus_boundaries.csv.gz" - -# Run the data preparation script -python scripts/create_dataset.py \ - --raw_data_dir $RAW_DATA_DIR \ - --processed_data_dir $PROCESSED_DATA_DIR \ - --transcripts_url $TRANSCRIPTS_URL \ - --nuclei_url $NUCLEI_URL \ - --min_qv 30 \ - --d_x 180 \ - --d_y 180 \ - --x_size 200 \ - --y_size 200 \ - --r_tx 3 \ - --val_prob 0.1 \ - --test_prob 0.1 \ - --k_nc 3 \ - --dist_nc 10 \ - --k_tx 5 \ - --dist_tx 3 \ - --compute_labels True \ - --sampling_rate 0.1 \ - --parallel \ - --num_workers 4 - - -git config --global user.email elyas.heidari@dkfz-heidelberg.de \ No newline at end of file diff --git a/scripts/create_dataset_2.py b/scripts/create_dataset_2.py deleted file mode 100644 index 25428f4..0000000 --- a/scripts/create_dataset_2.py +++ /dev/null @@ -1,66 +0,0 @@ -import scanpy as sc -from segger.data.utils import * -# Assuming we have an AnnData object `adata` from scRNA-seq analysis - - -raw_data_dir = Path('data_raw/xenium/') -processed_data_dir = Path('data_tidy/pyg_datasets') -sample_tag = "Xenium_FFPE_Human_Breast_Cancer_Rep1" - -raw_data_dir.mkdir(parents=True, exist_ok=True) -processed_data_dir.mkdir(parents=True, exist_ok=True) - -transcripts_path = raw_data_dir / "transcripts.parquet" -nuclei_path = raw_data_dir / sample_tag / "nucleus_boundaries.parquet" -scRNAseq_path = '/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad' - -scRNAseq = sc.read(scRNAseq_path) - - -# Step 1: Calculate the gene cell type abundance embedding -celltype_column = 'celltype_minor' -gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(scRNAseq, celltype_column) - -# Step 2: Create a XeniumSample instance -xenium_sample = XeniumSample() - -# Step 3: Load transcripts and include the cell type abundance embedding -xenium_sample.load_transcripts( - base_path=Path(raw_data_dir), - sample=sample_tag, - transcripts_filename='transcripts.parquet', - file_format="parquet", - additional_embeddings={"cell_type_abundance": gene_celltype_abundance_embedding} -) - -# Step 4: Set the embedding to "cell_type_abundance" -xenium_sample.set_embedding("cell_type_abundance") - -xenium_sample.load_nuclei(path=nuclei_path, file_format='parquet') - -xenium_sample.get_bounding_box(x_max=1000, y_max=1000, in_place=True) - -xenium_sample.save_dataset_for_segger( - processed_dir=processed_data_dir / (sample_tag +'_emb'), - x_size=200, - y_size=200, - d_x=180, - d_y=180, - margin_x=20, - margin_y=20, - r_tx=10, - 
val_prob=.2, - test_prob=.2, - compute_labels=True, - sampling_rate=1, - num_workers=0, - receptive_field={ - "k_nc": 5, - "dist_nc": 10, - "k_tx": 10, - "dist_tx": 3, - }, - ) - -# Now, `pyg_data` will use the cell type abundance embedding for the transcripts - diff --git a/scripts/create_tile_speed.py b/scripts/create_tile_speed.py deleted file mode 100644 index 6ccca9f..0000000 --- a/scripts/create_tile_speed.py +++ /dev/null @@ -1,140 +0,0 @@ -import scanpy as sc -from segger.data.io import * -from pathlib import Path -import time -import pandas as pd -import matplotlib.pyplot as plt - -# Assuming we have an AnnData object `adata` from scRNA-seq analysis -raw_data_dir = Path('data_raw/xenium/') -processed_data_dir = Path('data_tidy/pyg_datasets') -sample_tag = "Xenium_FFPE_Human_Breast_Cancer_Rep1" - -raw_data_dir.mkdir(parents=True, exist_ok=True) -processed_data_dir.mkdir(parents=True, exist_ok=True) - -transcripts_path = raw_data_dir / "transcripts.parquet" -nuclei_path = raw_data_dir / sample_tag / "nucleus_boundaries.parquet" -scRNAseq_path = '/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad' - -scRNAseq = sc.read(scRNAseq_path) - -sc.pp.subsample(scRNAseq, 0.1) - -# Step 1: Calculate the gene cell type abundance embedding -celltype_column = 'celltype_minor' -gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(scRNAseq, celltype_column) - -# Step 2: Create a XeniumSample instance -xenium_sample = XeniumSample() - -# Step 3: Load transcripts and include the cell type abundance embedding -xenium_sample.load_transcripts( - base_path=Path(raw_data_dir), - sample=sample_tag, - transcripts_filename='transcripts.parquet', - file_format="parquet", - additional_embeddings={"cell_type_abundance": gene_celltype_abundance_embedding} -) - -# Step 4: Set the embedding to "cell_type_abundance" -xenium_sample.set_embedding("cell_type_abundance") - -# Load nuclei data -xenium_sample.load_boundaries(path=nuclei_path, file_format='parquet') - -# xenium_sample.get_bounding_box(x_min = 1000, y_min = 1000, x_max=1360, y_max=1360, in_place=True) - - -# xenium_sample.precompute_tx_tx_graph(k=5, dist=10, workers = 8) - -import time -t0 = time.time() -data = xenium_sample.build_pyg_data_from_tile( - boundaries_df=xenium_sample.boundaries_df, - transcripts_df=xenium_sample.transcripts_df, - r_tx = 20, - k_tx = 20, - use_precomputed=True, - workers=1 - ) -t1 = time.time() - -t0 = time.time() -xenium_sample.save_dataset_for_segger( - processed_dir = Path('data_tidy/pyg_datasets/embedding'), - x_size = 360, - y_size = 360, - d_x = 180, - d_y = 180, - margin_x = 10, - margin_y = 10, - compute_labels = False, - r_tx = 5, - k_tx = 5, # New parameter for k_tx - val_prob = 0.1, - test_prob = 0.2, - neg_sampling_ratio_approx = 5, - sampling_rate = 1, - num_workers = 1, - receptive_field = { - "k_bd": 4, - "dist_bd": 15, - "k_tx": 5, - "dist_tx": 5, - }, - # gpu: bool = False, - # workers: int = 1, - use_precomputed = False # New argument - ) -t1 = time.time() -# Crop to a smaller bounding box to speed up the comparison - - -# # Compare the speed of different methods -# methods = ['kd_tree', 'hnsw', 'faiss_cpu', 'faiss_gpu'] - -# # methods = ['faiss_cpu', 'kd_tree'] -# timings = {} - -# # Measure the time taken by each method -# for method in methods: -# base_method = method -# if 'faiss' in method: -# gpu = 'gpu' in method # Determine if GPU should be used for FAISS -# base_method = method.split('_')[0] -# else: -# gpu = False # RAPIDS and cuGraph always use GPU, 
no need for the flag - -# # Extract the base method (e.g., 'faiss', 'rapids', etc.) - -# start_time = time.time() -# data = xenium_sample.build_pyg_data_from_tile( -# boundaries_df=xenium_sample.boundaries_df, -# transcripts_df=xenium_sample.transcripts_df, -# compute_labels=True, -# method=base_method, -# gpu=gpu, -# workers=1 -# ) -# elapsed_time = time.time() - start_time -# timings[method] = elapsed_time -# print(f"{method} method took {elapsed_time:.4f} seconds") - -# # Save timings to a CSV file -# timings_df = pd.DataFrame(list(timings.items()), columns=['Method', 'Time']) -# timings_df.to_csv('timings_results.csv', index=False) - -# # Generate a bar plot of the timings -# plt.figure(figsize=(10, 6)) -# plt.bar(timings_df['Method'], timings_df['Time'], color='skyblue') -# plt.xlabel('Method') -# plt.ylabel('Time (seconds)') -# plt.title('Timing Comparison of Different Methods') -# plt.xticks(rotation=45) -# plt.tight_layout() - -# # Save the plot as an image file -# plt.savefig('timings_comparison_plot.png') - -# print("Results saved to 'timings_results.csv' and 'timings_comparison_plot.png'.") diff --git a/scripts/create_tile_speed_iter.py b/scripts/create_tile_speed_iter.py deleted file mode 100644 index 34edec3..0000000 --- a/scripts/create_tile_speed_iter.py +++ /dev/null @@ -1,137 +0,0 @@ -import scanpy as sc -from segger.data.io import * -from pathlib import Path -import time -import pandas as pd -import matplotlib.pyplot as plt - -# Load your data and initial setup -raw_data_dir = Path('data_raw/xenium/') -processed_data_dir = Path('data_tidy/pyg_datasets') -figures_dir = Path('figures/') -sample_tag = "Xenium_FFPE_Human_Breast_Cancer_Rep1" - -# Create the figures directory if it doesn't exist -figures_dir.mkdir(parents=True, exist_ok=True) - -raw_data_dir.mkdir(parents=True, exist_ok=True) -processed_data_dir.mkdir(parents=True, exist_ok=True) - -transcripts_path = raw_data_dir / "transcripts.parquet" -nuclei_path = raw_data_dir / sample_tag / "nucleus_boundaries.parquet" -scRNAseq_path = '/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad' - -scRNAseq = sc.read(scRNAseq_path) - -sc.pp.subsample(scRNAseq, 0.1) - -# Step 1: Calculate the gene cell type abundance embedding -celltype_column = 'celltype_minor' -gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(scRNAseq, celltype_column) - -# Step 2: Create a XeniumSample instance -xenium_sample = XeniumSample() - -# Step 3: Load transcripts and include the cell type abundance embedding -xenium_sample.load_transcripts( - base_path=Path(raw_data_dir), - sample=sample_tag, - transcripts_filename='transcripts.parquet', - file_format="parquet", - additional_embeddings={"cell_type_abundance": gene_celltype_abundance_embedding} -) - - - -data = xenium_sample.build_pyg_data_from_tile( - boundaries_df=xenium_sample.boundaries_df, - transcripts_df=xenium_sample.transcripts_df, - use_precomputed=True, - workers=1 - ) -# Step 4: Set the embedding to "cell_type_abundance" -xenium_sample.set_embedding("cell_type_abundance") - -# Load nuclei data -xenium_sample.load_boundaries(path=nuclei_path, file_format='parquet') - -# Set initial bounds -xenium_sample._set_bounds() - -# Define the methods to compare -methods = ['kd_tree', 'hnsw', 'faiss_cpu', 'faiss_gpu'] - -# Initialize an empty dictionary to store timings for each method and subset size -all_timings = [] - -# Step 1: Measure the time taken by each method on progressively smaller datasets -x_range = (xenium_sample.x_max - 
xenium_sample.x_min) -y_range = (xenium_sample.y_max - xenium_sample.y_min) - -while x_range > 10 and y_range > 10: # Arbitrary cutoff to avoid excessively small subsets - # Update bounds for the current subset - xenium_sample.get_bounding_box( - x_min=xenium_sample.x_min, - y_min=xenium_sample.y_min, - x_max=xenium_sample.x_min + x_range, - y_max=xenium_sample.y_min + y_range, - in_place=True - ) - - # Record the number of transcripts - num_transcripts = len(xenium_sample.transcripts_df) - - # Measure the time for each method - timings = {} - for method in methods: - base_method = method - if 'faiss' in method: - gpu = 'gpu' in method # Determine if GPU should be used for FAISS - base_method = method.split('_')[0] - else: - gpu = False # RAPIDS and cuGraph always use GPU, no need for the flag - - start_time = time.time() - data = xenium_sample.build_pyg_data_from_tile( - boundaries_df=xenium_sample.boundaries_df, - transcripts_df=xenium_sample.transcripts_df, - compute_labels=True, - method=base_method, - gpu=gpu, - workers=1 - ) - elapsed_time = time.time() - start_time - timings[method] = elapsed_time - print(f"{method} method took {elapsed_time:.4f} seconds on {num_transcripts} transcripts") - - # Store the results - timings['num_transcripts'] = num_transcripts - all_timings.append(timings) - - # Reduce the bounding box size by half - x_range /= 2 - y_range /= 2 - -# Convert the results to a DataFrame -timings_df = pd.DataFrame(all_timings) - -# Save the results to a CSV file -timings_df.to_csv('timings_results_by_subset.csv', index=False) - -# Step 2: Plot the results with color-blind-friendly colors -color_palette = ['#377eb8', '#ff7f00', '#4daf4a', '#f781bf'] # A color-blind-friendly palette - -plt.figure(figsize=(10, 6)) -for method, color in zip(methods, color_palette): - plt.plot(timings_df['num_transcripts'], timings_df[method], label=method, color=color) - -plt.xlabel('Number of Transcripts') -plt.ylabel('Time (seconds)') -plt.title('Method Timing vs. 
Number of Transcripts') -plt.legend() -plt.grid(True) - -# Save the plot as a PDF -plt.savefig(figures_dir / 'method_timing_vs_transcripts.pdf', format='pdf') - -plt.show() From 103f8d02e995fe4261346aaf4955be93ace86606 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 06:19:52 +0200 Subject: [PATCH 016/156] Update static.yml --- .github/workflows/static.yml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index 69cb4cc..f01442e 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -60,4 +60,15 @@ jobs: with: path: './site' - + + publish: + needs: [build] + if: contains(github.ref, 'tags') + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Build and publish to pypi + uses: JRubics/poetry-publish@v1.17 + with: + python_version: "3.10" + pypi_token: ${{ secrets.PYPI_TOKEN }} From ef9cfc14f79c672c1d8be207f55faa88bd12304b Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 06:34:11 +0200 Subject: [PATCH 017/156] Update static.yml --- .github/workflows/static.yml | 38 ++++++++---------------------------- 1 file changed, 8 insertions(+), 30 deletions(-) diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index f01442e..6d28307 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -3,7 +3,6 @@ name: Deploy MkDocs to GitHub Pages on: push: branches: ["main"] - workflow_dispatch: permissions: @@ -20,55 +19,34 @@ jobs: runs-on: ubuntu-latest steps: - # Step 1: Checkout the repository - name: Checkout repository uses: actions/checkout@v4 - # Step 2: Set up Python environment - name: Set up Python uses: actions/setup-python@v4 with: python-version: '3.10' - - # Step 3: Install package and set PYTHONPATH - name: Install package and documentation dependencies run: | - # export PYTHONPATH=$PYTHONPATH:$(pwd) pip install -e . 
- # Step 4: Install MkDocs and required plugins - name: Install MkDocs and plugins run: | pip install mkdocs mkdocs-material mkdocs-autorefs mkdocstrings[python] mkdocs-jupyter pymdown-extensions termynal mkdocs-minify-plugin - # Step 5: Debug environment to check installed packages - name: Debug environment run: | python -m pip freeze python -m mkdocs --version - - # Step 6: Build the MkDocs site with verbose output - name: Build MkDocs site - run: mkdocs gh-deploy --force - - - # Step 7: Upload the generated site directory as an artifact for GitHub Pages - - name: Upload artifact - uses: actions/upload-pages-artifact@v3 - with: - path: './site' - + run: mkdocs build --verbose - publish: - needs: [build] - if: contains(github.ref, 'tags') - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Build and publish to pypi - uses: JRubics/poetry-publish@v1.17 - with: - python_version: "3.10" - pypi_token: ${{ secrets.PYPI_TOKEN }} + - name: Deploy to GitHub Pages + run: | + git config --global user.name "GitHub Actions" + git config --global user.email "actions@github.com" + mkdocs gh-deploy --force + env: + GH_TOKEN: ${{ secrets.GH_TOKEN }} From d996d3898c5f5298a2eae6abfe4d237e616fdd27 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 06:43:11 +0200 Subject: [PATCH 018/156] Update static.yml --- .github/workflows/static.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index 6d28307..a9eb529 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -45,8 +45,6 @@ jobs: - name: Deploy to GitHub Pages run: | - git config --global user.name "GitHub Actions" - git config --global user.email "actions@github.com" mkdocs gh-deploy --force env: GH_TOKEN: ${{ secrets.GH_TOKEN }} From 721b468e6850d6e6645a6410293c0fe13134b829 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 06:44:16 +0200 Subject: [PATCH 019/156] Update static.yml --- .github/workflows/static.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index a9eb529..dc6cfba 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -45,6 +45,8 @@ jobs: - name: Deploy to GitHub Pages run: | - mkdocs gh-deploy --force + git config --global user.name "GitHub Actions" + git config --global user.email "actions@github.com" + mkdocs gh-deploy --force --remote-name origin env: - GH_TOKEN: ${{ secrets.GH_TOKEN }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 20abe07b733f8a758fe3e8e5a2dec879730051b1 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 06:50:07 +0200 Subject: [PATCH 020/156] Update segger_tutorial.ipynb --- docs/notebooks/segger_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/notebooks/segger_tutorial.ipynb b/docs/notebooks/segger_tutorial.ipynb index 96ccb70..27d380c 100644 --- a/docs/notebooks/segger_tutorial.ipynb +++ b/docs/notebooks/segger_tutorial.ipynb @@ -78,7 +78,7 @@ } }, "source": [ - "To create the dataset, you need to specify the path to the **transcripts** file and the **nuclei boundaries** file. 
These are typically downloaded from a spatial transcriptomics dataset like the **Xenium Human Pancreatic Dataset**.\n", + "To create the dataset, you need to specify the path to the **transcripts** file and the **nuclei boundaries** file. These are typically downloaded from a spatial transcriptomics dataset like the [Xenium Human Pancreatic Dataset](https://www.10xgenomics.com/products/xenium-human-pancreatic-dataset-explorer).\n", "\n", "- **`--transcripts_path`**: Path to the transcripts file, which contains single-cell transcriptomic data.\n", "- **`--boundaries_path`**: Path to the boundaries file, most often representing the nuclei boundaries in the imaging dataset." From 23a1e9480515efe1ea6abf7c906b69cf88617492 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 12 Sep 2024 06:51:55 +0200 Subject: [PATCH 021/156] Update static.yml --- .github/workflows/static.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index dc6cfba..b6a7e3a 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -6,7 +6,7 @@ on: workflow_dispatch: permissions: - contents: read + contents: write pages: write id-token: write From e24f997b90f1ad699565a33842bf756a40757e27 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Thu, 12 Sep 2024 08:37:42 +0200 Subject: [PATCH 022/156] added notebooks to the docs --- mkdocs.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mkdocs.yml b/mkdocs.yml index 2fc7d31..06f6922 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -45,6 +45,8 @@ nav: - Dataset Creation: user_guide/data_creation.md - Training: user_guide/training.md - Validation: user_guide/validation.md + - Tutorials: + - Sample workflow: notebooks/segger_tutorial.md - CLI: cli.md - API Reference: - Data: api/data/index.md From b103e2dfe628e25327f206163c4068131a497861 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Thu, 12 Sep 2024 09:07:36 +0200 Subject: [PATCH 023/156] edited the readme --- README.md | 103 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 61 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 6b5e926..fd7860b 100644 --- a/README.md +++ b/README.md @@ -1,75 +1,94 @@ -# Segger +# 🍳 Welcome to segger +**segger** is a cutting-edge tool for **cell segmentation** in **single-molecule spatial omics** datasets. By leveraging **graph neural networks (GNNs)** and heterogeneous graphs, segger offers unmatched accuracy and scalability. -*Segger* is a cell segmentation model for single-molecule resolved datasets, addressing the challenges of accurate and fast single-cell segmentation in imaging-based spatial omics. By leveraging the co-occurrence of nucleic and cytoplasmic molecules (e.g., transcripts), Segger employs a heterogeneous graph structure integrating fixed-radius nearest neighbor graphs for nuclei and molecules, with edges connecting transcripts to nuclei based on spatial proximity. A graph neural network (GNN) propagates information across these edges to learn molecule-nuclei associations, refining cell borders post-training. Benchmarks on 10X Xenium and MERSCOPE demonstrate Segger's superior accuracy and efficiency over existing methods like Baysor and Cellpose, with faster training and easy adaptability to different datasets and technologies. 
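As a minimal, illustrative sketch of the heterogeneous graph described in the paragraph above (not taken from the package itself), the two node types and two edge types can be laid out in PyTorch Geometric roughly as follows; the `tx`/`bd` names and the `belongs`/`neighbors` relations follow the conventions used elsewhere in this repository, while the feature shapes and edge lists are placeholders:

```python
import torch
from torch_geometric.data import HeteroData

# Toy heterogeneous graph: transcript ("tx") and nucleus/boundary ("bd") nodes.
data = HeteroData()
data["tx"].x = torch.rand(6, 16)  # placeholder transcript embeddings
data["bd"].x = torch.rand(2, 4)   # placeholder nucleus features (e.g. centroid, area)

# Fixed-radius nearest-neighbor edges among transcripts.
data["tx", "neighbors", "tx"].edge_index = torch.tensor(
    [[0, 1, 2, 3], [1, 2, 3, 4]], dtype=torch.long
)
# Spatial-proximity edges linking transcripts to the nucleus they may belong to.
data["tx", "belongs", "bd"].edge_index = torch.tensor(
    [[0, 1, 4, 5], [0, 0, 1, 1]], dtype=torch.long
)
print(data)
```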
- +# How segger Works ![Segger Model](docs/images/Segger_model_08_2024.png) +--- -## Installation +# Quick Links -To install Segger, clone this repository and install the required dependencies: +- 💾 **[Installation Guide](https://elihei2.github.io/segger_dev/installation/)** + Get started with installing segger on your machine. -```bash -git clone https://github.com/EliHei2/segger_dev.git -cd segger_dev -pip install -r requirements.txt -``` +- 📖 **[User Guide](https://elihei2.github.io/segger_dev/user_guide/)** + Learn how to use segger for cell segmentation tasks. -Alternatively, you can create a conda environment using the provided `environment.yml` file: +- 💻 **[Command-Line Interface (CLI)](https://elihei2.github.io/segger_dev/cli/)** + Explore the CLI options for working with segger. -```bash -conda env create -f environment.yml -conda activate segger -``` +- 📚 **[API Reference](https://elihei2.github.io/segger_dev/api/)** + Dive into the detailed API documentation for advanced usage. -## Download Pancreas Dataset +--- -Download the Pancreas dataset from 10x Genomics: +# Why segger? -1. Go to the [Xenium Human Pancreatic Dataset Explorer](https://www.10xgenomics.com/products/xenium-human-pancreatic-dataset-explorer). -2. Download the `transcripts.csv.gz` and `nucleus_boundaries.csv.gz` files. -3. Place these files in a directory, e.g., `data_raw/pancreas`. +- ⚙️ **Highly parallelizable** – Optimized for multi-GPU environments +- ⚡ **Fast and efficient** – Trains in a fraction of the time compared to alternatives +- 🔄 **Transfer learning** – Easily adaptable to new datasets and technologies -## Creating Dataset +### Challenges in Segmentation -To create a dataset for Segger, use the `create_data.py` script. The script takes several arguments to customize the dataset creation process. +Spatial omics segmentation faces issues like: -```bash -python create_data.py --transcripts_path data_raw/pancreas/transcripts.csv.gz --nuclei_path data_raw/pancreas/nucleus_boundaries.csv.gz --output_dir data_tidy/pyg_datasets/pancreas --d_x 180 --d_y 180 --x_size 200 --y_size 200 --r 3 --val_prob 0.1 --test_prob 0.1 --k_nc 3 --dist_nc 10 --k_tx 5 --dist_tx 3 --compute_labels True --sampling_rate 1 -``` +- **Over/Under-segmentation** +- **Transcript contamination** +- **Scalability limitations** -This command will process the Pancreas dataset and save the processed data in the specified output directory. +segger tackles these with a **graph-based approach**, achieving superior segmentation accuracy. -## Training +--- +## Installation Options -To train the Segger model, use the `train.py` script. The script takes several arguments to customize the training process. +Choose the installation method that best suits your needs. 
+ +### Micromamba Installation ```bash -python train.py --train_dir data_tidy/pyg_datasets/pancreas/train_tiles/processed --val_dir data_tidy/pyg_datasets/pancreas/val_tiles/processed --test_dir data_tidy/pyg_datasets/pancreas/test_tiles/processed --epochs 100 --batch_size_train 4 --batch_size_val 4 --learning_rate 1e-3 --init_emb 8 --hidden_channels 64 --out_channels 16 --heads 4 --aggr sum --accelerator cuda --strategy auto --precision 16-mixed --devices 4 --default_root_dir ./models/pancreas +micromamba create -n segger-rapids --channel-priority 1 \ + -c rapidsai -c conda-forge -c nvidia -c pytorch -c pyg \ + rapids=24.08 python=3.* 'cuda-version>=11.4,<=11.8' jupyterlab \ + 'pytorch=*=*cuda*' 'pyg=*=*cu118' pyg-lib pytorch-sparse +micromamba install -n segger-rapids --channel-priority 1 --file mamba_environment.yml +micromamba run -n segger-rapids pip install --no-deps ./ ``` -This command will train the Segger model on the processed Pancreas dataset and save the trained model in the specified output directory. - -## Prediction - -To make predictions using a trained Segger model, use the `predict.py` script. The script takes several arguments to customize the prediction process. +### GitHub Installation ```bash -python predict.py --train_dir data_tidy/pyg_datasets/pancreas/train_tiles/processed --val_dir data_tidy/pyg_datasets/pancreas/val_tiles/processed --test_dir data_tidy/pyg_datasets/pancreas/test_tiles/processed --checkpoint_path ./models/pancreas/lightning_logs/version_0/checkpoints/epoch=99-step=100.ckpt --batch_size 1 --init_emb 8 --hidden_channels 64 --out_channels 16 --heads 4 --aggr sum --accelerator cuda --devices 1 --default_root_dir ./log_final --score_cut 0.5 --k_nc 4 --dist_nc 20 --k_tx 5 --dist_tx 10 +git clone https://github.com/EliHei2/segger_dev.git +cd segger_dev +pip install . ``` +--- + + + +--- + +# Powered by -This command will use the trained Segger model to make predictions on the Pancreas dataset and save the predictions in the specified output directory. +- ⚡ **PyTorch Lightning & PyTorch Geometric**: Enables fast, efficient graph neural network (GNN) implementation for heterogeneous graphs. +- ⚙️ **Dask**: Scalable parallel processing and distributed task scheduling, ideal for handling large transcriptomic datasets. +- 🗺️ **Shapely & Geopandas**: Utilized for spatial operations such as polygon creation, scaling, and spatial relationship computations. +- 🖥️ **RAPIDS**: Provides GPU-accelerated computation for tasks like k-nearest neighbors (KNN) graph construction. +- 📊 **AnnData & Scanpy**: Efficient processing for single-cell datasets. +- 📐 **SciPy**: Facilitates spatial graph construction, including distance metrics and convex hull calculations for transcript clustering. -## Benchmarking +--- -Benchmarking utilities are provided to evaluate the performance of the Segger model. You can find these utilities in the `benchmark` directory. +# Contributions -## Visualization +segger is **open-source** and welcomes contributions. Join us in advancing spatial omics segmentation! -Visualization scripts are also provided to help visualize the results. You can find these scripts in the `benchmark` directory. +- 🛠️ **Source Code** + [GitHub](https://github.com/EliHei2/segger_dev) -## License +- 🐞 **Bug Tracker** + [Report Issues](https://github.com/EliHei2/segger_dev/issues) -This project is licensed under the MIT License - see the LICENSE file for details. 
+- 📚 **Full Documentation** + [API Reference](https://elihei2.github.io/segger_dev/api/) From d3b75598822a59ed8c6fee474464ed799206c1d0 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Thu, 12 Sep 2024 09:08:33 +0200 Subject: [PATCH 024/156] edited the readme --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index fd7860b..ac858ab 100644 --- a/README.md +++ b/README.md @@ -63,9 +63,6 @@ git clone https://github.com/EliHei2/segger_dev.git cd segger_dev pip install . ``` ---- - - --- From 7c8caa53e3980b6a7d4048e9bf14daa95bef7862 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Thu, 12 Sep 2024 09:10:16 +0200 Subject: [PATCH 025/156] bug fix docs --- mkdocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index 06f6922..e5044d3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -46,7 +46,7 @@ nav: - Training: user_guide/training.md - Validation: user_guide/validation.md - Tutorials: - - Sample workflow: notebooks/segger_tutorial.md + - Sample workflow: notebooks/segger_tutorial.ipynb - CLI: cli.md - API Reference: - Data: api/data/index.md From 239dca9fa7371113c4bb6f81c59e1780b0b9bba3 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Thu, 12 Sep 2024 09:12:42 +0200 Subject: [PATCH 026/156] bug fix docs --- docs/user_guide/data_creation.md | 263 +++++++++++++++++++++++++++++++ 1 file changed, 263 insertions(+) create mode 100644 docs/user_guide/data_creation.md diff --git a/docs/user_guide/data_creation.md b/docs/user_guide/data_creation.md new file mode 100644 index 0000000..571f9ef --- /dev/null +++ b/docs/user_guide/data_creation.md @@ -0,0 +1,263 @@ +# Data Preparation for `segger` + +The `segger` package provides a comprehensive data preparation module for cell segmentation and subsequent graph-based deep learning tasks by leveraging scalable and efficient processing tools. + + +!!! note + Currently, `segger` supports **Xenium** and **Merscope** datasets. + + +## Steps + +The data preparation module offers the following key functionalities: + + +1. **Lazy Loading of Large Datasets**: Utilizes **Dask** to handle large-scale transcriptomics and boundary datasets efficiently, avoiding memory bottlenecks. +2. **Initial Filtering**: Filters transcripts based on quality metrics and dataset-specific criteria to ensure data integrity and relevance. +3. **Tiling**: Divides datasets into spatial tiles, essential for localized graph-based models and parallel processing. +4. **Graph Construction**: Converts spatial data into graph formats using **PyTorch Geometric (PyG)**, enabling the application of graph neural networks (GNNs). +5. **Boundary Processing**: Handles polygons, performs spatial geometrical calculations, and checks transcript overlaps with boundaries. + + +!!! note "Key Technologies" + - **Dask**: Facilitates parallel and lazy data processing, enabling scalable handling of large datasets. + - **PyTorch Geometric (PyG)**: Enables the construction of graph-based data representations suitable for GNNs. + - **Shapely & Geopandas**: Utilized for spatial operations such as polygon creation, scaling, and spatial relationship computations. + - **Dask-Geopandas**: Extends Geopandas for parallel processing of geospatial data, enhancing scalability. + +## Core Components + +### 1. `SpatialTranscriptomicsSample` (Abstract Base Class) + +This abstract class defines the foundational structure for managing spatial transcriptomics datasets. 
It provides essential methods for: + +- **Loading Data**: Scalable loading of transcript and boundary data using Dask. +- **Filtering Transcripts**: Applying quality-based or dataset-specific filtering criteria. +- **Spatial Relationships**: Computing overlaps and spatial relationships between transcripts and boundaries. +- **Tiling**: Dividing datasets into smaller spatial tiles for localized processing. +- **Graph Preparation**: Converting data tiles into `PyTorch Geometric` graph structures. + +#### Key Methods: + +- **`load_transcripts()`**: Loads transcriptomic data from Parquet files, applies quality filtering, and incorporates additional gene embeddings. +- **`load_boundaries()`**: Loads boundary data (e.g., cell or nucleus boundaries) from Parquet files. +- **`get_tile_data()`**: Retrieves transcriptomic and boundary data within specified spatial bounds. +- **`generate_and_scale_polygons()`**: Creates and scales polygon representations of boundaries for spatial computations. +- **`compute_transcript_overlap_with_boundaries()`**: Determines the association of transcripts with boundary polygons. +- **`build_pyg_data_from_tile()`**: Converts tile-specific data into `HeteroData` objects suitable for PyG models. + +### 2. `XeniumSample` and `MerscopeSample` (Child Classes) + +These classes inherit from `SpatialTranscriptomicsSample` and implement dataset-specific processing logic: + +- **`XeniumSample`**: Tailored for **Xenium** datasets, it includes specific filtering rules to exclude unwanted transcripts based on naming patterns (e.g., `NegControlProbe_`, `BLANK_`). +- **`MerscopeSample`**: Designed for **Merscope** datasets, allowing for custom filtering and processing logic as needed. + +## Workflow + +The dataset creation and processing workflow involves several key steps, each ensuring that the spatial transcriptomics data is appropriately prepared for downstream machine learning tasks. + +### Step 1: Data Loading and Filtering + +- **Transcriptomic Data**: Loaded lazily using Dask to handle large datasets efficiently. Custom filtering rules specific to the dataset (Xenium or Merscope) are applied to ensure data quality. +- **Boundary Data**: Loaded similarly using Dask, representing spatial structures such as cell or nucleus boundaries. + +### Step 2: Tiling + +- **Spatial Segmentation**: The dataset is divided into smaller, manageable tiles of size $x_{\text{size}} \times y_{\text{size}}$, defined by their top-left corner coordinates $(x_i, y_j)$. + +$$ +n_x = \left\lfloor \frac{x_{\text{max}} - x_{\text{min}}}{d_x} \right\rfloor, \quad n_y = \left\lfloor \frac{y_{\text{max}} - y_{\text{min}}}{d_y} \right\rfloor +$$ + + Where: + - $x_{\text{min}}, y_{\text{min}}$: Minimum spatial coordinates. + - $x_{\text{max}}, y_{\text{max}}$: Maximum spatial coordinates. + - $d_x, d_y$: Step sizes along the $x$- and $y$-axes, respectively. + +- **Transcript and Boundary Inclusion**: For each tile, transcripts and boundaries within the spatial bounds (with optional margins) are included: + +$$ +x_i - \text{margin}_x \leq x_t < x_i + x_{\text{size}} + \text{margin}_x, \quad y_j - \text{margin}_y \leq y_t < y_j + y_{\text{size}} + \text{margin}_y +$$ + + Where: + - $x_t, y_t$: Transcript coordinates. + - $\text{margin}_x, \text{margin}_y$: Optional margins to include contextual data. 
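The grid arithmetic above can be prototyped in a few lines of plain Python. This is only a sketch: it assumes a pandas DataFrame with generic `x`/`y` coordinate columns rather than the package's actual column names, and it ignores boundary data for brevity.

```python
import numpy as np
import pandas as pd

def tile_transcripts(tx: pd.DataFrame, x_size, y_size, d_x, d_y,
                     margin_x=0.0, margin_y=0.0):
    """Assign transcripts to (possibly overlapping) tiles following the formulas above."""
    x_min, y_min = tx["x"].min(), tx["y"].min()
    x_max, y_max = tx["x"].max(), tx["y"].max()
    # Number of tile origins along each axis.
    n_x = int(np.floor((x_max - x_min) / d_x))
    n_y = int(np.floor((y_max - y_min) / d_y))
    tiles = {}
    for i in range(n_x):
        for j in range(n_y):
            x_i = x_min + i * d_x
            y_j = y_min + j * d_y
            # Keep transcripts inside the tile bounds, expanded by the margins.
            in_tile = (
                (tx["x"] >= x_i - margin_x) & (tx["x"] < x_i + x_size + margin_x) &
                (tx["y"] >= y_j - margin_y) & (tx["y"] < y_j + y_size + margin_y)
            )
            tiles[(i, j)] = tx[in_tile]
    return tiles
```

Because the margins extend each tile's bounds, a transcript near a tile edge can land in more than one tile, which is what gives the localized graphs their shared context.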
+ +### Step 3: Graph Construction + +For each tile, a graph $G$ is constructed with: + +- **Nodes ($V$)**: + - **Transcripts**: Represented by their spatial coordinates $(x_t, y_t)$ and feature vectors $\mathbf{f}_t$. + - **Boundaries**: Represented by centroid coordinates $(x_b, y_b)$ and associated properties (e.g., area). + +- **Edges ($E$)**: + - Created based on spatial proximity using methods like KD-Tree or FAISS. + - Defined by a distance threshold $d$ and the number of nearest neighbors $k$: + +$$ +E = \{ (v_i, v_j) \mid \text{dist}(v_i, v_j) < d, \, v_i \in V, \, v_j \in V \} +$$ + +### Step 4: Label Computation + +If enabled, edges can be labeled based on relationships, such as whether a transcript belongs to a boundary: + +$$ +\text{label}(t, b) = +\begin{cases} +1 & \text{if } t \text{ belongs to } b \\ +0 & \text{otherwise} +\end{cases} +$$ + +### Step 5: Train, Test, Validation Splitting + +The dataset is partitioned into training, validation, and test sets based on predefined probabilities $p_{\text{train}}, p_{\text{val}}, p_{\text{test}}$: + +$$ +p_{\text{train}} + p_{\text{val}} + p_{\text{test}} = 1 +$$ + +Each tile is randomly assigned to one of these sets according to the specified probabilities. + +### Output + +The final output consists of a set of tiles, each containing a graph representation of the spatial transcriptomics data. These tiles are stored in designated directories (`train_tiles`, `val_tiles`, `test_tiles`) and are ready for integration into machine learning pipelines. + + +## Example Usage + +Below are examples demonstrating how to utilize the `segger` data preparation module for both Xenium and Merscope datasets. + +### Xenium Data + +```python +from segger.data import XeniumSample +from pathlib import Path +import scanpy as sc + +# Set up the file paths +raw_data_dir = Path('/path/to/xenium_output') +processed_data_dir = Path('path/to/processed_files') +sample_tag = "sample/tag" + +# Load scRNA-seq data using Scanpy and subsample for efficiency +scRNAseq_path = 'path/to/scRNAseq.h5ad' +scRNAseq = sc.read(scRNAseq_path) +sc.pp.subsample(scRNAseq, fraction=0.1) + +# Calculate gene cell type abundance embedding from scRNA-seq data +from segger.utils import calculate_gene_celltype_abundance_embedding +celltype_column = 'celltype_column' +gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(scRNAseq, celltype_column) + +# Create a XeniumSample instance for spatial transcriptomics processing +xenium_sample = XeniumSample() + +# Load transcripts and include the calculated cell type abundance embedding +xenium_sample.load_transcripts( + base_path=raw_data_dir, + sample=sample_tag, + transcripts_filename='transcripts.parquet', + file_format="parquet", + additional_embeddings={"cell_type_abundance": gene_celltype_abundance_embedding} +) + +# Set the embedding to "cell_type_abundance" to use it in further processing +xenium_sample.set_embedding("cell_type_abundance") + +# Load nuclei data to define boundaries +nuclei_path = raw_data_dir / sample_tag / "nucleus_boundaries.parquet" +xenium_sample.load_boundaries(path=nuclei_path, file_format='parquet') + +# Build PyTorch Geometric (PyG) data from a tile of the dataset +tile_pyg_data = xenium_sample.build_pyg_data_from_tile( + boundaries_df=xenium_sample.boundaries_df, + transcripts_df=xenium_sample.transcripts_df, + r_tx=20, + k_tx=20, + use_precomputed=False, + workers=1 +) + +# Save dataset in processed format for segmentation +xenium_sample.save_dataset_for_segger( + 
processed_dir=processed_data_dir, + x_size=360, + y_size=360, + d_x=180, + d_y=180, + margin_x=10, + margin_y=10, + compute_labels=False, + r_tx=5, + k_tx=5, + val_prob=0.1, + test_prob=0.2, + neg_sampling_ratio_approx=5, + sampling_rate=1, + num_workers=1 +) +``` + +### Merscope Data + +```python +from segger.data import MerscopeSample +from pathlib import Path + +# Set up the file paths +raw_data_dir = Path('path/to/merscope_outputs') +processed_data_dir = Path('path/to/processed_files') +sample_tag = "sample_tag" + +# Create a MerscopeSample instance for spatial transcriptomics processing +merscope_sample = MerscopeSample() + +# Load transcripts from a CSV file +merscope_sample.load_transcripts( + base_path=raw_data_dir, + sample=sample_tag, + transcripts_filename='transcripts.csv', + file_format='csv' +) + +# Optionally load cell boundaries +cell_boundaries_path = raw_data_dir / sample_tag / "cell_boundaries.parquet" +merscope_sample.load_boundaries(path=cell_boundaries_path, file_format='parquet') + +# Filter transcripts based on specific criteria +filtered_transcripts = merscope_sample.filter_transcripts(merscope_sample.transcripts_df) + +# Build PyTorch Geometric (PyG) data from a tile of the dataset +tile_pyg_data = merscope_sample.build_pyg_data_from_tile( + boundaries_df=merscope_sample.boundaries_df, + transcripts_df=filtered_transcripts, + r_tx=15, + k_tx=15, + use_precomputed=True, + workers=2 +) + +# Save dataset in processed format for segmentation +merscope_sample.save_dataset_for_segger( + processed_dir=processed_data_dir, + x_size=360, + y_size=360, + d_x=180, + d_y=180, + margin_x=10, + margin_y=10, + compute_labels=True, + r_tx=5, + k_tx=5, + val_prob=0.1, + test_prob=0.2, + neg_sampling_ratio_approx=3, + sampling_rate=1, + num_workers=2 +) +``` From 16bea778852786fdfff6937600eecc67ca0e8645 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Thu, 12 Sep 2024 09:30:54 +0200 Subject: [PATCH 027/156] edited readme --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index ac858ab..e52cd93 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,9 @@ - 📚 **[API Reference](https://elihei2.github.io/segger_dev/api/)** Dive into the detailed API documentation for advanced usage. +- 📝 **[Sample Workflow](https://elihei2.github.io/segger_dev/notebooks/segger_tutorial/)** + Check out a tutorial showcasing a sample workflow with segger. + --- # Why segger? 
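One detail worth spelling out from the graph-construction step in the data-preparation guide above: transcript–transcript edges are defined by a neighbor count and a distance cutoff (the `k_tx` / `r_tx` parameters passed in the examples). The snippet below is only a rough sketch of that rule using SciPy's KD-tree, not the package's internal implementation:

```python
import numpy as np
from scipy.spatial import cKDTree

def knn_radius_edges(coords: np.ndarray, k_tx: int = 5, r_tx: float = 5.0) -> np.ndarray:
    """Return a 2 x E edge index of tx->tx edges within r_tx among the k_tx nearest neighbors."""
    tree = cKDTree(coords)
    dists, nbrs = tree.query(coords, k=k_tx + 1)  # +1 because each point matches itself
    src, dst = [], []
    for i in range(coords.shape[0]):
        for d, j in zip(dists[i, 1:], nbrs[i, 1:]):
            if d < r_tx:  # keep only neighbors inside the radius cutoff
                src.append(i)
                dst.append(int(j))
    return np.vstack([src, dst])

# Example: 200 random 2D transcript locations on a 50 x 50 tile.
edges = knn_radius_edges(np.random.rand(200, 2) * 50, k_tx=5, r_tx=5.0)
```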
From 1df3d01ec75c141415a70ca8fda2854123c97251 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Mon, 16 Sep 2024 12:59:33 +0200 Subject: [PATCH 028/156] Update issue templates --- .../\342\236\225-performance-improvement.md" | 17 ++++++++ .../\360\237\220\233-bug-report.md" | 41 +++++++++++++++++++ .../\360\237\232\200-feature-request.md" | 20 +++++++++ 3 files changed, 78 insertions(+) create mode 100644 ".github/ISSUE_TEMPLATE/\342\236\225-performance-improvement.md" create mode 100644 ".github/ISSUE_TEMPLATE/\360\237\220\233-bug-report.md" create mode 100644 ".github/ISSUE_TEMPLATE/\360\237\232\200-feature-request.md" diff --git "a/.github/ISSUE_TEMPLATE/\342\236\225-performance-improvement.md" "b/.github/ISSUE_TEMPLATE/\342\236\225-performance-improvement.md" new file mode 100644 index 0000000..b281b2f --- /dev/null +++ "b/.github/ISSUE_TEMPLATE/\342\236\225-performance-improvement.md" @@ -0,0 +1,17 @@ +--- +name: "➕ Performance Improvement" +about: Suggest an improvement in the performance +title: '' +labels: '' +assignees: andrewmoorman, EliHei2 + +--- + +**Describe the issue with the current implementation** +A clear and concise description of what is wrong or not efficient in the implementation. + +**Suggested improvement** +If you know how to fix it, provide suggestions on what to change. + +**Additional context** +Any other context or screenshots that might help clarify the issue. diff --git "a/.github/ISSUE_TEMPLATE/\360\237\220\233-bug-report.md" "b/.github/ISSUE_TEMPLATE/\360\237\220\233-bug-report.md" new file mode 100644 index 0000000..b899e5a --- /dev/null +++ "b/.github/ISSUE_TEMPLATE/\360\237\220\233-bug-report.md" @@ -0,0 +1,41 @@ +--- +name: "\U0001F41B Bug Report" +about: Create a report to help us improve +title: "[BUG]" +labels: '' +assignees: andrewmoorman, EliHei2 + +--- + +--- +name: Bug Report +about: Report a bug or unexpected behavior +title: "[BUG] " +labels: bug +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots or logs** +If applicable, add screenshots or logs to help explain your problem. + +**Environment (please complete the following information):** +- OS: [e.g. macOS, Windows, Linux] +- Python version: [e.g. 3.9] +- Package version: [e.g. 1.2.3] + +**Additional context** +Add any other context about the problem here. diff --git "a/.github/ISSUE_TEMPLATE/\360\237\232\200-feature-request.md" "b/.github/ISSUE_TEMPLATE/\360\237\232\200-feature-request.md" new file mode 100644 index 0000000..08679f6 --- /dev/null +++ "b/.github/ISSUE_TEMPLATE/\360\237\232\200-feature-request.md" @@ -0,0 +1,20 @@ +--- +name: "\U0001F680 Feature Request" +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. 
From 0298dbf4c64b74fba91436465a56422a72fe82b3 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 16 Sep 2024 15:12:18 +0200 Subject: [PATCH 029/156] fixed dependencies: dask_geopandas --- README.md | 2 +- pyproject.toml | 3 ++- src/segger/data/io.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e52cd93..295bc42 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Dive into the detailed API documentation for advanced usage. - 📝 **[Sample Workflow](https://elihei2.github.io/segger_dev/notebooks/segger_tutorial/)** - Check out a tutorial showcasing a sample workflow with segger. + Check out our tutorial showcasing a sample workflow with segger. --- diff --git a/pyproject.toml b/pyproject.toml index 033a2db..d65644e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,8 @@ dependencies = [ "shapely>=1.7.0", "path>=17.0.0", "pyarrow>=17.0.0", - "torch-geometric>=2.2.0" + "torch-geometric>=2.2.0", + "dask_geopandas>=0.4.0" ] [project.optional-dependencies] diff --git a/src/segger/data/io.py b/src/segger/data/io.py index 2face2b..a7c12fd 100644 --- a/src/segger/data/io.py +++ b/src/segger/data/io.py @@ -59,7 +59,7 @@ def __init__( self.boundaries_graph = boundaries_graph self.keys = keys self.embedding_df = None - self.current_embedding = 'one_hot' + self.current_embedding = 'token' self.verbose = verbose @@ -892,7 +892,7 @@ def build_pyg_data_from_tile( # Lazily prepare transcript embeddings (if available) if self.verbose: print("Preparing transcript embeddings..") token_encoding = self.tx_encoder.transform(transcripts_df[self.keys.FEATURE_NAME.value]) - transcripts_df['one_hot'] = token_encoding # Store the integer tokens in the 'one_hot' column + transcripts_df['token'] = token_encoding # Store the integer tokens in the 'token' column data['tx'].token = torch.as_tensor(token_encoding).int() # Handle additional embeddings lazily as well if self.embedding_df: From 2fe91aa2fd0900d3cd08eeecb68b146f217d3a70 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 17 Sep 2024 16:20:06 +0200 Subject: [PATCH 030/156] added scale boundaries as the argument --- pyproject.toml | 4 +- scripts/create_toy_datasets.py | 85 +++++++++++++++++++ setup.py | 145 +++++++++++++++++---------------- src/segger/data/io.py | 29 ++++--- 4 files changed, 177 insertions(+), 86 deletions(-) create mode 100644 scripts/create_toy_datasets.py diff --git a/pyproject.toml b/pyproject.toml index d65644e..de21c0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,9 +83,9 @@ tests = [ [project.urls] bug_tracker = "https://github.com/EliHei2/segger_dev/issues" -documentation = "https://github.com/EliHei2/segger_dev#readme" +documentation = "https://EliHei2.github.io/segger_dev" source_code = "https://github.com/EliHei2/segger_dev" -homepage = "https://github.com/EliHei2/segger_dev" +homepage = "https://EliHei2.github.io/segger_dev" repository = "https://github.com/EliHei2/segger_dev" [tool.setuptools] diff --git a/scripts/create_toy_datasets.py b/scripts/create_toy_datasets.py new file mode 100644 index 0000000..65b886d --- /dev/null +++ b/scripts/create_toy_datasets.py @@ -0,0 +1,85 @@ +import dask.dataframe as dd +import tifffile as tiff # Use tifffile instead of PIL for OME-TIFF +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from pathlib import Path + +# Define the paths to the input Parquet and TIFF files (update these paths to match your file locations) +transcripts_file = 
"data_raw/xenium/Xenium_V1_hPancreas_Cancer_Add_on_FFPE_outs/transcripts.parquet" +nuclei_file = "data_raw/xenium/Xenium_V1_hPancreas_Cancer_Add_on_FFPE_outs/nucleus_boundaries.parquet" +cell_boundaries_file = "data_raw/xenium/Xenium_V1_hPancreas_Cancer_Add_on_FFPE_outs/cell_boundaries.parquet" +morphology_tiff_file = "data_raw/xenium/Xenium_V1_hPancreas_Cancer_Add_on_FFPE_outs/morphology.ome.tif" + +# Define the output directory for the toy dataset +output_dir = Path("data_raw/package_toy_data/xenium_pancreas_cancer") +output_dir.mkdir(parents=True, exist_ok=True) + +def find_fovs_in_square(transcripts_file, square_size=3): + print(f"Loading transcripts from {transcripts_file} using Dask...") + transcripts_df = dd.read_parquet(transcripts_file) + fov_list = transcripts_df['fov_name'].drop_duplicates().compute().tolist() + sorted_fovs = sorted(fov_list) + middle_index = len(sorted_fovs) // 2 + half_size = square_size // 2 + start_index = max(middle_index - half_size, 0) + end_index = min(middle_index + half_size + 1, len(sorted_fovs)) + selected_fovs = sorted_fovs[start_index:end_index] + if len(selected_fovs) < square_size ** 2: + print("Warning: The selected square is smaller than expected due to slide boundaries.") + print(f"Selected FOVs: {selected_fovs}") + return selected_fovs, transcripts_df + +def filter_transcripts_by_fovs(transcripts_df, fovs): + print("Filtering transcripts based on selected FOVs...") + filtered_transcripts = transcripts_df[transcripts_df['fov_name'].isin(fovs)] + return filtered_transcripts.compute() + +def filter_boundaries_by_cells(file_path, cell_ids): + print(f"Loading boundaries from {file_path} using Dask...") + boundaries_df = dd.read_parquet(file_path) + filtered_boundaries = boundaries_df[boundaries_df['cell_id'].isin(cell_ids)] + return filtered_boundaries.compute() + +def save_to_parquet(df, output_file): + print(f"Saving data to {output_file}...") + df.to_parquet(output_file, index=False) + print(f"Data saved to {output_file}.") + +def visualize_fovs_on_tiff(fovs_in_square, filtered_fovs_df, tiff_image_file, output_image_file, fov_column='fov_name', x_column='x', y_column='y', width_column='width', height_column='height'): + print(f"Loading TIFF image from {tiff_image_file}...") + tiff_image = tiff.imread(tiff_image_file) + plt.figure(figsize=(10, 10)) + plt.imshow(tiff_image, cmap='gray') + for _, row in filtered_fovs_df.iterrows(): + x, y, width, height = row[x_column], row[y_column], row[width_column], row[height_column] + rect = patches.Rectangle((x, y), width, height, linewidth=1, edgecolor='r', facecolor='none') + plt.gca().add_patch(rect) + plt.title("Selected FOVs over Morphology TIFF Image") + plt.savefig(output_image_file) + plt.show() + +# Step 1: Get FOVs from the transcripts file for the middle square +square_size = 4 # Example: 4x4 FOVs +fovs_in_square, transcripts_df = find_fovs_in_square(transcripts_file, square_size) + +# Step 2: Filter transcripts for the selected FOVs +filtered_transcripts = filter_transcripts_by_fovs(transcripts_df, fovs_in_square) + +# Step 3: Get the cell_ids from the filtered transcripts +cell_ids_in_fovs = filtered_transcripts['cell_id'].unique() + +# Step 4: Process and save filtered cell boundaries for the selected FOVs +cell_boundaries_df = filter_boundaries_by_cells(cell_boundaries_file, cell_ids_in_fovs) +save_to_parquet(cell_boundaries_df, output_dir / f"cell_boundaries.parquet") + +# Step 5: Process and save filtered nuclei boundaries for the selected FOVs +nuclei_boundaries_df = 
filter_boundaries_by_cells(nuclei_file, cell_ids_in_fovs) +save_to_parquet(nuclei_boundaries_df, output_dir / f"nuclei_boundaries.parquet") + +# Step 6: Process and save filtered transcripts for the selected FOVs +save_to_parquet(filtered_transcripts, output_dir / f"transcripts.parquet") + +# Step 7: Visualize the selected FOVs as squares on top of the TIFF image and save the plot +visualize_fovs_on_tiff(fovs_in_square, filtered_transcripts, morphology_tiff_file, output_dir / "fovs_on_tiff.png") + +print("Toy dataset generation and visualization complete!") diff --git a/setup.py b/setup.py index 5e21e80..964fe9b 100644 --- a/setup.py +++ b/setup.py @@ -1,83 +1,90 @@ from setuptools import setup, find_packages setup( - name='segger', - version='0.1.0', - author='Elyas Heidari', - author_email='elyas.heidari@dkfz-heidelberg.de', - description='Fast and accurate cell segmentation for single-molecule spatial omics', - packages=find_packages(where="src"), - package_dir={"": "src"}, + name="segger", + version="0.1.0", + description="Fast and accurate cell segmentation for single-molecule spatial omics", + author="Elyas Heidari", + author_email="elyas.heidari@dkfz-heidelberg.de", + license="MIT", + long_description=open("README.md").read(), + long_description_content_type="text/markdown", + python_requires=">=3.10", + keywords=["segmentation", "deep learning", "pytorch", "geometric deep learning"], install_requires=[ - 'torch>=2.0.0', - 'numpy>=1.21.0', - 'pandas>=1.3.0', - 'scipy>=1.7.0', - 'matplotlib>=3.4.0', - 'seaborn>=0.11.0', - 'tqdm>=4.61.0', - 'torchvision>=0.10.0', - 'lightning>=1.9.0', - 'torchmetrics>=0.5.0', - 'scanpy>=1.9.3', - 'squidpy>=1.1.0', - 'adjustText>=0.8', - 'scikit-learn>=0.24.0', - 'geopandas>=0.9.0', - 'shapely>=1.7.0', - 'path>=17.0.0', - 'pyarrow>=17.0.0' + "torch>=2.0.0", + "numpy>=1.21.0", + "pandas>=1.3.0", + "scipy>=1.7.0", + "matplotlib>=3.4.0", + "seaborn>=0.11.0", + "tqdm>=4.61.0", + "torchvision>=0.10.0", + "lightning>=1.9.0", + "torchmetrics>=0.5.0", + "scanpy>=1.9.3", + "squidpy==1.2.0", + "adjustText>=0.8", + "scikit-learn>=0.24.0", + "geopandas>=0.9.0", + "shapely>=1.7.0", + "path>=17.0.0", + "pyarrow>=17.0.0", + "torch-geometric>=2.2.0", + "dask_geopandas>=0.4.0" ], extras_require={ - 'gpu': [ - 'cuml>=21.08', - 'cudf>=21.08', - 'cugraph>=21.08', - 'cuspatial>=21.08', - 'faiss-cpu>=1.7.0', - 'faiss-gpu>=1.7.0; platform_system=="Linux"' + "gpu": [ + "cuml>=21.08", + "cudf>=21.08", + "cugraph>=21.08", + "cuspatial>=21.08", + "faiss-cpu>=1.7.0", + "faiss-gpu>=1.7.0" + ], + "torch-geometric": [ + "torch-scatter>=2.1.2", + "torch-sparse>=0.6.18", + "torch-cluster>=1.6.3", ], - 'torch-geometric': [ - 'torch-scatter>=2.1.2', - 'torch-sparse>=0.6.18', - 'torch-cluster>=1.6.3', - 'torch-geometric>=2.2.0', + "multiprocessing": ["multiprocessing"], + "dev": [ + "pytest", + "black", + "flake8", + "pre-commit", + "twine>=4.0.2", ], - 'multiprocessing': ['multiprocessing'], - 'dev': [ - 'pytest', - 'black', - 'flake8', - 'pre-commit', - 'twine>=4.0.2', + "docs": [ + "docutils>=0.8,!=0.18.*,!=0.19.*", + "sphinx>=4.1", + "sphinx-book-theme>=1.0.0", + "myst-nb", + "myst-parser", + "sphinxcontrib-bibtex>=1.0.0", + "sphinx-autodoc-typehints", + "sphinx_rtd_theme", + "sphinxext-opengraph", + "sphinx-copybutton", + "sphinx-design", + "sphinx-hoverxref", + "ipykernel", + "ipython", + "pandas", ], - 'docs': [ - 'docutils>=0.8,!=0.18.*,!=0.19.*', - 'sphinx>=4.1', - 'sphinx-book-theme>=1.0.0', - 'myst-nb', - 'myst-parser', - 'sphinxcontrib-bibtex>=1.0.0', - 
'sphinx-autodoc-typehints', - 'sphinx_rtd_theme', - 'sphinxext-opengraph', - 'sphinx-copybutton', - 'sphinx-design', - 'sphinx-hoverxref', - 'ipykernel', - 'ipython', - 'pandas', + "tests": [ + "pytest", + "coverage", ], - 'tests': [ - 'pytest', - 'coverage', - ] }, - python_requires='>=3.10', + url="https://github.com/EliHei2/segger_dev", project_urls={ - 'Bug Tracker': 'https://github.com/EliHei2/segger_dev/issues', - 'Documentation': 'https://github.com/EliHei2/segger_dev#readme', - 'Source Code': 'https://github.com/EliHei2/segger_dev', - 'Homepage': 'https://github.com/EliHei2/segger_dev', + "Bug Tracker": "https://github.com/EliHei2/segger_dev/issues", + "Documentation": "https://EliHei2.github.io/segger_dev", + "Source Code": "https://github.com/EliHei2/segger_dev", + "Homepage": "https://EliHei2.github.io/segger_dev", + "Repository": "https://github.com/EliHei2/segger_dev", }, + packages=find_packages(where="src"), + package_dir={"": "src"}, ) diff --git a/src/segger/data/io.py b/src/segger/data/io.py index a7c12fd..4cbc3f2 100644 --- a/src/segger/data/io.py +++ b/src/segger/data/io.py @@ -602,12 +602,7 @@ def save_dataset_for_segger( neg_sampling_ratio_approx: float = 5, sampling_rate: float = 1, num_workers: int = 1, - receptive_field: Dict[str, float] = { - "k_bd": 4, - "dist_bd": 20, - "k_tx": 5, - "dist_tx": 10, - }, + scale_boundaries: float = 1.0, method: str = 'kd_tree', gpu: bool = False, workers: int = 1 @@ -631,10 +626,11 @@ def save_dataset_for_segger( neg_sampling_ratio_approx (float, optional): Approximate ratio of negative samples. sampling_rate (float, optional): Rate of sampling tiles. num_workers (int, optional): Number of workers to use for parallel processing. - receptive_field (Dict[str, float], optional): Dictionary containing the values for 'k_bd', 'dist_bd', 'k_tx', and 'dist_tx'. + scale_boundaries (float, optional): The factor by which to scale the boundary polygons. Default is 1.0. method (str, optional): Method for computing edge indices (e.g., 'kd_tree', 'faiss'). gpu (bool, optional): Whether to use GPU acceleration for edge index computation. workers (int, optional): Number of workers to use to compute the neighborhood graph (per tile). 
+ """ # Prepare directories for storing processed tiles self._prepare_directories(processed_dir) @@ -646,7 +642,7 @@ def save_dataset_for_segger( tile_params = self._generate_tile_params( x_range, y_range, x_size, y_size, margin_x, margin_y, compute_labels, r_tx, k_tx, val_prob, test_prob, neg_sampling_ratio_approx, sampling_rate, - processed_dir, receptive_field, method, gpu, workers + processed_dir, scale_boundaries, method, gpu, workers ) # Process each tile using Dask to parallelize the task @@ -693,7 +689,7 @@ def _generate_tile_params( neg_sampling_ratio_approx: float, sampling_rate: float, processed_dir: Path, - receptive_field: Dict[str, float], + scale_boundaries: float, method: str, gpu: bool, workers: int @@ -712,7 +708,7 @@ def _generate_tile_params( ( i, j, x_size, y_size, x_range[i], y_range[j], margin_x, margin_y, compute_labels, r_tx, k_tx, neg_sampling_ratio_approx, val_prob, - test_prob, processed_dir, receptive_field, sampling_rate, + test_prob, processed_dir, scale_boundaries, sampling_rate, method, gpu, workers ) for i in range(len(x_range)) @@ -756,7 +752,7 @@ def _process_tile(self, tile_params: Tuple) -> None: ( i, j, x_size, y_size, x_loc, y_loc, margin_x, margin_y, compute_labels, r_tx, k_tx, neg_sampling_ratio_approx, val_prob, test_prob, processed_dir, - receptive_field, sampling_rate, method, gpu, workers + scale_boundaries, sampling_rate, method, gpu, workers ) = tile_params if self.verbose: print(f"Processing tile at location (x_min: {x_loc}, y_min: {y_loc}), size (width: {x_size}, height: {y_size})") @@ -797,7 +793,7 @@ def _process_tile(self, tile_params: Tuple) -> None: # Build PyG data structure from tile-specific data if self.verbose: print(f"Building PyG data for tile at (x_min: {x_loc}, y_min: {y_loc})...") data = delayed(self.build_pyg_data_from_tile)( - boundaries_df, transcripts_df, r_tx=r_tx, k_tx=k_tx, method=method, gpu=gpu, workers=workers + boundaries_df, transcripts_df, r_tx=r_tx, k_tx=k_tx, method=method, gpu=gpu, workers=workers, scale_boundaries=scale_boundaries ) data = data.compute() @@ -844,7 +840,9 @@ def build_pyg_data_from_tile( k_tx: int = 3, method: str = 'kd_tree', gpu: bool = False, - workers: int = 1 + workers: int = 1, + scale_boundaries: float = 1.0 + ) -> HeteroData: """ Builds PyG data from a tile of boundaries and transcripts data using Dask utilities for efficient processing. @@ -857,7 +855,8 @@ def build_pyg_data_from_tile( method (str, optional): Method for computing edge indices (e.g., 'kd_tree', 'faiss'). gpu (bool, optional): Whether to use GPU acceleration for edge index computation. workers (int, optional): Number of workers to use for parallel processing. - + scale_boundaries (float, optional): The factor by which to scale the boundary polygons. Default is 1.0. + Returns: HeteroData: PyG Heterogeneous Data object. 
""" @@ -866,7 +865,7 @@ def build_pyg_data_from_tile( # Lazily compute boundaries geometries using Dask if self.verbose: print("Computing boundaries geometries...") - bd_gdf = self.compute_boundaries_geometries(boundaries_df) + bd_gdf = self.compute_boundaries_geometries(boundaries_df, scale_factor=scale_boundaries) bd_gdf = bd_gdf[bd_gdf['geometry'].notnull()] # Add boundary node data to PyG HeteroData lazily From 29fc161e1ee46f0ca4e4ebbea3069875d4f6adec Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 23 Sep 2024 08:20:34 +0200 Subject: [PATCH 031/156] dropped setup.py --- setup.py | 90 -------------------------------------------------------- 1 file changed, 90 deletions(-) delete mode 100644 setup.py diff --git a/setup.py b/setup.py deleted file mode 100644 index 964fe9b..0000000 --- a/setup.py +++ /dev/null @@ -1,90 +0,0 @@ -from setuptools import setup, find_packages - -setup( - name="segger", - version="0.1.0", - description="Fast and accurate cell segmentation for single-molecule spatial omics", - author="Elyas Heidari", - author_email="elyas.heidari@dkfz-heidelberg.de", - license="MIT", - long_description=open("README.md").read(), - long_description_content_type="text/markdown", - python_requires=">=3.10", - keywords=["segmentation", "deep learning", "pytorch", "geometric deep learning"], - install_requires=[ - "torch>=2.0.0", - "numpy>=1.21.0", - "pandas>=1.3.0", - "scipy>=1.7.0", - "matplotlib>=3.4.0", - "seaborn>=0.11.0", - "tqdm>=4.61.0", - "torchvision>=0.10.0", - "lightning>=1.9.0", - "torchmetrics>=0.5.0", - "scanpy>=1.9.3", - "squidpy==1.2.0", - "adjustText>=0.8", - "scikit-learn>=0.24.0", - "geopandas>=0.9.0", - "shapely>=1.7.0", - "path>=17.0.0", - "pyarrow>=17.0.0", - "torch-geometric>=2.2.0", - "dask_geopandas>=0.4.0" - ], - extras_require={ - "gpu": [ - "cuml>=21.08", - "cudf>=21.08", - "cugraph>=21.08", - "cuspatial>=21.08", - "faiss-cpu>=1.7.0", - "faiss-gpu>=1.7.0" - ], - "torch-geometric": [ - "torch-scatter>=2.1.2", - "torch-sparse>=0.6.18", - "torch-cluster>=1.6.3", - ], - "multiprocessing": ["multiprocessing"], - "dev": [ - "pytest", - "black", - "flake8", - "pre-commit", - "twine>=4.0.2", - ], - "docs": [ - "docutils>=0.8,!=0.18.*,!=0.19.*", - "sphinx>=4.1", - "sphinx-book-theme>=1.0.0", - "myst-nb", - "myst-parser", - "sphinxcontrib-bibtex>=1.0.0", - "sphinx-autodoc-typehints", - "sphinx_rtd_theme", - "sphinxext-opengraph", - "sphinx-copybutton", - "sphinx-design", - "sphinx-hoverxref", - "ipykernel", - "ipython", - "pandas", - ], - "tests": [ - "pytest", - "coverage", - ], - }, - url="https://github.com/EliHei2/segger_dev", - project_urls={ - "Bug Tracker": "https://github.com/EliHei2/segger_dev/issues", - "Documentation": "https://EliHei2.github.io/segger_dev", - "Source Code": "https://github.com/EliHei2/segger_dev", - "Homepage": "https://EliHei2.github.io/segger_dev", - "Repository": "https://github.com/EliHei2/segger_dev", - }, - packages=find_packages(where="src"), - package_dir={"": "src"}, -) From 98a48bc9c976221b1bdfb15c2becd0e96aeac01d Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 23 Sep 2024 09:15:50 +0200 Subject: [PATCH 032/156] fixed installation --- docs/installation.md | 62 ++++++++++++++++++++++++++++++-------------- pyproject.toml | 58 +++++++++++++++++++++++++++++++---------- requirements.txt | 22 ---------------- 3 files changed, 86 insertions(+), 56 deletions(-) delete mode 100644 requirements.txt diff --git a/docs/installation.md b/docs/installation.md index f6a5f20..ec3c284 100644 --- a/docs/installation.md +++ 
b/docs/installation.md @@ -1,33 +1,55 @@ +## Segger Installation Guide +Select the appropriate installation method based on your requirements. -# Installation Guide - -## Installation Options +=== ":rocket: Micromamba Installation" +```bash +micromamba create -n segger-rapids --channel-priority 1 \ + -c rapidsai -c conda-forge -c nvidia -c pytorch -c pyg \ + rapids=24.08 python=3.* 'cuda-version>=11.4,<=11.8' jupyterlab \ + 'pytorch=*=*cuda*' 'pyg=*=*cu118' pyg-lib pytorch-sparse +micromamba install -n segger-rapids --channel-priority 1 --file mamba_environment.yml +micromamba run -n segger-rapids pip install --no-deps ./ +``` -Select the appropriate installation method based on your requirements. +=== ":snake: Conda Installation" +```bash +conda create -n segger-env python=3.10 +conda activate segger-env +conda install pytorch torchvision torchaudio cudatoolkit=11.7 -c pytorch +conda install pyg -c pyg +pip install . +``` -=== "Micromamba Installation" - ```bash - micromamba create -n segger-rapids --channel-priority 1 \ - -c rapidsai -c conda-forge -c nvidia -c pytorch -c pyg \ - rapids=24.08 python=3.* 'cuda-version>=11.4,<=11.8' jupyterlab \ - 'pytorch=*=*cuda*' 'pyg=*=*cu118' pyg-lib pytorch-sparse - micromamba install -n segger-rapids --channel-priority 1 --file mamba_environment.yml - micromamba run -n segger-rapids pip install --no-deps ./ - ``` +=== ":octocat: Github Installation" +```bash +git clone https://github.com/EliHei2/segger_dev.git +cd segger_dev +pip install . +``` -=== "Github Installation" - ```bash - git clone https://github.com/EliHei2/segger_dev.git - cd segger_dev - pip install . - ``` +=== ":package: Pip Installation (CUDA 11)" +```bash +pip install -e ".[cuda11]" +``` +=== ":package: Pip Installation (CUDA 12)" +```bash +pip install -e ".[cuda12]" +``` +=== ":rocket: Pip Installation (RAPIDS and CUDA 11)" +```bash +pip install "segger[cuda11,rapids11,cupy11,faiss]" +``` +=== ":rocket: Pip Installation (RAPIDS and CUDA 12)" +```bash +pip install "segger[cuda12,rapids12,cupy12,faiss]" +``` !!! warning "Common Installation Issues" - - **Python Version**: Ensure you are using **Python >= 3.10**. Check your version with: + - **Python Version**: Ensure you are using Python >= 3.10. 
Check your version with: ```bash python --version ``` diff --git a/pyproject.toml b/pyproject.toml index de21c0e..dbafbf0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,9 @@ requires-python = ">=3.10" keywords = ["segmentation", "deep learning", "pytorch", "geometric deep learning"] dependencies = [ - "torch>=2.0.0", + "torch>=2.0.0", # Moved to main dependencies + "dask-cuda>=23.10.0", # Moved to main dependencies + "distributed>=2023.10.0", # Moved to main dependencies "numpy>=1.21.0", "pandas>=1.3.0", "scipy>=1.7.0", @@ -33,32 +35,59 @@ dependencies = [ "shapely>=1.7.0", "path>=17.0.0", "pyarrow>=17.0.0", - "torch-geometric>=2.2.0", "dask_geopandas>=0.4.0" ] [project.optional-dependencies] -gpu = [ - "cuml>=21.08", - "cudf>=21.08", - "cugraph>=21.08", - "cuspatial>=21.08", +faiss = [ "faiss-cpu>=1.7.0", "faiss-gpu>=1.7.0" ] -torch-geometric = [ - "torch-scatter>=2.1.2", - "torch-sparse>=0.6.18", - "torch-cluster>=1.6.3", + +cuda11 = [ + "torch-scatter>=2.1.2+pt20cu117", + "torch-sparse>=0.6.18+pt20cu117", + "torch-cluster>=1.6.3+pt20cu117" +] + +cuda12 = [ + "torch-scatter>=2.1.2+pt20cu120", + "torch-sparse>=0.6.18+pt20cu120", + "torch-cluster>=1.6.3+pt20cu120" +] + +rapids11 = [ + "cudf-cu11==24.8.*", + "cuml-cu11==24.8.*", + "cugraph-cu11==24.8.*", + "cuspatial-cu11==24.8.*" +] + +rapids12 = [ + "cudf-cu12==24.8.*", + "cuml-cu12==24.8.*", + "cugraph-cu12==24.8.*", + "cuspatial-cu12==24.8.*" ] + +cupy11 = [ + "cupy-cuda11x" +] + +cupy12 = [ + "cupy-cuda12x" +] + multiprocessing = ["multiprocessing"] + dev = [ "pytest", "black", "flake8", "pre-commit", - "twine>=4.0.2", + "twine>=4.0.2" ] + docs = [ "docutils>=0.8,!=0.18.*,!=0.19.*", "sphinx>=4.1", @@ -74,11 +103,12 @@ docs = [ "sphinx-hoverxref", "ipykernel", "ipython", - "pandas", + "pandas" ] + tests = [ "pytest", - "coverage", + "coverage" ] [project.urls] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 13b2436..0000000 --- a/requirements.txt +++ /dev/null @@ -1,22 +0,0 @@ -numpy>=1.21.0 -pandas>=1.3.0 -scipy>=1.7.0 -matplotlib>=3.4.0 -seaborn>=0.11.0 -tqdm>=4.61.0 -torch>=2.0.0 -torchvision>=0.10.0 -pytorch-lightning>=1.3.0 -torchmetrics>=0.5.0 -# scanpy>=1.8.0 -squidpy==1.2.0 -adjustText>=0.8 -scikit-learn>=0.24.0 -geopandas>=0.9.0 -shapely>=1.7.0 -scanpy>=1.9.3 -torch-geometric>=2.2.0 -# pyg_lib>=0.0.0 -torch_scatter>=2.1.2 -torch_sparse>=0.6.18 -torch_cluster>=1.6.3 \ No newline at end of file From 5f81fc030406db7a4b350ddf41f9377bfe277d27 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 23 Sep 2024 09:18:24 +0200 Subject: [PATCH 033/156] fixed installation --- README.md | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 295bc42..a43fb42 100644 --- a/README.md +++ b/README.md @@ -44,12 +44,18 @@ Spatial omics segmentation faces issues like: segger tackles these with a **graph-based approach**, achieving superior segmentation accuracy. --- +Here’s the updated plain Markdown version for your README file, formatted for clear installation instructions without the need for MkDocs styling: + +--- + ## Installation Options Choose the installation method that best suits your needs. 
### Micromamba Installation +To set up Segger with `micromamba` and install the required dependencies, use the following commands: + ```bash micromamba create -n segger-rapids --channel-priority 1 \ -c rapidsai -c conda-forge -c nvidia -c pytorch -c pyg \ @@ -61,12 +67,49 @@ micromamba run -n segger-rapids pip install --no-deps ./ ### GitHub Installation +For a straightforward local installation from GitHub, clone the repository and install the package using `pip`: + ```bash git clone https://github.com/EliHei2/segger_dev.git cd segger_dev -pip install . ``` +#### Pip Installation (CUDA 11) + +To install with CUDA 11 support: + +```bash +pip install -e ".[cuda11]" +``` + +#### Pip Installation (CUDA 12) + +To install with CUDA 12 support: + +```bash +pip install -e ".[cuda12]" +``` + +#### Pip Installation (RAPIDS and CUDA 11) + +For installations requiring RAPIDS and CUDA 11 support, run: + +```bash +pip install -e ".[cuda11,rapids11,cupy11,faiss]" +``` + +#### Pip Installation (RAPIDS and CUDA 12) + +For installations requiring RAPIDS and CUDA 12 support, run: + +```bash +pip install -e ".[cuda12,rapids12,cupy12,faiss]" +``` + +--- + +This version is well-suited for the plain Markdown format in your README file, providing clear installation instructions with no need for MkDocs-specific features. + --- # Powered by From 6a5eec1ba92234a51d2213450a113228baa33dbe Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 23 Sep 2024 09:24:43 +0200 Subject: [PATCH 034/156] fixed installation --- README.md | 7 ------- 1 file changed, 7 deletions(-) diff --git a/README.md b/README.md index a43fb42..f5aad6b 100644 --- a/README.md +++ b/README.md @@ -43,9 +43,6 @@ Spatial omics segmentation faces issues like: segger tackles these with a **graph-based approach**, achieving superior segmentation accuracy. ---- -Here’s the updated plain Markdown version for your README file, formatted for clear installation instructions without the need for MkDocs styling: - --- ## Installation Options @@ -108,10 +105,6 @@ pip install -e ".[cuda12,rapids12,cupy12,faiss]" --- -This version is well-suited for the plain Markdown format in your README file, providing clear installation instructions with no need for MkDocs-specific features. - ---- - # Powered by - ⚡ **PyTorch Lightning & PyTorch Geometric**: Enables fast, efficient graph neural network (GNN) implementation for heterogeneous graphs. 
From 9099dcc1f58077622106a8e3ca3a2fd3e9bc02b7 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 23 Sep 2024 09:43:27 +0200 Subject: [PATCH 035/156] fixed installation --- pyproject.toml | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dbafbf0..e7a95f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,9 +15,9 @@ requires-python = ">=3.10" keywords = ["segmentation", "deep learning", "pytorch", "geometric deep learning"] dependencies = [ - "torch>=2.0.0", # Moved to main dependencies - "dask-cuda>=23.10.0", # Moved to main dependencies - "distributed>=2023.10.0", # Moved to main dependencies + "torch>=2.0.0", + "dask-cuda>=23.10.0", + "distributed>=2023.10.0", "numpy>=1.21.0", "pandas>=1.3.0", "scipy>=1.7.0", @@ -35,7 +35,10 @@ dependencies = [ "shapely>=1.7.0", "path>=17.0.0", "pyarrow>=17.0.0", - "dask_geopandas>=0.4.0" + "dask_geopandas>=0.4.0", # Missing comma added + "torch-scatter>=2.1.2", + "torch-sparse>=0.6.18", + "torch-cluster>=1.6.3" ] [project.optional-dependencies] @@ -44,38 +47,24 @@ faiss = [ "faiss-gpu>=1.7.0" ] -cuda11 = [ - "torch-scatter>=2.1.2+pt20cu117", - "torch-sparse>=0.6.18+pt20cu117", - "torch-cluster>=1.6.3+pt20cu117" -] - -cuda12 = [ - "torch-scatter>=2.1.2+pt20cu120", - "torch-sparse>=0.6.18+pt20cu120", - "torch-cluster>=1.6.3+pt20cu120" -] - rapids11 = [ "cudf-cu11==24.8.*", "cuml-cu11==24.8.*", "cugraph-cu11==24.8.*", - "cuspatial-cu11==24.8.*" + "cuspatial-cu11==24.8.*", + "cupy-cuda11x", + "nccl=2.12.*", + "cutensor=1.3.*" ] rapids12 = [ "cudf-cu12==24.8.*", "cuml-cu12==24.8.*", "cugraph-cu12==24.8.*", - "cuspatial-cu12==24.8.*" -] - -cupy11 = [ - "cupy-cuda11x" -] - -cupy12 = [ - "cupy-cuda12x" + "cuspatial-cu12==24.8.*", + "cupy-cuda12x", + "nccl=2.13.*", + "cutensor=1.4.*" ] multiprocessing = ["multiprocessing"] From 6d58a3548c13d7151fd4bed6c7c6945223a3511e Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 23 Sep 2024 09:45:25 +0200 Subject: [PATCH 036/156] fixed installation --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e7a95f0..8a983b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,8 +53,8 @@ rapids11 = [ "cugraph-cu11==24.8.*", "cuspatial-cu11==24.8.*", "cupy-cuda11x", - "nccl=2.12.*", - "cutensor=1.3.*" + "nccl==2.12.*", + "cutensor==1.3.*" ] rapids12 = [ From cd89934207844d7e0103380277a3407114ecaf74 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 23 Sep 2024 09:45:35 +0200 Subject: [PATCH 037/156] fixed installation --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8a983b8..7d830ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,8 +63,8 @@ rapids12 = [ "cugraph-cu12==24.8.*", "cuspatial-cu12==24.8.*", "cupy-cuda12x", - "nccl=2.13.*", - "cutensor=1.4.*" + "nccl==2.13.*", + "cutensor==1.4.*" ] multiprocessing = ["multiprocessing"] From 1dd99552702b1b79ebbd4ded002820b585a53f0e Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 23 Sep 2024 09:49:20 +0200 Subject: [PATCH 038/156] fixed installation --- pyproject.toml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7d830ca..ae4e331 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,10 +35,7 @@ dependencies = [ "shapely>=1.7.0", "path>=17.0.0", "pyarrow>=17.0.0", - "dask_geopandas>=0.4.0", # Missing comma added - "torch-scatter>=2.1.2", - "torch-sparse>=0.6.18", - 
"torch-cluster>=1.6.3" + "dask_geopandas>=0.4.0" ] [project.optional-dependencies] From 89528f17ceb1032a0df5e370edb5083facb64a73 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 23 Sep 2024 10:07:47 +0200 Subject: [PATCH 039/156] fixed installation --- README.md | 72 ++++++++++++++++++--------------- environment.yml | 103 +++++++++++++++++++++++++++--------------------- 2 files changed, 99 insertions(+), 76 deletions(-) diff --git a/README.md b/README.md index f5aad6b..b30e71f 100644 --- a/README.md +++ b/README.md @@ -29,9 +29,9 @@ # Why segger? -- ⚙️ **Highly parallelizable** – Optimized for multi-GPU environments -- ⚡ **Fast and efficient** – Trains in a fraction of the time compared to alternatives -- 🔄 **Transfer learning** – Easily adaptable to new datasets and technologies +- **Highly parallelizable** – Optimized for multi-GPU environments +- **Fast and efficient** – Trains in a fraction of the time compared to alternatives +- **Transfer learning** – Easily adaptable to new datasets and technologies ### Challenges in Segmentation @@ -47,7 +47,32 @@ segger tackles these with a **graph-based approach**, achieving superior segment ## Installation Options -Choose the installation method that best suits your needs. + +### Important: PyTorch Geometric Dependencies + +Segger **highly depends** on PyTorch Geometric. One **must** install its dependencies (such as `torch-sparse` and `torch-scatter`) based on their system’s specifications, especially CUDA and PyTorch versions. + +Please follow the official [PyTorch Geometric Installation Guide](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html) to install the correct versions of `torch-sparse`, `torch-scatter`, and other relevant libraries. + +Here’s how to install them manually, e.g., for torch 2.0.0: + +#### For CUDA 11.x: + +```bash +pip install torch-scatter -f https://data.pyg.org/whl/torch-2.0.0+cu117.html +pip install torch-sparse -f https://data.pyg.org/whl/torch-2.0.0+cu117.html +``` + +#### For CUDA 12.x: + +```bash +pip install torch-scatter -f https://data.pyg.org/whl/torch-2.0.0+cu120.html +pip install torch-sparse -f https://data.pyg.org/whl/torch-2.0.0+cu120.html +``` + + +Afterwards choose the installation method that best suits your needs. + ### Micromamba Installation @@ -69,22 +94,7 @@ For a straightforward local installation from GitHub, clone the repository and i ```bash git clone https://github.com/EliHei2/segger_dev.git cd segger_dev -``` - -#### Pip Installation (CUDA 11) - -To install with CUDA 11 support: - -```bash -pip install -e ".[cuda11]" -``` - -#### Pip Installation (CUDA 12) - -To install with CUDA 12 support: - -```bash -pip install -e ".[cuda12]" +pip install -e "." ``` #### Pip Installation (RAPIDS and CUDA 11) @@ -92,7 +102,7 @@ pip install -e ".[cuda12]" For installations requiring RAPIDS and CUDA 11 support, run: ```bash -pip install -e ".[cuda11,rapids11,cupy11,faiss]" +pip install -e ".[rapids11]" ``` #### Pip Installation (RAPIDS and CUDA 12) @@ -100,19 +110,19 @@ pip install -e ".[cuda11,rapids11,cupy11,faiss]" For installations requiring RAPIDS and CUDA 12 support, run: ```bash -pip install -e ".[cuda12,rapids12,cupy12,faiss]" +pip install -e ".[rapids12]" ``` --- # Powered by -- ⚡ **PyTorch Lightning & PyTorch Geometric**: Enables fast, efficient graph neural network (GNN) implementation for heterogeneous graphs. -- ⚙️ **Dask**: Scalable parallel processing and distributed task scheduling, ideal for handling large transcriptomic datasets. 
-- 🗺️ **Shapely & Geopandas**: Utilized for spatial operations such as polygon creation, scaling, and spatial relationship computations. -- 🖥️ **RAPIDS**: Provides GPU-accelerated computation for tasks like k-nearest neighbors (KNN) graph construction. -- 📊 **AnnData & Scanpy**: Efficient processing for single-cell datasets. -- 📐 **SciPy**: Facilitates spatial graph construction, including distance metrics and convex hull calculations for transcript clustering. +- **PyTorch Lightning & PyTorch Geometric**: Enables fast, efficient graph neural network (GNN) implementation for heterogeneous graphs. +- **Dask**: Scalable parallel processing and distributed task scheduling, ideal for handling large transcriptomic datasets. +- **Shapely & Geopandas**: Utilized for spatial operations such as polygon creation, scaling, and spatial relationship computations. +- **RAPIDS**: Provides GPU-accelerated computation for tasks like k-nearest neighbors (KNN) graph construction. +- **AnnData & Scanpy**: Efficient processing for single-cell datasets. +- **SciPy**: Facilitates spatial graph construction, including distance metrics and convex hull calculations for transcript clustering. --- @@ -120,11 +130,11 @@ pip install -e ".[cuda12,rapids12,cupy12,faiss]" segger is **open-source** and welcomes contributions. Join us in advancing spatial omics segmentation! -- 🛠️ **Source Code** +- **Source Code** [GitHub](https://github.com/EliHei2/segger_dev) -- 🐞 **Bug Tracker** +- **Bug Tracker** [Report Issues](https://github.com/EliHei2/segger_dev/issues) -- 📚 **Full Documentation** +- **Full Documentation** [API Reference](https://elihei2.github.io/segger_dev/api/) diff --git a/environment.yml b/environment.yml index 2312e5a..3087056 100644 --- a/environment.yml +++ b/environment.yml @@ -1,62 +1,75 @@ -name: segger-env +name: segger channels: + - rapidsai - conda-forge + - nvidia - defaults dependencies: - - python=3.10 + # Python version + - python>=3.10 + + # Main dependencies from pyproject.toml - pytorch>=2.0.0 + - torchvision>=0.10.0 + - pytorch-lightning>=1.9.0 + - torchmetrics>=0.5.0 - numpy>=1.21.0 - pandas>=1.3.0 - scipy>=1.7.0 - matplotlib>=3.4.0 - seaborn>=0.11.0 - tqdm>=4.61.0 - - torchvision>=0.10.0 - - lightning>=1.9.0 - - torchmetrics>=0.5.0 - scanpy>=1.9.3 - - squidpy>=1.2.0 - - adjustText>=0.8 + - squidpy==1.2.0 + - adjusttext>=0.8 - scikit-learn>=0.24.0 - geopandas>=0.9.0 - shapely>=1.7.0 - path>=17.0.0 - pyarrow>=17.0.0 - - pip - - pip: - - torch-scatter>=2.1.2 - - torch-sparse>=0.6.18 - - torch-cluster>=1.6.3 - - torch-geometric>=2.2.0 - - optional: - - cudf>=21.08 - - cuml>=21.08 - - cugraph>=21.08 - - cuspatial>=21.08 - - faiss-cpu>=1.7.0 - - faiss-gpu>=1.7.0 - - optional: - - multiprocessing - - optional: - - pytest - - black - - flake8 - - pre-commit - - twine>=4.0.2 - - docutils>=0.8,!=0.18.*,!=0.19.* - - sphinx>=4.1 - - sphinx-book-theme>=1.0.0 - - myst-nb - - myst-parser - - sphinxcontrib-bibtex>=1.0.0 - - sphinx-autodoc-typehints - - sphinx_rtd_theme - - sphinxext-opengraph - - sphinx-copybutton - - sphinx-design - - sphinx-hoverxref - - ipykernel - - ipython - - pandas - - pytest - - coverage + - dask-geopandas>=0.4.0 + - dask-cuda>=23.10.0 + - distributed>=2023.10.0 + + # RAPIDS for CUDA 11 + - cudf-cu11==24.8.* + - cuml-cu11==24.8.* + - cugraph-cu11==24.8.* + - cuspatial-cu11==24.8.* + - cupy-cuda11x + - nccl==2.12.* + - cutensor==1.3.* + + # Optional dependencies + - faiss-cpu>=1.7.0 + - faiss-gpu>=1.7.0 + + # Development dependencies + - pytest + - black + - flake8 + - pre-commit + - 
twine>=4.0.2 + + # Documentation dependencies + - docutils>=0.8,!=0.18.*,!=0.19.* + - sphinx>=4.1 + - sphinx-book-theme>=1.0.0 + - myst-nb + - myst-parser + - sphinxcontrib-bibtex>=1.0.0 + - sphinx-autodoc-typehints + - sphinx_rtd_theme + - sphinxext-opengraph + - sphinx-copybutton + - sphinx-design + - sphinx-hoverxref + - ipykernel + - ipython + - pandas + + # Testing dependencies + - coverage + + # Multiprocessing support + - multiprocessing From 567f04b09c86eb32ff434d737456e64a04473b1c Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 23 Sep 2024 10:14:00 +0200 Subject: [PATCH 040/156] fixed installation --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ae4e331..05659f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,8 @@ dependencies = [ "shapely>=1.7.0", "path>=17.0.0", "pyarrow>=17.0.0", - "dask_geopandas>=0.4.0" + "dask_geopandas>=0.4.0", + "torch-geometric>=2.2.0" ] [project.optional-dependencies] From beb4e3ca498320e30eea640b72a7440bb0985a97 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 23 Sep 2024 10:18:33 +0200 Subject: [PATCH 041/156] fixed installation --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 05659f4..8fa3dc0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ dependencies = [ "geopandas>=0.9.0", "shapely>=1.7.0", "path>=17.0.0", - "pyarrow>=17.0.0", + "pyarrow>=14.0.1,<15.0.0", "dask_geopandas>=0.4.0", "torch-geometric>=2.2.0" ] From 0863be26e4e487e7209702e4084fe5749c35d854 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 23 Sep 2024 10:36:53 +0200 Subject: [PATCH 042/156] fixed installation --- pyproject.toml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8fa3dc0..cec1803 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,21 +46,19 @@ faiss = [ ] rapids11 = [ + "cupy-cuda11x", "cudf-cu11==24.8.*", - "cuml-cu11==24.8.*", "cugraph-cu11==24.8.*", - "cuspatial-cu11==24.8.*", - "cupy-cuda11x", + "cuvs-cu11==24.8.*", "nccl==2.12.*", "cutensor==1.3.*" ] rapids12 = [ + "cupy-cuda12x", "cudf-cu12==24.8.*", - "cuml-cu12==24.8.*", "cugraph-cu12==24.8.*", - "cuspatial-cu12==24.8.*", - "cupy-cuda12x", + "cuvs-cu12==24.8.*", "nccl==2.13.*", "cutensor==1.4.*" ] From a458290514e91606aa483f27648d8c7c69a35c95 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 23 Sep 2024 10:50:56 +0200 Subject: [PATCH 043/156] fixed installation --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cec1803..4c21e5a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,8 +50,8 @@ rapids11 = [ "cudf-cu11==24.8.*", "cugraph-cu11==24.8.*", "cuvs-cu11==24.8.*", - "nccl==2.12.*", - "cutensor==1.3.*" + "nvidia-nccl-cu11==2.18.*", + "cutensor-cu11" ] rapids12 = [ @@ -59,8 +59,8 @@ rapids12 = [ "cudf-cu12==24.8.*", "cugraph-cu12==24.8.*", "cuvs-cu12==24.8.*", - "nccl==2.13.*", - "cutensor==1.4.*" + "nvidia-nccl-cu12==2.18.*", + "cutensor-cu12" ] multiprocessing = ["multiprocessing"] From d9ae0e4476de98f1d08aa1b78bfcc9a3a2349046 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 23 Sep 2024 10:56:23 +0200 Subject: [PATCH 044/156] fixed installation --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 4c21e5a..8ce8859 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ faiss = [ ] rapids11 = [ + 
"pyarrow>=16.1.0,<16.2.0", "cupy-cuda11x", "cudf-cu11==24.8.*", "cugraph-cu11==24.8.*", @@ -55,6 +56,7 @@ rapids11 = [ ] rapids12 = [ + "pyarrow>=16.1.0,<16.2.0", "cupy-cuda12x", "cudf-cu12==24.8.*", "cugraph-cu12==24.8.*", From b309fd13ad149c1fe50f699bd9371895f815dd10 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 23 Sep 2024 10:58:56 +0200 Subject: [PATCH 045/156] fixed installation --- pyproject.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8ce8859..1ef0da0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ dependencies = [ "geopandas>=0.9.0", "shapely>=1.7.0", "path>=17.0.0", - "pyarrow>=14.0.1,<15.0.0", + "pyarrow>=16.1.0,<16.2.0", "dask_geopandas>=0.4.0", "torch-geometric>=2.2.0" ] @@ -46,7 +46,6 @@ faiss = [ ] rapids11 = [ - "pyarrow>=16.1.0,<16.2.0", "cupy-cuda11x", "cudf-cu11==24.8.*", "cugraph-cu11==24.8.*", @@ -56,7 +55,6 @@ rapids11 = [ ] rapids12 = [ - "pyarrow>=16.1.0,<16.2.0", "cupy-cuda12x", "cudf-cu12==24.8.*", "cugraph-cu12==24.8.*", From a47ee166ce28791f1fc6b56d97159368852f3aca Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 23 Sep 2024 11:27:53 +0200 Subject: [PATCH 046/156] fixed the example code for data creation --- src/segger/data/README.md | 74 +++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 38 deletions(-) diff --git a/src/segger/data/README.md b/src/segger/data/README.md index 16d0d48..df6e979 100644 --- a/src/segger/data/README.md +++ b/src/segger/data/README.md @@ -131,36 +131,33 @@ Below are examples demonstrating how to utilize the `segger` data preparation mo ### Xenium Data ```python -from segger.data import XeniumSample +from segger.data.io import XeniumSample from pathlib import Path +from segger.data.utils import calculate_gene_celltype_abundance_embedding import scanpy as sc +import os -# Set up the file paths -raw_data_dir = Path('data_raw/xenium/') -processed_data_dir = Path('data_tidy/pyg_datasets') -sample_tag = "Xenium_FFPE_Human_Breast_Cancer_Rep1" +xenium_data_dir = Path('./data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1') +segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_0919') +models_dir = Path('./models/bc_embedding_0919') + +scRNAseq_path = '/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad' -# Load scRNA-seq data using Scanpy and subsample for efficiency -scRNAseq_path = raw_data_dir / 'scRNAseq' / 'atlas_filtered.h5ad' scRNAseq = sc.read(scRNAseq_path) -sc.pp.subsample(scRNAseq, fraction=0.1) -# Calculate gene cell type abundance embedding from scRNA-seq data -from segger.utils import calculate_gene_celltype_abundance_embedding +sc.pp.subsample(scRNAseq, 0.1) + +# Step 1: Calculate the gene cell type abundance embedding celltype_column = 'celltype_minor' gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(scRNAseq, celltype_column) -# Create a XeniumSample instance for spatial transcriptomics processing -xenium_sample = XeniumSample() - -# Load transcripts and include the calculated cell type abundance embedding -xenium_sample.load_transcripts( - base_path=raw_data_dir, - sample=sample_tag, - transcripts_filename='transcripts.parquet', - file_format="parquet", - additional_embeddings={"cell_type_abundance": gene_celltype_abundance_embedding} +# Setup Xenium sample to create dataset +xs = XeniumSample(verbose=False , embedding_df=gene_celltype_abundance_embedding) +xs.set_file_paths( + transcripts_path=xenium_data_dir / 'transcripts.parquet', + 
boundaries_path=xenium_data_dir / 'nucleus_boundaries.parquet', ) +xs.set_metadata() # Set the embedding to "cell_type_abundance" to use it in further processing xenium_sample.set_embedding("cell_type_abundance") @@ -179,24 +176,25 @@ tile_pyg_data = xenium_sample.build_pyg_data_from_tile( workers=1 ) -# Save dataset in processed format for segmentation -xenium_sample.save_dataset_for_segger( - processed_dir=processed_data_dir / 'embedding', - x_size=360, - y_size=360, - d_x=180, - d_y=180, - margin_x=10, - margin_y=10, - compute_labels=False, - r_tx=5, - k_tx=5, - val_prob=0.1, - test_prob=0.2, - neg_sampling_ratio_approx=5, - sampling_rate=1, - num_workers=1 -) + +try: + xs.save_dataset_for_segger( + processed_dir=segger_data_dir, + x_size=400, + y_size=400, + d_x=350, + d_y=350, + margin_x=20, + margin_y=20, + compute_labels=True, # Set to True if you need to compute labels + r_tx=5, + k_tx=10, + val_prob=0.4, + test_prob=0.1, + num_workers=6 + ) +except AssertionError as err: + print(f'Dataset already exists at {segger_data_dir}') ``` ### Merscope Data From 0184f3d6c4edd9f6d9df8b14bb874cd7ed2cd2c8 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 24 Sep 2024 15:15:17 +0200 Subject: [PATCH 047/156] fixes #11: included addtional codewords for xenium --- docs/notebooks/segger_tutorial.ipynb | 30 +++++++--------------------- src/segger/data/utils.py | 1 + src/segger/prediction/predict.py | 2 +- 3 files changed, 9 insertions(+), 24 deletions(-) diff --git a/docs/notebooks/segger_tutorial.ipynb b/docs/notebooks/segger_tutorial.ipynb index 27d380c..d5cfc54 100644 --- a/docs/notebooks/segger_tutorial.ipynb +++ b/docs/notebooks/segger_tutorial.ipynb @@ -122,33 +122,15 @@ "- **`--processed_dir`**: Directory where the processed dataset will be saved.\n", "- **`--x_size`, `--y_size`**: These parameters specify the size of the tiles used to divide the image. The size of the tiles determines how the spatial region is partitioned for processing.\n", "- **`--d_x`, `--d_y`**: These define the step size of the spatial grid used to bin transcripts and nuclei into tiles.\n", - "- **`--r`**: Specifies the radius used for graph construction. A smaller radius will connect transcripts to nearby nuclei, while a larger radius might connect them to more distant neighbors.\n", - "- **`--k_bd`, `--k_tx`**: These parameters define the number of nearest neighbors considered when building graphs for nuclei (`k_bd`) and transcripts (`k_tx`).\n", + "- **`--r_tx`**: Specifies the radius used for graph construction. A smaller radius will connect transcripts to nearby nuclei, while a larger radius might connect them to more distant neighbors.\n", + "- **`--scale_boundaries`**: The factor by which to scale the boundary polygons. Suggested to keep `=1` when boundaries refer to nuclei.\n", + "- **`--k_tx`**: Defines the number of nearest neighbors considered when building graphs for transcripts (`k_tx`).\n", "- **`--val_prob` and `--test_prob`**: These control the proportion of the dataset that will be set aside for validation and testing. For instance, `--val_prob 0.1` means 10% of the data will be used for validation.\n", "- **`--compute_labels`**: When set to `True`, this flag triggers the computation of labels (cell assignments) for each transcript. 
Use False if you just plan to perform prediction using a pre-existing model.\n", "\n", "Once the dataset is processed, the output will be ready for training the Segger model.\n" ] }, - { - "cell_type": "code", - "execution_count": 6, - "id": "76dd37d9-12f7-4636-8410-6ef42c72ddc2", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-12T00:49:05.473679Z", - "iopub.status.busy": "2024-09-12T00:49:05.473467Z", - "iopub.status.idle": "2024-09-12T00:49:06.357221Z", - "shell.execute_reply": "2024-09-12T00:49:06.356835Z", - "shell.execute_reply.started": "2024-09-12T00:49:05.473663Z" - } - }, - "outputs": [], - "source": [ - "# It is used below in prediction as well\n", - "receptive_field = {'k_bd': 3, 'dist_bd': 20,'k_tx': 15, 'dist_tx': 3}" - ] - }, { "cell_type": "code", "execution_count": 7, @@ -178,13 +160,13 @@ " processed_dir=segger_data_dir,\n", " r_tx=5,\n", " k_tx=15,\n", - " receptive_field=receptive_field,\n", " x_size=120,\n", " y_size=120,\n", " d_x=100,\n", " d_y=100,\n", " margin_x=10,\n", " margin_y=10,\n", + " scale_boundaries=1,\n", " num_workers=4, # change to you number of CPUs\n", " )\n", "except AssertionError as err:\n", @@ -286,6 +268,8 @@ " num_workers=2, \n", ")\n", "\n", + "dm.setup()\n", + "\n", "# Initialize the Lightning trainer\n", "trainer = Trainer(\n", " accelerator='cuda', \n", @@ -416,7 +400,7 @@ "model = load_model(model_path / 'checkpoints')\n", "dm.setup()\n", "\n", - "receptive_field = {'k_bd': 3, 'dist_bd': 20,'k_tx': 30, 'dist_tx': 5}\n", + "receptive_field = {'k_bd': 4, 'dist_bd': 12,'k_tx': 15, 'dist_tx': 3}\n", "\n", "# Perform segmentation (predictions)\n", "segmentation = predict(\n", diff --git a/src/segger/data/utils.py b/src/segger/data/utils.py index 37195bf..4573b7b 100644 --- a/src/segger/data/utils.py +++ b/src/segger/data/utils.py @@ -60,6 +60,7 @@ def filter_transcripts( "NegControlCodeword_", "BLANK_", "DeprecatedCodeword_", + "UnassignedCodeword_" ) mask = transcripts_df["qv"].ge(min_qv) mask &= ~transcripts_df["feature_name"].str.startswith(filter_codewords) diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index f022aba..9a9e825 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -268,4 +268,4 @@ def predict( idx = assignments.groupby('transcript_id')['score'].idxmax() assignments = assignments.loc[idx].reset_index(drop=True) - return assignments + return assignments \ No newline at end of file From bf9ce8e794a9ca8858eda1a9331402289fbacebf Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 24 Sep 2024 15:40:44 +0200 Subject: [PATCH 048/156] added lines to the tutorial --- docs/notebooks/segger_tutorial.ipynb | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/notebooks/segger_tutorial.ipynb b/docs/notebooks/segger_tutorial.ipynb index d5cfc54..6982d34 100644 --- a/docs/notebooks/segger_tutorial.ipynb +++ b/docs/notebooks/segger_tutorial.ipynb @@ -213,7 +213,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "4db89cb4-d0eb-426a-a71f-d127926fa412", "metadata": { "execution": { @@ -270,6 +270,11 @@ "\n", "dm.setup()\n", "\n", + "\n", + "# if you wish to use more than 1 device for training you should run this:\n", + "batch = dm.train[0]\n", + "ls.forward(batch)\n", + "\n", "# Initialize the Lightning trainer\n", "trainer = Trainer(\n", " accelerator='cuda', \n", From 21cc742aa192b54f16339cddbe7a2373571d5cbe Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 24 Sep 2024 15:48:43 +0200 Subject: [PATCH 
049/156] fixes #11: included addtional codewords for xenium --- src/segger/data/io.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/segger/data/io.py b/src/segger/data/io.py index 4cbc3f2..3d45afd 100644 --- a/src/segger/data/io.py +++ b/src/segger/data/io.py @@ -299,6 +299,7 @@ def set_metadata(self) -> None: "NegControlCodeword_", "BLANK_", "DeprecatedCodeword_", + "UnassignedCodeword_" ) # Iterate over row groups to extract statistics and unique gene names @@ -975,6 +976,7 @@ def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0) "NegControlCodeword_", "BLANK_", "DeprecatedCodeword_", + "UnassignedCodeword_" ) # Ensure FEATURE_NAME is a string type for proper filtering (compatible with Dask) From 0c1b49e0063e12bfeb73dd25cfead96009d14be2 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Wed, 25 Sep 2024 10:53:30 +0200 Subject: [PATCH 050/156] some random bug fixes --- README.md | 2 +- scripts/predict_model_sample.py | 105 ++++++++++++++++++++++++++++++++ scripts/train_model_sample.py | 58 ++++++++++++++++++ 3 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 scripts/predict_model_sample.py create mode 100644 scripts/train_model_sample.py diff --git a/README.md b/README.md index b30e71f..a079a15 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# 🍳 Welcome to segger +# 🍳 Welcome to segger! **segger** is a cutting-edge tool for **cell segmentation** in **single-molecule spatial omics** datasets. By leveraging **graph neural networks (GNNs)** and heterogeneous graphs, segger offers unmatched accuracy and scalability. diff --git a/scripts/predict_model_sample.py b/scripts/predict_model_sample.py new file mode 100644 index 0000000..be2380f --- /dev/null +++ b/scripts/predict_model_sample.py @@ -0,0 +1,105 @@ +from segger.data.io import XeniumSample +from segger.training.train import LitSegger +from segger.training.segger_data_module import SeggerDataModule +from segger.prediction.predict import predict, load_model +from lightning.pytorch.loggers import CSVLogger +from pytorch_lightning import Trainer +from pathlib import Path +from lightning.pytorch.plugins.environments import LightningEnvironment +from matplotlib import pyplot as plt +import seaborn as sns +# import pandas as pd +from segger.data.utils import calculate_gene_celltype_abundance_embedding +import scanpy as sc +import os +import dask.dataframe as dd +import pandas as pd + + +segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_0919') +models_dir = Path('./models/bc_embedding_0919') +benchmarks_path = Path('/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc') + +# Initialize the Lightning data module +dm = SeggerDataModule( + data_dir=segger_data_dir, + batch_size=1, + num_workers=2, +) + +dm.setup() + + +model_version = 2 + +# Load in latest checkpoint +model_path = models_dir / 'lightning_logs' / f'version_{model_version}' +model = load_model(model_path / 'checkpoints') +dm.setup() + +receptive_field = {'k_bd': 4, 'dist_bd': 10,'k_tx': 5, 'dist_tx': 3} + +# Perform segmentation (predictions) +segmentation_train = predict( + model, + dm.train_dataloader(), + score_cut=0.5, + receptive_field=receptive_field, + use_cc=True, + # device='cuda', + # num_workers=4 +) + +segmentation_val = predict( + model, + dm.val_dataloader(), + score_cut=0.5, + receptive_field=receptive_field, + use_cc=True, + # use_cc=False, + # device='cpu' +) + +segmentation_test = predict( + model, + dm.test_dataloader(), + score_cut=0.5, + 
receptive_field=receptive_field, + use_cc=True, + # use_cc=False, + # device='cpu' +) + + + +seg_combined = pd.concat([segmentation_train, segmentation_val, segmentation_test]) +# Group by transcript_id and keep the row with the highest score for each transcript +seg_combined = pd.concat([segmentation_train, segmentation_val, segmentation_test]).reset_index() + +# Group by transcript_id and keep the row with the highest score for each transcript +seg_final = seg_combined.loc[seg_combined.groupby('transcript_id')['score'].idxmax()] + +# Drop rows where segger_cell_id is NaN +seg_final = seg_final.dropna(subset=['segger_cell_id']) + +# Reset the index if needed +seg_final.reset_index(drop=True, inplace=True) + +transcripts_df = dd.read_parquet('data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1/transcripts.parquet') + +# # Assuming seg_final is already computed with pandas +# # Convert seg_final to a Dask DataFrame to enable efficient merging with Dask +seg_final_dd = dd.from_pandas(seg_final, npartitions=transcripts_df.npartitions) + +# # Step 1: Merge segmentation with the transcripts on transcript_id +# # Use 'inner' join to keep only matching transcript_ids +transcripts_df_filtered = transcripts_df.merge(seg_final_dd, on='transcript_id', how='inner') + +# Compute the result if needed +transcripts_df_filtered = transcripts_df_filtered.compute() + + +from segger.data.utils import create_anndata +segger_adata = create_anndata(transcripts_df_filtered, cell_id_col='segger_cell_id') +segger_adata.write(benchmarks_path / 'adata_segger_embedding_full.h5ad') + diff --git a/scripts/train_model_sample.py b/scripts/train_model_sample.py new file mode 100644 index 0000000..2ee3a55 --- /dev/null +++ b/scripts/train_model_sample.py @@ -0,0 +1,58 @@ +from segger.data.io import XeniumSample +from segger.training.train import LitSegger +from segger.training.segger_data_module import SeggerDataModule +from segger.prediction.predict import predict, load_model +from lightning.pytorch.loggers import CSVLogger +from pytorch_lightning import Trainer +from pathlib import Path +from lightning.pytorch.plugins.environments import LightningEnvironment +from matplotlib import pyplot as plt +import seaborn as sns +# import pandas as pd +from segger.data.utils import calculate_gene_celltype_abundance_embedding +import scanpy as sc +import os + + +segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_0919') +models_dir = Path('./models/bc_embedding_0919') + +dm = SeggerDataModule( + data_dir=segger_data_dir, + batch_size=1, + num_workers=1, +) + +dm.setup() + +metadata = (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]) +ls = LitSegger( + num_tx_tokens=500, + init_emb=8, + hidden_channels=32, + out_channels=8, + heads=2, + num_mid_layers=2, + aggr='sum', + metadata=metadata, +) + +# Initialize the Lightning trainer +trainer = Trainer( + accelerator='cuda', + strategy='auto', + precision='16-mixed', + devices=4, + max_epochs=200, + default_root_dir=models_dir, + logger=CSVLogger(models_dir), +) + +batch = dm.train[0] +ls.forward(batch) + + +trainer.fit( + model=ls, + datamodule=dm +) \ No newline at end of file From bb4e15158907a7bc1056b10a7f840f31ce9e2ba1 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Wed, 25 Sep 2024 11:31:59 +0200 Subject: [PATCH 051/156] some random bug fixes --- scripts/predict_model_sample.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/predict_model_sample.py b/scripts/predict_model_sample.py index be2380f..074491d 100644 --- 
a/scripts/predict_model_sample.py +++ b/scripts/predict_model_sample.py @@ -1,7 +1,7 @@ from segger.data.io import XeniumSample from segger.training.train import LitSegger from segger.training.segger_data_module import SeggerDataModule -from segger.prediction.predict import predict, load_model +from segger.prediction.predict_gpu import predict, load_model from lightning.pytorch.loggers import CSVLogger from pytorch_lightning import Trainer from pathlib import Path @@ -14,6 +14,7 @@ import os import dask.dataframe as dd import pandas as pd +from pathlib import Path segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_0919') From e3ffffc11b35d67773226db250b99b2de5a69539 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Wed, 25 Sep 2024 17:50:59 +0200 Subject: [PATCH 052/156] cleaned up get_edge_index functions and add cuvs&cupy verison --- src/segger/data/utils.py | 199 ++++++++++----------------------------- 1 file changed, 49 insertions(+), 150 deletions(-) diff --git a/src/segger/data/utils.py b/src/segger/data/utils.py index 4573b7b..8fb7954 100644 --- a/src/segger/data/utils.py +++ b/src/segger/data/utils.py @@ -31,11 +31,14 @@ def try_import(module_name): try_import('multiprocessing') try_import('joblib') try_import('faiss') -try_import('cuml') -try_import('cudf') -try_import('cugraph') -try_import('cuspatial') -try_import('hnswlib') +try_import('cuvs') +try: + import cupy as cp + from cuvs.neighbors import cagra +except ImportError: + print(f"Warning: cupy and/or cuvs are not installed. Please install them to use this functionality.") + +import torch.utils.dlpack as dlpack @@ -260,14 +263,14 @@ def calculate_gene_celltype_abundance_embedding(adata: ad.AnnData, celltype_colu def get_edge_index(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: int = 10, method: str = 'kd_tree', gpu: bool = False, workers: int = 1) -> torch.Tensor: """ - Computes edge indices using various methods (KD-Tree, FAISS, RAPIDS cuML, cuGraph, or cuSpatial). + Computes edge indices using various methods (KD-Tree, FAISS, RAPIDS::cuvs+cupy (cuda)). Parameters: coords_1 (np.ndarray): First set of coordinates. coords_2 (np.ndarray): Second set of coordinates. k (int, optional): Number of nearest neighbors. dist (int, optional): Distance threshold. - method (str, optional): The method to use ('kd_tree', 'faiss', 'rapids', 'cugraph', 'cuspatial'). + method (str, optional): The method to use ('kd_tree', 'faiss', 'cuda'). gpu (bool, optional): Whether to use GPU acceleration (applicable for FAISS). 
Returns: @@ -277,14 +280,8 @@ def get_edge_index(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: return get_edge_index_kdtree(coords_1, coords_2, k=k, dist=dist, workers=workers) elif method == 'faiss': return get_edge_index_faiss(coords_1, coords_2, k=k, dist=dist, gpu=gpu) - elif method == 'rapids': - return get_edge_index_rapids(coords_1, coords_2, k=k, dist=dist) - elif method == 'cugraph': - return get_edge_index_cugraph(coords_1, coords_2, k=k, dist=dist) - elif method == 'cuspatial': - return get_edge_index_cuspatial(coords_1, coords_2, k=k, dist=dist) - elif method == 'hnsw': - return get_edge_index_hnsw(coords_1, coords_2, k=k, dist=dist) + elif method == 'cuda': + return get_edge_index_cuda(coords_1, coords_2, k=k, dist=dist) else: raise ValueError(f"Unknown method {method}") @@ -359,150 +356,52 @@ def get_edge_index_faiss(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, return edge_index -def get_edge_index_rapids(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: int = 10) -> torch.Tensor: - """ - Computes edge indices using RAPIDS cuML. - - Parameters: - coords_1 (np.ndarray): First set of coordinates. - coords_2 (np.ndarray): Second set of coordinates. - k (int, optional): Number of nearest neighbors. - dist (int, optional): Distance threshold. - - Returns: - torch.Tensor: Edge indices. - """ - index = cuml.neighbors.NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean') - index.fit(coords_1) - D, I = index.kneighbors(coords_2) - - valid_mask = D < dist ** 2 - edges = [] - - for idx, valid in enumerate(valid_mask): - valid_indices = I[idx][valid] - if valid_indices.size > 0: - edges.append( - np.vstack((np.full(valid_indices.shape, idx), valid_indices)).T - ) - - edge_index = torch.tensor(np.vstack(edges), dtype=torch.long).contiguous() - return edge_index - -def get_edge_index_cugraph( - coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: int = 10 +def get_edge_index_cuda( + coords_1: torch.Tensor, + coords_2: torch.Tensor, + k: int = 10, + dist: float = 10.0 ) -> torch.Tensor: """ - Computes edge indices using RAPIDS cuGraph. + Computes edge indices using RAPIDS cuVS with cagra for vector similarity search, + with input coordinates as PyTorch tensors on CUDA, using DLPack for conversion. Parameters: - coords_1 (np.ndarray): First set of coordinates. - coords_2 (np.ndarray): Second set of coordinates. + coords_1 (torch.Tensor): First set of coordinates (query vectors) on CUDA. + coords_2 (torch.Tensor): Second set of coordinates (index vectors) on CUDA. k (int, optional): Number of nearest neighbors. - dist (int, optional): Distance threshold. + dist (float, optional): Distance threshold. Returns: - torch.Tensor: Edge indices. - """ - gdf_1 = cudf.DataFrame({'x': coords_1[:, 0], 'y': coords_1[:, 1]}) - gdf_2 = cudf.DataFrame({'x': coords_2[:, 0], 'y': coords_2[:, 1]}) - - gdf_1['id'] = gdf_1.index - gdf_2['id'] = gdf_2.index - - result = cugraph.spatial_knn( - gdf_1, gdf_2, k=k, return_distance=True - ) - - valid_mask = result['distance'] < dist - edges = result[['src', 'dst']].loc[valid_mask].to_pandas().values - edge_index = torch.tensor(edges.T, dtype=torch.long).contiguous() - return edge_index - - -def get_edge_index_cuspatial(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: int = 10) -> torch.Tensor: + torch.Tensor: Edge indices as a PyTorch tensor on CUDA. """ - Computes edge indices using cuSpatial's spatial join functionality. 
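As an aside on the `get_edge_index` dispatcher kept above: a toy call using the default KD-tree backend might look as follows. The coordinates, `k`, and `dist` values are made up for illustration, mirroring how `predict_batch` pairs boundaries with transcripts:

```python
# Hypothetical usage sketch of get_edge_index with the kd_tree backend.
import numpy as np
from segger.data.utils import get_edge_index

nuclei_xy = np.random.rand(100, 2) * 50    # stand-in nucleus centroids
tx_xy = np.random.rand(1000, 2) * 50       # stand-in transcript positions

edges = get_edge_index(nuclei_xy, tx_xy, k=5, dist=10, method='kd_tree')
print(edges.shape)  # (num_edges, 2) index pairs, as in the FAISS variant above
```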
- - Parameters: - coords_1 (np.ndarray): First set of coordinates (2D). - coords_2 (np.ndarray): Second set of coordinates (2D). - k (int, optional): Number of nearest neighbors. - dist (int, optional): Distance threshold. - - Returns: - torch.Tensor: Edge indices. - """ - # Convert numpy arrays to cuDF DataFrames - coords_1_df = cudf.DataFrame({'x': coords_1[:, 0], 'y': coords_1[:, 1]}) - coords_2_df = cudf.DataFrame({'x': coords_2[:, 0], 'y': coords_2[:, 1]}) - - # Perform the nearest neighbor search using cuSpatial's point-to-point nearest neighbor - result = cuspatial.point_to_nearest_neighbor( - coords_1_df['x'], coords_1_df['y'], - coords_2_df['x'], coords_2_df['y'], - k=k - ) - - # The result is a tuple (distances, indices) - distances, indices = result - - # Filter by distance threshold - valid_mask = distances < dist - edges = [] - - for idx, valid in enumerate(valid_mask): - valid_indices = indices[idx][valid] - if valid_indices.size > 0: - edges.append( - np.vstack((np.full(valid_indices.shape, idx), valid_indices)).T - ) + def cupy_to_torch(cupy_array): + return torch.from_dlpack((cupy_array.toDlpack())) - # Convert to torch.Tensor - edge_index = torch.tensor(np.vstack(edges), dtype=torch.long).contiguous() - return edge_index - - - - -def get_edge_index_hnsw(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: int = 10) -> torch.Tensor: - """ - Computes edge indices using the HNSW algorithm. - - Parameters: - coords_1 (np.ndarray): First set of coordinates. - coords_2 (np.ndarray): Second set of coordinates. - k (int, optional): Number of nearest neighbors. - dist (int, optional): Distance threshold. - - Returns: - torch.Tensor: Edge indices. - """ - num_elements = coords_1.shape[0] - dim = coords_1.shape[1] - - # Initialize the HNSW index - p = hnswlib.Index(space='l2', dim=dim) # l2 for Euclidean distance - p.init_index(max_elements=num_elements, ef_construction=200, M=16) - - # Add points to the index - p.add_items(coords_1) - - # Query the index for nearest neighbors - indices, distances = p.knn_query(coords_2, k=k) - - # Filter by distance threshold - valid_mask = distances < dist ** 2 - edges = [] - - for idx, valid in enumerate(valid_mask): - valid_indices = indices[idx][valid] - if valid_indices.size > 0: - edges.append( - np.vstack((np.full(valid_indices.shape, idx), valid_indices)).T - ) - - edge_index = torch.tensor(np.vstack(edges), dtype=torch.long).contiguous() + def torch_to_cupy(tensor): + return cp.fromDlpack(dlpack.to_dlpack(tensor)) + # Convert PyTorch tensors (CUDA) to CuPy arrays using DLPack + cp_coords_1 = torch_to_cupy(coords_1) + cp_coords_2 = torch_to_cupy(coords_2) + # Define the distance threshold in CuPy + cp_dist = cp.float32(dist) + # IndexParams and SearchParams for cagra + index_params = cagra.IndexParams() + search_params = cagra.SearchParams() + # Build index using CuPy coords + index = cagra.build_index(index_params, cp_coords_1) + # Perform search to get distances and indices (still in CuPy) + D, I = cagra.search(search_params, index, cp_coords_2, k) + # Boolean mask for filtering distances below the squared threshold (all in CuPy) + valid_mask = cp.asarray(D < cp_dist ** 2) + # Vectorized operations for row and valid indices (all in CuPy) + repeats = valid_mask.sum(axis=1).tolist() + row_indices = cp.repeat(cp.arange(len(cp_coords_2)), repeats) + valid_indices = cp.asarray(I)[cp.where(valid_mask)] + # Stack row indices with valid indices to form edges + edges = cp.vstack((row_indices, valid_indices)).T + # Convert the result back 
to a PyTorch tensor using DLPack + edge_index = cupy_to_torch(edges).long().contiguous() return edge_index class SpatialTranscriptomicsDataset(InMemoryDataset): From 441434a8bd11f3ad0b747eecd012ad35955aa3e9 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Wed, 25 Sep 2024 21:47:13 +0200 Subject: [PATCH 053/156] fixes #16: added automated merging and saving of segmentations --- src/segger/prediction/predict.py | 102 ++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 1 deletion(-) diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index 9a9e825..fa95499 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -17,6 +17,7 @@ from segger.data.io import XeniumSample from segger.models.segger_model import Segger from segger.training.train import LitSegger +from segger.training.segger_data_module import SeggerDataModule from lightning import LightningModule from torch_geometric.nn import to_hetero import random @@ -28,6 +29,13 @@ import typing import re from tqdm import tqdm +from segger.data.utils import create_anndata +import dask.dataframe as dd +import dask +import pandas as pd +from dask import delayed +from typing import Union, Optional +import anndata as ad # CONFIG @@ -268,4 +276,96 @@ def predict( idx = assignments.groupby('transcript_id')['score'].idxmax() assignments = assignments.loc[idx].reset_index(drop=True) - return assignments \ No newline at end of file + return assignments + + +def segment( + model: LitSegger, + dm: SeggerDataModule, + save_dir: Union[str, Path], + seg_tag: str, + transcript_file: Union[str, Path], + file_format: str = 'anndata', + receptive_field: dict = {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}, + **anndata_kwargs +) -> None: + """ + Perform segmentation using the model, merge segmentation results with transcripts_df, and save in the specified format. + + Parameters: + ---------- + model : LitSegger + The trained segmentation model. + dm : SeggerDataModule + The SeggerDataModule instance for data loading. + save_dir : Union[str, Path] + Directory to save the final segmentation results. + seg_tag : str + Tag to include in the saved filename. + transcript_file : Union[str, Path] + Path to the transcripts parquet file. + file_format : str, optional + File format to save the results ('csv', 'parquet', or 'anndata'). Defaults to 'anndata'. 
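A side note on the CUDA path added in the previous patch: `get_edge_index_cuda` moves data between PyTorch and CuPy via DLPack without copying. A minimal round-trip sketch using the same calls (assumes CuPy and a CUDA build of PyTorch are available):

```python
# Zero-copy exchange between torch and CuPy, as used by get_edge_index_cuda.
import torch
import cupy as cp
import torch.utils.dlpack as dlpack

t = torch.rand(4, 2, device='cuda')
c = cp.fromDlpack(dlpack.to_dlpack(t))    # view the tensor as a CuPy array
t2 = torch.from_dlpack(c.toDlpack())      # and back to a torch tensor
```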
+ **anndata_kwargs : dict, optional + Additional keyword arguments passed to the `create_anndata` function, such as: + - panel_df: pd.DataFrame + - min_transcripts: int + - cell_id_col: str + - qv_threshold: float + - min_cell_area: float + - max_cell_area: float + + Returns: + ------- + None + """ + # Ensure the save directory exists + save_dir = Path(save_dir) + save_dir.mkdir(parents=True, exist_ok=True) + + # Define delayed prediction steps for parallel execution + delayed_train = delayed(predict)(model, dm.train_dataloader(), score_cut=0.5, receptive_field=receptive_field, use_cc=True) + delayed_val = delayed(predict)(model, dm.val_dataloader(), score_cut=0.5, receptive_field=receptive_field, use_cc=True) + delayed_test = delayed(predict)(model, dm.test_dataloader(), score_cut=0.5, receptive_field=receptive_field, use_cc=True) + + # Trigger parallel execution and get results + segmentation_train, segmentation_val, segmentation_test = dask.compute(delayed_train, delayed_val, delayed_test) + + # Combine the segmentation results + seg_combined = pd.concat([segmentation_train, segmentation_val, segmentation_test]).reset_index() + + # Group by transcript_id and keep the row with the highest score + seg_final = seg_combined.loc[seg_combined.groupby('transcript_id')['score'].idxmax()] + + # Drop rows where segger_cell_id is NaN + seg_final = seg_final.dropna(subset=['segger_cell_id']) + + # Reset the index + seg_final.reset_index(drop=True, inplace=True) + + # Load the transcript data + transcripts_df = dd.read_parquet(transcript_file) + + # Convert seg_final to a Dask DataFrame and merge with transcripts + seg_final_dd = dd.from_pandas(seg_final, npartitions=transcripts_df.npartitions) + transcripts_df_filtered = transcripts_df.merge(seg_final_dd, on='transcript_id', how='inner') + + # Compute the final result + transcripts_df_filtered = transcripts_df_filtered.compute() + + # Save the merged result based on the file format + if file_format == 'csv': + save_path = save_dir / f'{seg_tag}_segmentation.csv' + transcripts_df_filtered.to_csv(save_path, index=False) + elif file_format == 'parquet': + save_path = save_dir / f'{seg_tag}_segmentation.parquet' + transcripts_df_filtered.to_parquet(save_path, index=False) + elif file_format == 'anndata': + # Create an AnnData object and save as h5ad, passing additional arguments from kwargs + save_path = save_dir / f'{seg_tag}_segmentation.h5ad' + segger_adata = create_anndata(transcripts_df_filtered, **anndata_kwargs) + segger_adata.write(save_path) + else: + raise ValueError(f"Unsupported file format: {file_format}") + + print(f"Segmentation results saved at {save_path}") \ No newline at end of file From aa200a366785c8db610113ebda57ec42c14e8ed8 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Thu, 26 Sep 2024 10:17:07 +0200 Subject: [PATCH 054/156] added some simple changes regarding prediciton --- scripts/predict_model_sample.py | 80 ++++++--------------------------- src/segger/data/utils.py | 21 +++++++++ 2 files changed, 35 insertions(+), 66 deletions(-) diff --git a/scripts/predict_model_sample.py b/scripts/predict_model_sample.py index 074491d..d51d8e9 100644 --- a/scripts/predict_model_sample.py +++ b/scripts/predict_model_sample.py @@ -1,7 +1,7 @@ from segger.data.io import XeniumSample from segger.training.train import LitSegger from segger.training.segger_data_module import SeggerDataModule -from segger.prediction.predict_gpu import predict, load_model +from segger.prediction.predict import segment, load_model from lightning.pytorch.loggers 
import CSVLogger from pytorch_lightning import Trainer from pathlib import Path @@ -19,8 +19,8 @@ segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_0919') models_dir = Path('./models/bc_embedding_0919') -benchmarks_path = Path('/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc') - +benchmarks_dir = Path('/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc') +transcripts_file = 'data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1/transcripts.parquet' # Initialize the Lightning data module dm = SeggerDataModule( data_dir=segger_data_dir, @@ -40,67 +40,15 @@ receptive_field = {'k_bd': 4, 'dist_bd': 10,'k_tx': 5, 'dist_tx': 3} -# Perform segmentation (predictions) -segmentation_train = predict( - model, - dm.train_dataloader(), - score_cut=0.5, - receptive_field=receptive_field, - use_cc=True, - # device='cuda', - # num_workers=4 -) - -segmentation_val = predict( +segment( model, - dm.val_dataloader(), - score_cut=0.5, - receptive_field=receptive_field, - use_cc=True, - # use_cc=False, - # device='cpu' -) - -segmentation_test = predict( - model, - dm.test_dataloader(), - score_cut=0.5, - receptive_field=receptive_field, - use_cc=True, - # use_cc=False, - # device='cpu' -) - - - -seg_combined = pd.concat([segmentation_train, segmentation_val, segmentation_test]) -# Group by transcript_id and keep the row with the highest score for each transcript -seg_combined = pd.concat([segmentation_train, segmentation_val, segmentation_test]).reset_index() - -# Group by transcript_id and keep the row with the highest score for each transcript -seg_final = seg_combined.loc[seg_combined.groupby('transcript_id')['score'].idxmax()] - -# Drop rows where segger_cell_id is NaN -seg_final = seg_final.dropna(subset=['segger_cell_id']) - -# Reset the index if needed -seg_final.reset_index(drop=True, inplace=True) - -transcripts_df = dd.read_parquet('data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1/transcripts.parquet') - -# # Assuming seg_final is already computed with pandas -# # Convert seg_final to a Dask DataFrame to enable efficient merging with Dask -seg_final_dd = dd.from_pandas(seg_final, npartitions=transcripts_df.npartitions) - -# # Step 1: Merge segmentation with the transcripts on transcript_id -# # Use 'inner' join to keep only matching transcript_ids -transcripts_df_filtered = transcripts_df.merge(seg_final_dd, on='transcript_id', how='inner') - -# Compute the result if needed -transcripts_df_filtered = transcripts_df_filtered.compute() - - -from segger.data.utils import create_anndata -segger_adata = create_anndata(transcripts_df_filtered, cell_id_col='segger_cell_id') -segger_adata.write(benchmarks_path / 'adata_segger_embedding_full.h5ad') - + dm, + save_dir=benchmarks_dir, + seg_tag='test_segger_segment', + transcript_file=transcripts_file, + file_format='anndata', + receptive_field = receptive_field, + min_transcripts=10, + max_transcripts=1000 + cell_id_col='segger_cell_id' +) \ No newline at end of file diff --git a/src/segger/data/utils.py b/src/segger/data/utils.py index 8fb7954..8b18886 100644 --- a/src/segger/data/utils.py +++ b/src/segger/data/utils.py @@ -39,6 +39,7 @@ def try_import(module_name): print(f"Warning: cupy and/or cuvs are not installed. 
Please install them to use this functionality.") import torch.utils.dlpack as dlpack +from datetime import timedelta @@ -542,3 +543,23 @@ def coo_to_dense_adj( nbr_idx[i, :len(nbrs)] = nbrs return nbr_idx + + + + + +def format_time(elapsed: float) -> str: + """ + Format elapsed time to h:m:s. + + Parameters: + ---------- + elapsed : float + Elapsed time in seconds. + + Returns: + ------- + str + Formatted time in h:m:s. + """ + return str(timedelta(seconds=int(elapsed))) \ No newline at end of file From 963604d71a54ecf74f3b1e67cab5917e4cd0e0b4 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Thu, 26 Sep 2024 10:23:20 +0200 Subject: [PATCH 055/156] dasked the prediciton --- src/segger/prediction/predict.py | 247 ++++++++++++++++++------------- 1 file changed, 142 insertions(+), 105 deletions(-) diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index fa95499..fd5d81a 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -6,36 +6,32 @@ import torch._dynamo from torch_geometric.loader import DataLoader from torch_geometric.data import Batch -from torchmetrics import F1Score from scipy.sparse.csgraph import connected_components as cc - from segger.data.utils import ( - SpatialTranscriptomicsDataset, get_edge_index, coo_to_dense_adj, + create_anndata, + format_time ) from segger.data.io import XeniumSample from segger.models.segger_model import Segger from segger.training.train import LitSegger from segger.training.segger_data_module import SeggerDataModule -from lightning import LightningModule -from torch_geometric.nn import to_hetero import random import string -import os -import yaml from pathlib import Path import glob -import typing +from typing import Union import re from tqdm import tqdm -from segger.data.utils import create_anndata import dask.dataframe as dd import dask -import pandas as pd from dask import delayed -from typing import Union, Optional +from dask.array import from_array +from dask.diagnostics import ProgressBar +from pqdm.threads import pqdm import anndata as ad +import time # CONFIG @@ -147,84 +143,100 @@ def get_similarity_scores( def predict_batch( lit_segger: LitSegger, - batch: Batch, + batch: object, score_cut: float, receptive_field: dict, use_cc: bool = True, -) -> pd.DataFrame: + knn_method: str = 'cuda' +) -> dd.DataFrame: """ - Predict cell assignments for a batch of transcript data using a - segmentation model. + Predict cell assignments for a batch of transcript data using a segmentation model. Parameters ---------- lit_segger : LitSegger The lightning module wrapping the segmentation model. - batch : Batch + batch : object A batch of transcript and cell data. score_cut : float - The threshold for assigning transcripts to cells based on similarity - scores. + The threshold for assigning transcripts to cells based on similarity scores. + receptive_field : dict + Dictionary defining the receptive field for transcript-cell and transcript-transcript relations. + use_cc : bool, optional + If True, perform connected components analysis for unassigned transcripts. Returns ------- pd.DataFrame - A DataFrame containing the transcript IDs, similarity scores, and - assigned cell IDs. + A DataFrame containing the transcript IDs, similarity scores, and assigned cell IDs. 
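On the `pqdm` helper imported above: it maps a function over an iterable in parallel while showing a progress bar. A minimal sketch, independent of the segger code, assuming the documented `pqdm(iterable, fn, n_jobs=...)` signature:

```python
# Thread-based parallel map with a progress bar.
from pqdm.threads import pqdm

results = pqdm(range(8), lambda x: x * x, n_jobs=4)
print(results)  # squares of 0..7
```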
""" # Get random Xenium-style ID def _get_id(): id_chars = random.choices(string.ascii_lowercase, k=8) return ''.join(id_chars) + '-nx' - - with torch.no_grad(): + with torch.no_grad(): batch = batch.to("cuda") # Assignments of cells to nuclei - assignments = pd.DataFrame() - assignments['transcript_id'] = batch['tx'].id.cpu().numpy() + transcript_id = delayed(batch['tx'].id.cpu().numpy)() + assignments = dd.from_array(transcript_id, columns=['transcript_id']) + if len(batch['bd'].id[0]) > 0: - # Transcript-cell similarity scores, filtered by neighbors - edge_index = get_edge_index( + # Step 2.1: Calculate edge index (delayed operation) + edge_index = delayed(get_edge_index)( batch['bd'].pos[:, :2].cpu(), batch['tx'].pos[:, :2].cpu(), k=receptive_field['k_bd'], dist=receptive_field['dist_bd'], - method='kd_tree', + method=knn_method, ).T - batch['tx']['bd_field'] = coo_to_dense_adj( + + # Step 2.2: Compute dense adjacency matrix lazily + batch['tx']['bd_field'] = delayed(coo_to_dense_adj)( edge_index, num_nodes=batch['tx'].id.shape[0], num_nbrs=receptive_field['k_bd'], ) - scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd") - # 1. Get direct assignments from similarity matrix - belongs = scores.max(1) - assignments['score'] = belongs.values.cpu() - mask = assignments['score'] > score_cut - all_ids = np.concatenate(batch['bd'].id)[belongs.indices.cpu()] - assignments.loc[mask, 'segger_cell_id'] = all_ids[mask] + # Step 2.3: Calculate similarity scores lazily + scores = delayed(get_similarity_scores)(lit_segger.model, batch, "tx", "bd") + belongs = delayed(lambda x: x.max(1))(scores) + + # Step 2.4: Add scores to assignments (Dask DataFrame) + score_values = belongs.values.cpu().numpy() + assignments = assignments.assign(score=from_array(score_values)) + + # Step 2.5: Apply score cut-off and assign cell IDs lazily + all_ids = delayed(np.concatenate)(batch['bd'].id)[belongs.indices.cpu().numpy()] + mask = delayed(assignments['score'] > score_cut) + assignments['segger_cell_id'] = dd.from_array(delayed(np.where)(mask, all_ids, np.nan)) + + + # Step 3: If connected components (CC) are enabled, further process the assignments if use_cc: - # Transcript-transcript similarity scores, filtered by neighbors - edge_index = batch['tx', 'neighbors', 'tx'].edge_index - batch['tx']['tx_field'] = coo_to_dense_adj( - edge_index, + # Step 3.1: Calculate transcript-to-transcript field lazily + edge_index_tx = batch['tx', 'neighbors', 'tx'].edge_index + batch['tx']['tx_field'] = delayed(coo_to_dense_adj)( + edge_index_tx, num_nodes=batch['tx'].id.shape[0], ) - scores = get_similarity_scores(lit_segger.model, batch, "tx", "tx") - scores = scores.fill_diagonal_(0) # ignore self-similarity - # 2. 
Assign remainder using connected components - no_id = assignments['segger_cell_id'].isna().values - no_id_scores = scores[no_id][:, no_id] - print('here') - n, comps = cc(no_id_scores, connection="weak", directed=False) - new_ids = np.array([_get_id() for _ in range(n)]) - assignments.loc[no_id, 'segger_cell_id'] = new_ids[comps] + # Step 3.2: Compute similarity scores for transcript-to-transcript lazily + scores_tx = delayed(get_similarity_scores)(lit_segger.model, batch, "tx", "tx") + scores_tx = delayed(lambda x: x.fill_diagonal_(0))(scores_tx) + + # Step 3.3: Handle connected components lazily + no_id = assignments['segger_cell_id'].isna() + no_id_scores = delayed(lambda s, mask: s[mask].T[mask])(scores_tx, no_id) + n, comps = delayed(pqdm)([lambda: cc(no_id_scores, connection="weak", directed=False)], n_jobs=-1) + + # Assign new IDs based on connected components + new_ids = delayed(np.array)([_get_id() for _ in range(n)]) + assignments['segger_cell_id'] = dd.from_array(np.where(no_id, new_ids[comps], assignments['segger_cell_id'].values)) + return assignments @@ -233,12 +245,11 @@ def predict( data_loader: DataLoader, score_cut: float, receptive_field: dict, - use_cc: bool = True, -) -> pd.DataFrame: + use_cc: bool = True +) -> dd.DataFrame: """ - Predict cell assignments for multiple batches of transcript data using - a segmentation model. - + Optimized prediction for multiple batches of transcript data using Dask and delayed processing with progress bar. + Parameters ---------- lit_segger : LitSegger @@ -246,37 +257,34 @@ def predict( data_loader : DataLoader A data loader providing batches of transcript and cell data. score_cut : float - The threshold for assigning transcripts to cells based on similarity - scores. + The threshold for assigning transcripts to cells based on similarity scores. + receptive_field : dict + Dictionary defining the receptive field for transcript-cell and transcript-transcript relations. + use_cc : bool, optional + If True, perform connected components analysis for unassigned transcripts. Returns ------- - pd.DataFrame - A DataFrame containing the transcript IDs, similarity scores, and - assigned cell IDs, consolidated across all batches. + dd.DataFrame + A Dask DataFrame containing the transcript IDs, similarity scores, and assigned cell IDs. 
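The body of `predict` below de-duplicates assignments by keeping, for each transcript, the row with the highest score. A plain-pandas sketch of that group-by/idxmax idea on a toy frame:

```python
# Keep the best-scoring assignment per transcript (toy data).
import pandas as pd

df = pd.DataFrame({
    'transcript_id': [1, 1, 2, 2],
    'score': [0.2, 0.9, 0.7, 0.4],
    'segger_cell_id': ['a', 'b', 'c', 'd'],
})
best = df.loc[df.groupby('transcript_id')['score'].idxmax()]
print(best)  # keeps ('b', 0.9) for transcript 1 and ('c', 0.7) for transcript 2
```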
""" - # If data loader is empty, do nothing if len(data_loader) == 0: return None - - assignments = [] - # Assign transcripts from each batch to nuclei - # TODO: parallelize this step - for batch in tqdm(data_loader): - batch_assignments = predict_batch( - lit_segger, batch, score_cut, receptive_field, use_cc - ) - assignments.append(batch_assignments) + # Convert the entire data loader to delayed predictions + delayed_assignments = pqdm([delayed(predict_batch)(lit_segger, batch, score_cut, receptive_field, use_cc) + for batch in data_loader], n_jobs=-1) + assignments_dd = dd.from_delayed(delayed_assignments) - # Join across batches and handle duplicates between batches - assignments = pd.concat(assignments).reset_index(drop=True) + # Group by transcript_id and lazily select the row with the highest score + idx = assignments_dd.groupby('transcript_id')['score'].idxmax() + final_assignments = assignments_dd.map_partitions(lambda df: df.loc[idx].reset_index(drop=True)) - # Handle duplicate assignments of transcripts - idx = assignments.groupby('transcript_id')['score'].idxmax() - assignments = assignments.loc[idx].reset_index(drop=True) + # Use Dask's progress bar for task execution tracking + with ProgressBar(): + final_result = final_assignments.compute() - return assignments + return final_result def segment( @@ -285,8 +293,12 @@ def segment( save_dir: Union[str, Path], seg_tag: str, transcript_file: Union[str, Path], + score_cut: float = .25, + use_cc: bool = True, file_format: str = 'anndata', - receptive_field: dict = {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}, + receptive_field: dict = {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}, + knn_method: str = 'cuda', + verbose: bool = False, **anndata_kwargs ) -> None: """ @@ -306,54 +318,73 @@ def segment( Path to the transcripts parquet file. file_format : str, optional File format to save the results ('csv', 'parquet', or 'anndata'). Defaults to 'anndata'. + score_cut : float, optional + The threshold for assigning transcripts to cells based on similarity scores. + use_cc : bool, optional + If to further re-group transcripts that have not been assigned to any nucleus. + knn_method : str, optional + The method to use for nearest neighbors ('cuda' by default). **anndata_kwargs : dict, optional - Additional keyword arguments passed to the `create_anndata` function, such as: - - panel_df: pd.DataFrame - - min_transcripts: int - - cell_id_col: str - - qv_threshold: float - - min_cell_area: float - - max_cell_area: float - + Additional keyword arguments passed to the create_anndata function. 
+ Returns: ------- None """ + start_time = time.time() + # Ensure the save directory exists save_dir = Path(save_dir) save_dir.mkdir(parents=True, exist_ok=True) + + if verbose: + print(f"Starting segmentation for {seg_tag}...") + + # Step 1: Prediction + step_start_time = time.time() + + delayed_train = delayed(predict)(model, dm.train_dataloader(), score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) + delayed_val = delayed(predict)(model, dm.val_dataloader(), score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) + delayed_test = delayed(predict)(model, dm.test_dataloader(), score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) - # Define delayed prediction steps for parallel execution - delayed_train = delayed(predict)(model, dm.train_dataloader(), score_cut=0.5, receptive_field=receptive_field, use_cc=True) - delayed_val = delayed(predict)(model, dm.val_dataloader(), score_cut=0.5, receptive_field=receptive_field, use_cc=True) - delayed_test = delayed(predict)(model, dm.test_dataloader(), score_cut=0.5, receptive_field=receptive_field, use_cc=True) - - # Trigger parallel execution and get results segmentation_train, segmentation_val, segmentation_test = dask.compute(delayed_train, delayed_val, delayed_test) - # Combine the segmentation results - seg_combined = pd.concat([segmentation_train, segmentation_val, segmentation_test]).reset_index() + if verbose: + elapsed_time = format_time(time.time() - step_start_time) + print(f"Predictions completed in {elapsed_time}.") + + # Step 2: Combine and group by transcript_id (Use Dask to handle large dataframes) + step_start_time = time.time() - # Group by transcript_id and keep the row with the highest score - seg_final = seg_combined.loc[seg_combined.groupby('transcript_id')['score'].idxmax()] + seg_combined = dd.concat([segmentation_train, segmentation_val, segmentation_test]) + seg_final = seg_combined.loc[seg_combined.groupby('transcript_id')['score'].idxmax()].compute() + seg_final = seg_final.dropna(subset=['segger_cell_id']).reset_index(drop=True) - # Drop rows where segger_cell_id is NaN - seg_final = seg_final.dropna(subset=['segger_cell_id']) + if verbose: + elapsed_time = format_time(time.time() - step_start_time) + print(f"Segmentation results processed in {elapsed_time}.") - # Reset the index - seg_final.reset_index(drop=True, inplace=True) + # Step 3: Load transcripts and merge (using Dask) + step_start_time = time.time() - # Load the transcript data transcripts_df = dd.read_parquet(transcript_file) - # Convert seg_final to a Dask DataFrame and merge with transcripts + if verbose: + print("Merging segmentation results with transcripts...") + seg_final_dd = dd.from_pandas(seg_final, npartitions=transcripts_df.npartitions) - transcripts_df_filtered = transcripts_df.merge(seg_final_dd, on='transcript_id', how='inner') + transcripts_df_filtered = transcripts_df.merge(seg_final_dd, on='transcript_id', how='inner').compute() + + if verbose: + elapsed_time = format_time(time.time() - step_start_time) + print(f"Transcripts merged in {elapsed_time}.") - # Compute the final result - transcripts_df_filtered = transcripts_df_filtered.compute() + # Step 4: Save the merged result + step_start_time = time.time() + + if verbose: + print(f"Saving results in {file_format} format...") - # Save the merged result based on the file format if file_format == 'csv': save_path = save_dir / f'{seg_tag}_segmentation.csv' 
transcripts_df_filtered.to_csv(save_path, index=False) @@ -361,11 +392,17 @@ def segment( save_path = save_dir / f'{seg_tag}_segmentation.parquet' transcripts_df_filtered.to_parquet(save_path, index=False) elif file_format == 'anndata': - # Create an AnnData object and save as h5ad, passing additional arguments from kwargs save_path = save_dir / f'{seg_tag}_segmentation.h5ad' segger_adata = create_anndata(transcripts_df_filtered, **anndata_kwargs) segger_adata.write(save_path) else: raise ValueError(f"Unsupported file format: {file_format}") - print(f"Segmentation results saved at {save_path}") \ No newline at end of file + if verbose: + elapsed_time = format_time(time.time() - step_start_time) + print(f"Results saved in {elapsed_time} at {save_path}.") + + # Total time + if verbose: + total_time = format_time(time.time() - start_time) + print(f"Total segmentation process completed in {total_time}.") From 9f7ab2a921041d2f98d2f2adde8e220108cfd99a Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Thu, 26 Sep 2024 10:27:50 +0200 Subject: [PATCH 056/156] edited the model to be compatible, embeddings only --- src/segger/models/segger_model.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/segger/models/segger_model.py b/src/segger/models/segger_model.py index 1b1a9eb..d2e13ad 100644 --- a/src/segger/models/segger_model.py +++ b/src/segger/models/segger_model.py @@ -56,7 +56,7 @@ def forward(self, x: Tensor, edge_index: Tensor) -> Tensor: """ x = torch.nan_to_num(x, nan = 0) is_one_dim = (x.ndim == 1) * 1 - x = x[:, None] + # x = x[:, None] x = self.tx_embedding(((x.sum(1) * is_one_dim).int())) * is_one_dim + self.lin0(x.float()) * (1 - is_one_dim) # First layer x = x.relu() @@ -65,15 +65,12 @@ def forward(self, x: Tensor, edge_index: Tensor) -> Tensor: # Middle layers if self.num_mid_layers > 0: - # for conv_mid, lin_mid in zip(self.conv_mid_layers, self.lin_mid_layers): for conv_mid in self.conv_mid_layers: x = conv_mid(x, edge_index) # + lin_mid(x) x = x.relu() # Last layer x = self.conv_last(x, edge_index) # + self.lin_last(x) - # x = x.relu() - # x = x / x.norm(dim=-1, keepdim=True) # Normalize to L2 norm of 1 return x From a619d3f6d94c2b710a63d6b30779007ca2e29366 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Thu, 26 Sep 2024 19:53:05 +0200 Subject: [PATCH 057/156] dasked prediciton a bit --- pyproject.toml | 3 +- scripts/predict_model_sample.py | 16 +-- src/segger/data/utils.py | 3 +- src/segger/models/segger_model.py | 2 +- src/segger/prediction/predict.py | 124 +++++++++++----------- src/segger/training/segger_data_module.py | 2 +- 6 files changed, 75 insertions(+), 75 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1ef0da0..3c538b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,8 @@ dependencies = [ "path>=17.0.0", "pyarrow>=16.1.0,<16.2.0", "dask_geopandas>=0.4.0", - "torch-geometric>=2.2.0" + "torch-geometric>=2.2.0", + "pqdm>=0.2.0" ] [project.optional-dependencies] diff --git a/scripts/predict_model_sample.py b/scripts/predict_model_sample.py index d51d8e9..d5b0a6d 100644 --- a/scripts/predict_model_sample.py +++ b/scripts/predict_model_sample.py @@ -1,15 +1,8 @@ -from segger.data.io import XeniumSample -from segger.training.train import LitSegger from segger.training.segger_data_module import SeggerDataModule from segger.prediction.predict import segment, load_model -from lightning.pytorch.loggers import CSVLogger -from pytorch_lightning import Trainer from pathlib import Path -from lightning.pytorch.plugins.environments import 
LightningEnvironment from matplotlib import pyplot as plt import seaborn as sns -# import pandas as pd -from segger.data.utils import calculate_gene_celltype_abundance_embedding import scanpy as sc import os import dask.dataframe as dd @@ -25,7 +18,7 @@ dm = SeggerDataModule( data_dir=segger_data_dir, batch_size=1, - num_workers=2, + num_workers=1, ) dm.setup() @@ -38,7 +31,7 @@ model = load_model(model_path / 'checkpoints') dm.setup() -receptive_field = {'k_bd': 4, 'dist_bd': 10,'k_tx': 5, 'dist_tx': 3} +receptive_field = {'k_bd': 4, 'dist_bd': 15,'k_tx': 5, 'dist_tx': 3} segment( model, @@ -49,6 +42,7 @@ file_format='anndata', receptive_field = receptive_field, min_transcripts=10, - max_transcripts=1000 - cell_id_col='segger_cell_id' + max_transcripts=1000, + cell_id_col='segger_cell_id', + knn_method='kd_tree' ) \ No newline at end of file diff --git a/src/segger/data/utils.py b/src/segger/data/utils.py index 8b18886..a847f5c 100644 --- a/src/segger/data/utils.py +++ b/src/segger/data/utils.py @@ -282,7 +282,8 @@ def get_edge_index(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: elif method == 'faiss': return get_edge_index_faiss(coords_1, coords_2, k=k, dist=dist, gpu=gpu) elif method == 'cuda': - return get_edge_index_cuda(coords_1, coords_2, k=k, dist=dist) + pass + # return get_edge_index_cuda(coords_1, coords_2, k=k, dist=dist) else: raise ValueError(f"Unknown method {method}") diff --git a/src/segger/models/segger_model.py b/src/segger/models/segger_model.py index d2e13ad..5ad9af8 100644 --- a/src/segger/models/segger_model.py +++ b/src/segger/models/segger_model.py @@ -56,7 +56,7 @@ def forward(self, x: Tensor, edge_index: Tensor) -> Tensor: """ x = torch.nan_to_num(x, nan = 0) is_one_dim = (x.ndim == 1) * 1 - # x = x[:, None] + x = x[:, None] x = self.tx_embedding(((x.sum(1) * is_one_dim).int())) * is_one_dim + self.lin0(x.float()) * (1 - is_one_dim) # First layer x = x.relu() diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index fd5d81a..90d85d2 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -147,8 +147,8 @@ def predict_batch( score_cut: float, receptive_field: dict, use_cc: bool = True, - knn_method: str = 'cuda' -) -> dd.DataFrame: + knn_method: str = 'kd_tree' +) -> pd.DataFrame: """ Predict cell assignments for a batch of transcript data using a segmentation model. @@ -164,7 +164,9 @@ def predict_batch( Dictionary defining the receptive field for transcript-cell and transcript-transcript relations. use_cc : bool, optional If True, perform connected components analysis for unassigned transcripts. - + knn_method : str, optional + The method to use for nearest neighbors ('cuda' by default). 
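    For illustration, a typical `receptive_field` value (matching the defaults
    used elsewhere in these patches) is:

        {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}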
+ Returns ------- pd.DataFrame @@ -179,13 +181,12 @@ def _get_id(): batch = batch.to("cuda") # Assignments of cells to nuclei - transcript_id = delayed(batch['tx'].id.cpu().numpy)() - assignments = dd.from_array(transcript_id, columns=['transcript_id']) - + transcript_id = batch['tx'].id.cpu().numpy() + assignments = pd.DataFrame({'transcript_id': transcript_id}) if len(batch['bd'].id[0]) > 0: - # Step 2.1: Calculate edge index (delayed operation) - edge_index = delayed(get_edge_index)( + # Step 2.1: Calculate edge index lazily + edge_index = get_edge_index( batch['bd'].pos[:, :2].cpu(), batch['tx'].pos[:, :2].cpu(), k=receptive_field['k_bd'], @@ -193,51 +194,38 @@ def _get_id(): method=knn_method, ).T - # Step 2.2: Compute dense adjacency matrix lazily - batch['tx']['bd_field'] = delayed(coo_to_dense_adj)( + # Step 2.2: Compute dense adjacency matrix + batch['tx']['bd_field'] = coo_to_dense_adj( edge_index, num_nodes=batch['tx'].id.shape[0], num_nbrs=receptive_field['k_bd'], ) + scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd") + # 1. Get direct assignments from similarity matrix + belongs = scores.max(1) + assignments['score'] = belongs.values.cpu() + mask = assignments['score'] > score_cut + all_ids = np.concatenate(batch['bd'].id)[belongs.indices.cpu()] + assignments.loc[mask, 'segger_cell_id'] = all_ids[mask] - # Step 2.3: Calculate similarity scores lazily - scores = delayed(get_similarity_scores)(lit_segger.model, batch, "tx", "bd") - belongs = delayed(lambda x: x.max(1))(scores) - - # Step 2.4: Add scores to assignments (Dask DataFrame) - score_values = belongs.values.cpu().numpy() - assignments = assignments.assign(score=from_array(score_values)) - - # Step 2.5: Apply score cut-off and assign cell IDs lazily - all_ids = delayed(np.concatenate)(batch['bd'].id)[belongs.indices.cpu().numpy()] - mask = delayed(assignments['score'] > score_cut) - assignments['segger_cell_id'] = dd.from_array(delayed(np.where)(mask, all_ids, np.nan)) - - - # Step 3: If connected components (CC) are enabled, further process the assignments if use_cc: - # Step 3.1: Calculate transcript-to-transcript field lazily - edge_index_tx = batch['tx', 'neighbors', 'tx'].edge_index - batch['tx']['tx_field'] = delayed(coo_to_dense_adj)( - edge_index_tx, + # Transcript-transcript similarity scores, filtered by neighbors + edge_index = batch['tx', 'neighbors', 'tx'].edge_index + batch['tx']['tx_field'] = coo_to_dense_adj( + edge_index, num_nodes=batch['tx'].id.shape[0], ) - - # Step 3.2: Compute similarity scores for transcript-to-transcript lazily - scores_tx = delayed(get_similarity_scores)(lit_segger.model, batch, "tx", "tx") - scores_tx = delayed(lambda x: x.fill_diagonal_(0))(scores_tx) - - # Step 3.3: Handle connected components lazily - no_id = assignments['segger_cell_id'].isna() - no_id_scores = delayed(lambda s, mask: s[mask].T[mask])(scores_tx, no_id) - n, comps = delayed(pqdm)([lambda: cc(no_id_scores, connection="weak", directed=False)], n_jobs=-1) - - - # Assign new IDs based on connected components - new_ids = delayed(np.array)([_get_id() for _ in range(n)]) - assignments['segger_cell_id'] = dd.from_array(np.where(no_id, new_ids[comps], assignments['segger_cell_id'].values)) + scores = get_similarity_scores(lit_segger.model, batch, "tx", "tx") + scores = scores.fill_diagonal_(0) # ignore self-similarity + + # 2. 
Assign remainder using connected components + no_id = assignments['segger_cell_id'].isna().values + no_id_scores = scores[no_id][:, no_id] + n, comps = cc(no_id_scores, connection="weak", directed=False) + new_ids = np.array([_get_id() for _ in range(n)]) + assignments.loc[no_id, 'segger_cell_id'] = new_ids[comps] - return assignments + return assignments # Ensure this is a pandas DataFrame def predict( @@ -245,7 +233,8 @@ def predict( data_loader: DataLoader, score_cut: float, receptive_field: dict, - use_cc: bool = True + use_cc: bool = True, + knn_method: str = 'cuda' ) -> dd.DataFrame: """ Optimized prediction for multiple batches of transcript data using Dask and delayed processing with progress bar. @@ -262,7 +251,9 @@ def predict( Dictionary defining the receptive field for transcript-cell and transcript-transcript relations. use_cc : bool, optional If True, perform connected components analysis for unassigned transcripts. - + knn_method : str, optional + The method to use for nearest neighbors ('cuda' by default). + Returns ------- dd.DataFrame @@ -271,20 +262,29 @@ def predict( if len(data_loader) == 0: return None + # Create meta for the DataFrame + meta = pd.DataFrame({ + 'transcript_id': pd.Series(dtype='int64'), + 'score': pd.Series(dtype='float64'), + 'segger_cell_id': pd.Series(dtype='object') + }) + # Convert the entire data loader to delayed predictions - delayed_assignments = pqdm([delayed(predict_batch)(lit_segger, batch, score_cut, receptive_field, use_cc) - for batch in data_loader], n_jobs=-1) - assignments_dd = dd.from_delayed(delayed_assignments) + delayed_assignments = [delayed(predict_batch)(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) + for batch in data_loader] + + # Pass the meta to from_delayed + assignments_dd = dd.from_delayed(delayed_assignments, meta=meta) + + # Modify the logic to compute idxmax within each partition using map_partitions + def select_max_score_partition(df): + idx = df.groupby('transcript_id')['score'].idxmax() # Compute idxmax within each partition + return df.loc[idx].reset_index(drop=True) - # Group by transcript_id and lazily select the row with the highest score - idx = assignments_dd.groupby('transcript_id')['score'].idxmax() - final_assignments = assignments_dd.map_partitions(lambda df: df.loc[idx].reset_index(drop=True)) + final_assignments = assignments_dd.map_partitions(select_max_score_partition, meta=meta) - # Use Dask's progress bar for task execution tracking - with ProgressBar(): - final_result = final_assignments.compute() + return final_assignments - return final_result def segment( @@ -342,10 +342,14 @@ def segment( # Step 1: Prediction step_start_time = time.time() - - delayed_train = delayed(predict)(model, dm.train_dataloader(), score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) - delayed_val = delayed(predict)(model, dm.val_dataloader(), score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) - delayed_test = delayed(predict)(model, dm.test_dataloader(), score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) + + train_dataloader = dm.train_dataloader() + test_dataloader = dm.test_dataloader() + val_dataloader = dm.val_dataloader() + + delayed_train = predict(model, train_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) + delayed_val = predict(model, val_dataloader , score_cut=score_cut, receptive_field=receptive_field, 
use_cc=use_cc, knn_method=knn_method) + delayed_test = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) segmentation_train, segmentation_val, segmentation_test = dask.compute(delayed_train, delayed_val, delayed_test) diff --git a/src/segger/training/segger_data_module.py b/src/segger/training/segger_data_module.py index ec1b5a9..678194e 100644 --- a/src/segger/training/segger_data_module.py +++ b/src/segger/training/segger_data_module.py @@ -33,7 +33,7 @@ def setup(self, stage=None): self.loader_kwargs = dict( batch_size=self.batch_size, num_workers=self.num_workers, - pin_memory=True, + pin_memory=False, ) # TODO: Add documentation From 059967620f6b5206a177490a93c649ddf05ed911 Mon Sep 17 00:00:00 2001 From: andrewmoorman Date: Fri, 27 Sep 2024 13:58:36 -0400 Subject: [PATCH 058/156] Refactored parquet-based dataset creation, added to CLI --- src/segger/data/parquet/_experimental.py | 65 + src/segger/data/parquet/_ndtree.py | 116 ++ src/segger/data/parquet/_settings/xenium.yaml | 45 + src/segger/data/parquet/_utils.py | 305 ++++ src/segger/data/parquet/pyg_dataset.py | 67 + src/segger/data/parquet/sample.py | 1227 +++++++++++++++++ .../data/parquet/transcript_embedding.py | 71 + 7 files changed, 1896 insertions(+) create mode 100644 src/segger/data/parquet/_experimental.py create mode 100644 src/segger/data/parquet/_ndtree.py create mode 100644 src/segger/data/parquet/_settings/xenium.yaml create mode 100644 src/segger/data/parquet/_utils.py create mode 100644 src/segger/data/parquet/pyg_dataset.py create mode 100644 src/segger/data/parquet/sample.py create mode 100644 src/segger/data/parquet/transcript_embedding.py diff --git a/src/segger/data/parquet/_experimental.py b/src/segger/data/parquet/_experimental.py new file mode 100644 index 0000000..f8af0f1 --- /dev/null +++ b/src/segger/data/parquet/_experimental.py @@ -0,0 +1,65 @@ + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: # False at runtime + import dask, cudf, dask_cudf, pandas as pd + +class BackendHandler: + """ + A class to handle different DataFrame backends for reading and processing + Parquet files. + + Attributes + ---------- + _valid_backends : set + A set of valid backend options ('pandas', 'dask', 'cudf', 'dask_cudf'). + backend : str + The selected backend for reading Parquet files. + + Methods + ------- + read_parquet(): + Returns the function to read Parquet files according to the selected + backend. + """ + + _valid_backends = { + 'pandas', + 'dask', + 'cudf', + 'dask_cudf', + } + + def __init__(self, backend): + # Make sure requested backend is supported + if backend in self._valid_backends: + self.backend = backend + else: + valid = ', '.join(map(lambda o: f"'{o}'", self._valid_backends)) + msg = f"Unsupported backend: {backend}. Valid options are {valid}." 
+ raise ValueError(msg) + + # Dynamically import packages only if requested + if self.backend == 'pandas': + import pandas as pd + elif self.backend == 'dask': + import dask + elif self.backend == 'cudf': + import cudf + elif self.backend == 'dask_cudf': + import dask_cudf + else: + raise ValueError('Internal Error') + + @property + def read_parquet(self): + if self.backend == 'pandas': + return pd.read_parquet + elif self.backend == 'dask': + return dask.dataframe.read_parquet + elif self.backend == 'cudf': + return cudf.read_parquet + elif self.backend == 'dask_cudf': + return dask_cudf.read_parquet + else: + raise ValueError('Internal Error') \ No newline at end of file diff --git a/src/segger/data/parquet/_ndtree.py b/src/segger/data/parquet/_ndtree.py new file mode 100644 index 0000000..cc68ef0 --- /dev/null +++ b/src/segger/data/parquet/_ndtree.py @@ -0,0 +1,116 @@ +from scipy.spatial import Rectangle +import shapely +import numpy as np +import math + +class NDTree(): + """ + NDTree is a data structure for recursively splitting multi-dimensional data + into smaller regions until each leaf node contains less than or equal to a + specified number of points. It stores these regions in a balanced binary + tree. + + Attributes + ---------- + data : np.ndarray + The input data to be partitioned. + n : int + The maximum number of points allowed in a leaf node. + idx : np.ndarray + The indices of the input data points. + boxes : list + A list to store the bounding boxes (as shapely polygons) of each region + in the tree. + rect : Rectangle + The bounding box of the entire input data space. + tree : innernode + The root of the NDTree. + """ + + def __init__(self, data, n): + """ + Initializes the NDTree with the given data and maximum points per leaf + node. + + Parameters + ---------- + data : np.ndarray + The input data to be partitioned. + n : int + The maximum number of points allowed in a leaf node. + """ + self.data = np.asarray(data) + self.n = n + self.idx = np.arange(data.shape[0]) + self.boxes = [] + self.rect = Rectangle(data.min(0), data.max(0)) + self.tree = innernode(self.n, self.idx, self.rect, self) + +class innernode(): + """ + Represents a node in the NDTree. Each node either stores a bounding box for + the data it contains (leaf nodes) or splits the data into two child nodes. + + Attributes + ---------- + n : int + The maximum number of points allowed in a leaf node for this subtree. + idx : np.ndarray + The indices of the data points in this node. + tree : NDTree + The reference to the main NDTree that holds the data and bounding boxes. + rect : Rectangle + The bounding box of the data points in this node. + split_dim : int + The dimension along which the node splits the data. + split_point : float + The value along the split dimension used to divide the data. + less : innernode + The child node containing data points less than or equal to the split + point. + greater : innernode + The child node containing data points greater than the split point. + """ + + def __init__(self, n, idx, rect, tree): + """ + Initializes the innernode and splits the data if necessary. + """ + self.n = n + self.idx = idx + self.tree = tree + self.rect = rect + if not n == 1: + self.split() + else: + box = shapely.box(*self.rect.mins, *self.rect.maxes) + self.tree.boxes.append(box) + + def split(self): + """ + Recursively splits the node's data into two child nodes along the + dimension with the largest spread. 
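        For example, with n = 5, the two children receive budgets of 2 and 3
        respectively, and the split point is taken at the 2/5 quantile of the
        coordinates along the dimension with the largest spread, so the data
        is divided roughly in proportion to the child budgets.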
+ """ + less = math.floor(self.n // 2) + greater = self.n - less + data = self.tree.data[self.idx] + self.split_dim = np.argmax(self.rect.maxes - self.rect.mins) + data = data[:, self.split_dim] + self.split_point = np.quantile(data, less / (less + greater)) + mask = data <= self.split_point + less_rect, greater_rect = self.rect.split( + self.split_dim, + self.split_point + ) + self.less = innernode( + less, + self.idx[mask], + less_rect, + self.tree + ) + self.greater = innernode( + greater, + self.idx[~mask], + greater_rect, + self.tree + ) \ No newline at end of file diff --git a/src/segger/data/parquet/_settings/xenium.yaml b/src/segger/data/parquet/_settings/xenium.yaml new file mode 100644 index 0000000..7304aa7 --- /dev/null +++ b/src/segger/data/parquet/_settings/xenium.yaml @@ -0,0 +1,45 @@ +transcripts: + filename: "transcripts.parquet" + x: "x_location" + y: "y_location" + z: "z_location" + id: "transcript_id" + label: "feature_name" + nuclear: "overlaps_nucleus" + filter_substrings: + - "NegControlProbe_" + - "antisense_" + - "NegControlCodeword" + - "BLANK_" + - "DeprecatedCodeword_" + - "UnassignedCodeword_" + xy: + - "x_location" + - "y_location" + xyz: + - "x_location" + - "y_location" + - "z_location" + columns: + - "x_location" + - "y_location" + - "z_location" + - "feature_name" + - "overlaps_nucleus" + - "transcript_id" + - "cell_id" + - "qv" + +boundaries: + filename: "nucleus_boundaries.parquet" + x: "vertex_x" + y: "vertex_y" + id: "cell_id" + label: "cell_id" + xy: + - "vertex_x" + - "vertex_y" + columns: + - "vertex_x" + - "vertex_y" + - "cell_id" diff --git a/src/segger/data/parquet/_utils.py b/src/segger/data/parquet/_utils.py new file mode 100644 index 0000000..6f29cec --- /dev/null +++ b/src/segger/data/parquet/_utils.py @@ -0,0 +1,305 @@ +import pandas as pd +import geopandas as gpd +import shapely +from pyarrow import parquet as pq +import numpy as np +import scipy as sp +from typing import Optional, List +import sys +from types import SimpleNamespace +from pathlib import Path +import yaml + +def get_xy_extents( + filepath, + x: str, + y: str, +) -> shapely.Polygon: + """ + Get the bounding box of the x and y coordinates from a Parquet file. + + Parameters + ---------- + filepath : str + The path to the Parquet file. + x : str + The name of the column representing the x-coordinate. + y : str + The name of the column representing the y-coordinate. + + Returns + ------- + shapely.Polygon + A polygon representing the bounding box of the x and y coordinates. + """ + # Get index of columns of parquet file + metadata = pq.read_metadata(filepath) + schema_idx = dict(map(reversed, enumerate(metadata.schema.names))) + + # Find min and max values across all row groups + x_max = -1 + x_min = sys.maxsize + y_max = -1 + y_min = sys.maxsize + for i in range(metadata.num_row_groups): + group = metadata.row_group(i) + x_min = min(x_min, group.column(schema_idx[x]).statistics.min) + x_max = max(x_max, group.column(schema_idx[x]).statistics.max) + y_min = min(y_min, group.column(schema_idx[y]).statistics.min) + y_max = max(y_max, group.column(schema_idx[y]).statistics.max) + bounds = shapely.box(x_min, y_min, x_max, y_max) + return bounds + +def read_parquet_region( + filepath, + x: str, + y: str, + bounds: shapely.Polygon = None, + extra_columns: list[str] = [], + extra_filters: list[str] = [], + row_group_chunksize: Optional[int] = None, +): + """ + Read a region from a Parquet file based on x and y coordinates and optional + filters. 
+ + Parameters + ---------- + filepath : str + The path to the Parquet file. + x : str + The name of the column representing the x-coordinate. + y : str + The name of the column representing the y-coordinate. + bounds : shapely.Polygon, optional + A polygon representing the bounding box to filter the data. If None, + no bounding box filter is applied. + extra_columns : list of str, optional + A list of additional columns to include in the output DataFrame. + extra_filters : list of str, optional + A list of additional filters to apply to the data. + + Returns + ------- + DataFrame + A DataFrame containing the filtered data from the Parquet file. + """ + # Check backend and load dependencies if not already loaded + + # Find bounds of full file if not supplied + if bounds is None: + bounds = get_xy_bounds(filepath, x, y) + + # Load pre-filtered data from Parquet file + filters = [[ + (x, '>', bounds.bounds[0]), + (y, '>', bounds.bounds[1]), + (x, '<', bounds.bounds[2]), + (y, '<', bounds.bounds[3]), + ] + extra_filters] + + columns = list({x, y} | set(extra_columns)) + + region = pd.read_parquet( + filepath, + filters=filters, + columns=columns, + ) + return region + +def get_polygons_from_xy( + boundaries: pd.DataFrame, + x: str, + y: str, + label: str, +) -> gpd.GeoSeries: + """ + Convert boundary coordinates from a cuDF DataFrame to a GeoSeries of + polygons. + + Parameters + ---------- + boundaries : pd.DataFrame + A DataFrame containing the boundary data with x and y coordinates + and identifiers. + x : str + The name of the column representing the x-coordinate. + y : str + The name of the column representing the y-coordinate. + label : str + The name of the column representing the cell or nucleus label. + + + Returns + ------- + gpd.GeoSeries + A GeoSeries containing the polygons created from the boundary + coordinates. + """ + # Polygon offsets in coords + ids = boundaries[label].values + splits = np.where(ids[:-1] != ids[1:])[0] + 1 + geometry_offset = np.hstack([0, splits, len(ids)]) + part_offset = np.arange(len(np.unique(ids)) + 1) + + # Convert to GeoSeries of polygons + polygons = shapely.from_ragged_array( + shapely.GeometryType.POLYGON, + coords=boundaries[[x, y]], + offsets=(geometry_offset, part_offset), + ) + gs = gpd.GeoSeries(polygons, index=np.unique(ids)) + + return gs + +def filter_boundaries( + boundaries: pd.DataFrame, + inset: shapely.Polygon, + outset: shapely.Polygon, + x: str, + y: str, + label: str, +): + """ + Filter boundary polygons based on their overlap with specified inset and + outset regions. + + Parameters + ---------- + boundaries : cudf.DataFrame + A DataFrame containing the boundary data with x and y coordinates and + identifiers. + inset : shapely.Polygon + A polygon representing the inner region to filter the boundaries. + outset : shapely.Polygon + A polygon representing the outer region to filter the boundaries. + x : str + The name of the column representing the x-coordinate. + y : str + The name of the column representing the y-coordinate. + label : str + The name of the column representing the cell or nucleus label. + + Returns + ------- + cudf.DataFrame + A DataFrame containing the filtered boundary polygons. + + Notes + ----- + The function determines overlaps of boundary polygons with the specified + inset and outset regions. It creates boolean masks for overlaps with the + top, left, right, and bottom sides of the outset region, as well as the + center region defined by the inset polygon. 
The filtering logic includes + polygons that: + - Are completely within the center region. + - Overlap with the center and the left side, but not the bottom side. + - Overlap with the center and the top side, but not the right side. + """ + # Determine overlaps of boundary polygons + def in_region(region): + in_x = boundaries[x].between(region.bounds[0], region.bounds[2]) + in_y = boundaries[y].between(region.bounds[1], region.bounds[3]) + return in_x & in_y + x1, y1, x4, y4 = outset.bounds + x2, y2, x3, y3 = inset.bounds + boundaries['top'] = in_region(shapely.box(x1, y1, x4, y2)) + boundaries['left'] = in_region(shapely.box(x1, y1, x2, y4)) + boundaries['right'] = in_region(shapely.box(x3, y1, x4, y4)) + boundaries['bottom'] = in_region(shapely.box(x1, y3, x4, y4)) + boundaries['center'] = in_region(inset) + + # Filter boundary polygons + # Include overlaps with top and left, not bottom and right + gb = boundaries.groupby(label, sort=False) + total = gb['center'].transform('size') + in_top = gb['top'].transform('sum') + in_left = gb['left'].transform('sum') + in_right = gb['right'].transform('sum') + in_bottom = gb['bottom'].transform('sum') + in_center = gb['center'].transform('sum') + keep = in_center == total + keep |= ((in_center > 0) & (in_left > 0) & (in_bottom == 0)) + keep |= ((in_center > 0) & (in_top > 0) & (in_right == 0)) + inset_boundaries = boundaries.loc[keep] + return inset_boundaries + +def filter_transcripts( + transcripts_df: pd.DataFrame, + label: Optional[str] = None, + filter_substrings: Optional[List[str]] = None, + min_qv: Optional[float] = None, +) -> pd.DataFrame: + """ + Filters transcripts based on quality value and removes unwanted transcripts. + + Parameters + ---------- + transcripts_df : pd.DataFrame + The dataframe containing transcript data. + label : Optional[str] + The label of transcript features. + filter_substrings : Optional[str] + The list of feature substrings to remove. + min_qv : Optional[float] + The minimum quality value threshold for filtering transcripts. + + Returns + ------- + pd.DataFrame + The filtered dataframe. + """ + mask = pd.Series(True, index=transcripts_df.index) + if filter_substrings is not None and label is not None: + mask &= ~transcripts_df[label].str.startswith(tuple(filter_substrings)) + if min_qv is not None: + mask &= transcripts_df["qv"].ge(min_qv) + return transcripts_df[mask] + +def load_settings(sample_type: str) -> SimpleNamespace: + """ + Loads a matching YAML file from the _settings/ directory and converts its + contents into a SimpleNamespace. + + Parameters + ---------- + sample_type : str + Name of the sample type to load (case-insensitive). + + Returns + ------- + SimpleNamespace + The settings loaded from the YAML file as a SimpleNamespace. + + Raises + ------ + ValueError + If `sample_type` does not match any filenames. + """ + settings_dir = Path(__file__).parent.resolve() / '_settings' + # Get a list of YAML filenames (without extensions) in the _settings dir + filenames = [file.stem for file in settings_dir.glob('*.yaml')] + # Convert sample_type to lowercase and check if it matches any filename + sample_type = sample_type.lower() + if sample_type not in filenames: + msg = ( + f"Sample type '{sample_type}' not found in settings. 
" + f"Available options: {', '.join(filenames)}" + ) + raise FileNotFoundError(msg) + # Load the matching YAML file + yaml_file_path = settings_dir / f"{sample_type}.yaml" + with yaml_file_path.open('r') as file: + data = yaml.safe_load(file) + + # Convert the YAML data into a SimpleNamespace recursively + return _dict_to_namespace(data) + +def _dict_to_namespace(d): + """ + Recursively converts a dictionary to a SimpleNamespace. + """ + if isinstance(d, dict): + d = {k: _dict_to_namespace(v) for k, v in d.items()} + return SimpleNamespace(**d) + return d \ No newline at end of file diff --git a/src/segger/data/parquet/pyg_dataset.py b/src/segger/data/parquet/pyg_dataset.py new file mode 100644 index 0000000..cfc95a5 --- /dev/null +++ b/src/segger/data/parquet/pyg_dataset.py @@ -0,0 +1,67 @@ +from typing import List, Optional, Callable +from torch_geometric.data import InMemoryDataset, Data +import glob +import os +from pathlib import Path +import torch + +class STPyGDataset(InMemoryDataset): + """ + An in-memory dataset class for handling training using spatial + transcriptomics data. + """ + def __init__( + self, + root: str, + transform: Optional[Callable] = None, + pre_transform: Optional[Callable] = None, + pre_filter: Optional[Callable] = None + ): + super().__init__(root, transform, pre_transform, pre_filter) + os.makedirs(os.path.join(self.processed_dir, 'raw'), exist_ok=True) + + @property + def raw_file_names(self) -> List[str]: + """ + Return a list of raw file names in the raw directory. + + Returns: + List[str]: List of raw file names. + """ + return os.listdir(self.raw_dir) + + @property + def processed_file_names(self) -> List[str]: + """ + Return a list of processed file names in the processed directory. + + Returns: + List[str]: List of processed file names. + """ + paths = glob.glob(f'{self.processed_dir}/*.pt') + file_names = list(map(os.path.basename, paths)) + return file_names + + def len(self) -> int: + """ + Return the number of processed files. + + Returns: + int: Number of processed files. + """ + return len(self.processed_file_names) + + def get(self, idx: int) -> Data: + """ + Get a processed data object. + + Args: + idx (int): Index of the data object to retrieve. + + Returns: + Data: The processed data object. + """ + filepath = Path(self.processed_dir) / self.processed_file_names[idx] + data = torch.load(filepath) + data['tx'].x = data['tx'].x.to_dense() + return data \ No newline at end of file diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py new file mode 100644 index 0000000..314573f --- /dev/null +++ b/src/segger/data/parquet/sample.py @@ -0,0 +1,1227 @@ +import os +import shapely +from pyarrow import parquet as pq, compute as pc +import numpy as np +import pandas as pd +from pathlib import Path +import geopandas as gpd +from segger.data.parquet import _utils as utils +from scipy.spatial import KDTree, Rectangle +from segger.data.parquet._ndtree import NDTree +from functools import cached_property +from typing import List, Optional +import logging +from itertools import compress +from torch_geometric.data import HeteroData +from torch_geometric.transforms import RandomLinkSplit +import torch +from pqdm.threads import pqdm +import random +from segger.data.parquet.transcript_embedding import TranscriptEmbedding + + +# TODO: Add documentation for settings +class STSampleParquet(): + """ + A class to manage spatial transcriptomics data stored in parquet files. 
+ + This class provides methods for loading, processing, and saving data related + to ST samples. It supports parallel processing and efficient handling of + transcript and boundary data. + """ + + def __init__( + self, + base_dir: os.PathLike, + n_workers: Optional[int] = 1, + sample_type: str = None, + ): + """ + Initializes the STSampleParquet instance. + + Parameters + ---------- + base_dir : os.PathLike + The base directory containing the ST data. + n_workers : Optional[int], default 1 + The number of workers for parallel processing. + sample_type : Optional[str], default None + The sample type of the raw data, e.g., 'xenium' or 'merscope' + + Raises + ------ + FileNotFoundError + If the base directory does not exist or the required files are + missing. + """ + # Setup paths and resource constraints + self._base_dir = Path(base_dir) + self.settings = utils.load_settings(sample_type) + transcripts_fn = self.settings.transcripts.filename + self._transcripts_filepath = self._base_dir / transcripts_fn + boundaries_fn = self.settings.boundaries.filename + self._boundaries_filepath = self._base_dir / boundaries_fn + self.n_workers = n_workers + + # Setup logging + logging.basicConfig(level=logging.INFO) + self.logger = logging.Logger(f'STSample@{base_dir}') + + # Internal caches + self._extents = None + self._transcripts_metadata = None + self._boundaries_metadata = None + + # Setup default embedding for transcripts + classes = self.transcripts_metadata['feature_names'] + self._transcript_embedding = TranscriptEmbedding(np.array(classes)) + + + @classmethod + def _get_parquet_metadata( + cls, + filepath: os.PathLike, + columns: Optional[List[str]] = None, + ) -> dict: + """ + Reads and returns metadata from the parquet file. + + Parameters + ---------- + filepath : os.PathLike + The path to the parquet file. + columns : Optional[List[str]], default None + List of columns to extract metadata for. If None, all columns + are used. + + Returns + ------- + dict + A dictionary containing metadata such as the number of rows, + number of columns, and column sizes. + + Raises + ------ + FileNotFoundError + If the parquet file does not exist at the specified path. + KeyError + If any of the requested columns are not found in the parquet file. + """ + # Size in bytes of field dtypes + size_map = { + 'BOOLEAN': 1, + 'INT32': 4, + 'FLOAT': 4, + 'INT64': 8, + 'DOUBLE': 8, + 'BYTE_ARRAY': 8, + 'INT96': 12, + } + + # Read in metadata + metadata = pq.read_metadata(filepath) + if columns is None: + columns = metadata.schema.names + missing = set(columns) - set(metadata.schema.names) + if len(missing) > 0: + msg = f"Columns {', '.join(missing)} not found in schema." + raise KeyError(msg) + + # Grab important fields from metadata + summary = dict() + summary['n_rows'] = metadata.num_rows + summary['n_columns'] = len(columns) + summary['column_sizes'] = dict() + for c in columns: + # Error where 10X saved BOOLEAN field as INT32 in schema + if c == 'overlaps_nucleus': + dtype = 'BOOLEAN' + else: + i = metadata.schema.names.index(c) + dtype = metadata.schema[i].physical_type + summary['column_sizes'][c] = size_map[dtype] + + return summary + + + @cached_property + def transcripts_metadata(self) -> dict: + """ + Retrieves metadata for the transcripts stored in the sample. + + Returns + ------- + dict + Metadata dictionary for transcripts including column sizes and + feature names. + + Raises + ------ + FileNotFoundError + If the transcript parquet file does not exist. 
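        Example
        -------
        The returned dictionary has the following shape (values shown here are
        illustrative only):

            {
                'n_rows': 10000000,
                'n_columns': 8,
                'column_sizes': {'x_location': 4, 'y_location': 4, ...},
                'feature_names': ['GENE_A', 'GENE_B', ...],
            }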
+ """ + if self._transcripts_metadata is None: + # Base metadata + metadata = STSampleParquet._get_parquet_metadata( + self._transcripts_filepath, + self.settings.transcripts.columns, + ) + # Get filtered unique feature names + table = pq.read_table(self._transcripts_filepath) + names = pc.unique(table[self.settings.transcripts.label]) + pattern = '|'.join(self.settings.transcripts.filter_substrings) + mask = pc.invert(pc.match_substring_regex(names, pattern)) + metadata['feature_names'] = pc.filter(names, mask).tolist() + self._transcripts_metadata = metadata + return self._transcripts_metadata + + + @cached_property + def boundaries_metadata(self) -> dict: + """ + Retrieves metadata for the boundaries stored in the sample. + + Returns + ------- + dict + Metadata dictionary for boundaries including column sizes. + + Raises + ------ + FileNotFoundError + If the boundaries parquet file does not exist. + """ + if self._boundaries_metadata is None: + metadata = STSampleParquet._get_parquet_metadata( + self._boundaries_filepath, + self.settings.boundaries.columns, + ) + self._boundaries_metadata = metadata + return self._boundaries_metadata + + + @property + def n_transcripts(self) -> int: + """ + The total number of transcripts in the sample. + + Returns + ------- + int + The number of transcripts. + """ + return self.transcripts_metadata['n_rows'] + + + @cached_property + def extents(self) -> shapely.Polygon: + """ + The combined extents (bounding box) of the transcripts and boundaries. + + Returns + ------- + shapely.Polygon + The bounding box covering all transcripts and boundaries. + """ + if self._extents is None: + # Get individual extents + xy = self.settings.transcripts.xy + tx_extents = utils.get_xy_extents(self._transcripts_filepath, *xy) + xy = self.settings.boundaries.xy + bd_extents = utils.get_xy_extents(self._boundaries_filepath, *xy) + + # Combine extents and get bounding box + extents = tx_extents.union(bd_extents) + self._extents = shapely.box(*extents.bounds) + + return self._extents + + + def _get_balanced_regions( + self, + ) -> List[shapely.Polygon]: + """ + Splits the sample extents into balanced regions for parallel processing. + See NDTree documentation for more information. + + Returns + ------- + List[shapely.Polygon] + A list of polygons representing the regions. + """ + # If no. workers is 1, return full extents + if self.n_workers == 1: + return [self.extents] + + # Otherwise, split based on boundary distribution which is much smaller + # than transcripts DataFrame. + # Note: Assumes boundaries are distributed similarly to transcripts at + # a coarse level. + data = pd.read_parquet( + self._boundaries_filepath, + columns=self.settings.boundaries.xy, + ).values + ndtree = NDTree(data, self.n_workers) + + return ndtree.boxes + + + @staticmethod + def _setup_directory( + data_dir: os.PathLike, + ): + """ + Sets up the directory structure for saving processed tiles. + + Ensures that the necessary subdirectories for 'train', 'test', and + 'val' are created under the provided base directory. If any of these + subdirectories already exist and are not empty, an error is raised. + + Directory structure created: + ---------------------------- + data_dir/ + ├── train/ + │ └── processed/ + ├── test/ + │ └── processed/ + └── val/ + └── processed/ + + Parameters + ---------- + data_dir : os.PathLike + The path to the base directory where the data should be stored. + + Raises + ------ + AssertionError + If any of the 'processed' directories already contain files. 
+ """ + data_dir = Path(data_dir) # by default, convert to Path object + for dt in ['train', 'test', 'val']: + tile_dir = data_dir / dt / 'processed' + tile_dir.mkdir(parents=True, exist_ok=True) + if os.listdir(tile_dir): + msg = f"Directory '{tile_dir}' must be empty." + raise AssertionError(msg) + + + def set_transcript_embedding(self, weights: pd.DataFrame): + """ + Sets the transcript embedding for the sample. + + Parameters + ---------- + weights : pd.DataFrame + A DataFrame containing the weights for each transcript. + + Raises + ------ + ValueError + If the provided weights do not match the number of transcript + features. + """ + classes = self._transcripts_metadata['feature_names'] + self._transcript_embedding = TranscriptEmbedding(classes, weights) + + + def save( + self, + data_dir: os.PathLike, + k_bd: int = 3, + dist_bd: float = 15., + k_tx: int = 3, + dist_tx: float = 5., + tile_size: Optional[int] = None, + tile_width: Optional[float] = None, + tile_height: Optional[float] = None, + neg_sampling_ratio: float = 5., + frac: float = 1., + val_prob: float = 0.1, + test_prob: float = 0.2, + ): + """ + Saves the tiles of the sample as PyTorch geometric datasets. See + documentation for 'STTile' for more information on dataset contents. + + Note: This function requires either 'tile_size' OR both 'tile_width' and + 'tile_height' to be provided. + + Parameters + ---------- + data_dir : os.PathLike + The directory where the dataset should be saved. + k_bd : int, optional, default 3 + Number of nearest neighbors for boundary nodes. + dist_bd : float, optional, default 15.0 + Maximum distance for boundary neighbors. + k_tx : int, optional, default 3 + Number of nearest neighbors for transcript nodes. + dist_tx : float, optional, default 5.0 + Maximum distance for transcript neighbors. + tile_size : int, optional + If provided, specifies the size of the tile. Overrides `tile_width` + and `tile_height`. + tile_width : int, optional + Width of the tiles in pixels. Ignored if `tile_size` is provided. + tile_height : int, optional + Height of the tiles in pixels. Ignored if `tile_size` is provided. + neg_sampling_ratio : float, optional, default 5.0 + Ratio of negative samples. + frac : float, optional, default 1.0 + Fraction of the dataset to process. + val_prob: float, optional, default 0.1 + Proportion of data for use for validation split. + test_prob: float, optional, default 0.2 + Proportion of data for use for test split. + + Raises + ------ + ValueError + If the 'frac' parameter is greater than 1.0 or if the calculated + number of tiles is zero. + AssertionError + If the specified directory structure is not properly set up. + """ + # Check inputs + try: + if frac > 1: + msg = f"Arg 'frac' should be <= 1.0, but got {frac}." + raise ValueError(msg) + n_tiles = self.n_transcripts / tile_size / self.n_workers * frac + if int(n_tiles) == 0: + msg = f"Sampling parameters would yield 0 total tiles." 
+ raise ValueError(msg) + # Propagate errors to logging + except Exception as e: + self.logger.error(str(e), exc_info=True) + raise e + + # Setup directory structure to save tiles + data_dir = Path(data_dir) + STSampleParquet._setup_directory(data_dir) + + # Function to parallelize over workers + def func(region): + xm = STInMemoryDataset(sample=self, extents=region) + tiles = xm._tile(tile_width, tile_height, tile_size) + if frac < 1: + tiles = random.sample(tiles, int(len(tiles) * frac)) + for tile in tiles: + # Choose training, test, or validation datasets + data_type = np.random.choice( + a=['train', 'test', 'val'], + p=[1 - (test_prob + val_prob), test_prob, val_prob], + ) + xt = STTile(dataset=xm, extents=tile) + pyg_data = xt.to_pyg_dataset( + k_bd=k_bd, + dist_bd=dist_bd, + k_tx=k_tx, + dist_tx=dist_tx, + neg_sampling_ratio=neg_sampling_ratio, + ) + filepath = data_dir / data_type / 'processed' / f'{xt.uid}.pt' + torch.save(pyg_data, filepath) + + # TODO: Add Dask backend + regions = self._get_balanced_regions() + pqdm(regions, func, n_jobs=self.n_workers) + + +# TODO: Add documentation for settings +class STInMemoryDataset(): + """ + A class for handling in-memory representations of ST data. + + This class is used to load and manage ST sample data from parquet files, + filter boundaries and transcripts, and provide spatial tiling for further + analysis. The class also pre-loads KDTrees for efficient spatial queries. + + Parameters + ---------- + sample : STSampleParquet + The ST sample containing paths to the data files. + extents : shapely.Polygon + The polygon defining the spatial extents for the dataset. + margin : int, optional, default 10 + The margin to buffer around the extents when filtering data. + + Attributes + ---------- + sample : STSampleParquet + The ST sample from which the data is loaded. + extents : shapely.Polygon + The spatial extents of the dataset. + margin : int + The buffer margin around the extents for filtering. + transcripts : pd.DataFrame + The filtered transcripts within the dataset extents. + boundaries : pd.DataFrame + The filtered boundaries within the dataset extents. + kdtree_tx : KDTree + The KDTree for fast spatial queries on the transcripts. + + Raises + ------ + ValueError + If the transcripts or boundaries could not be loaded or filtered. + """ + + def __init__( + self, + sample: STSampleParquet, + extents: shapely.Polygon, + margin: int = 10, + ): + """ + Initializes the STInMemoryDataset instance by loading transcripts + and boundaries from parquet files and pre-loading a KDTree for fast + spatial queries. + + Parameters + ---------- + sample : STSampleParquet + The ST sample containing paths to the data files. + extents : shapely.Polygon + The polygon defining the spatial extents for the dataset. + margin : int, optional, default 10 + The margin to buffer around the extents when filtering data. + """ + # Set properties + self.sample = sample + self.extents = extents + self.margin = margin + self.settings = self.sample.settings + + # Load data from parquet files + self._load_transcripts(self.sample._transcripts_filepath) + self._load_boundaries(self.sample._boundaries_filepath) + + # Pre-load KDTrees + self.kdtree_tx = KDTree( + self.transcripts[self.settings.transcripts.xy], + leafsize=100 + ) + + + def _load_transcripts(self, path: os.PathLike, min_qv: float = 30.0): + """ + Loads and filters the transcripts dataframe for the dataset. + + Parameters + ---------- + path : os.PathLike + The file path to the transcripts parquet file. 
+ min_qv : float, optional, default 30.0 + The minimum quality value (QV) for filtering transcripts. + + Raises + ------ + ValueError + If the transcripts dataframe cannot be loaded or filtered. + """ + # Load and filter transcripts dataframe + bounds = self.extents.buffer(self.margin, join_style='mitre') + transcripts = utils.read_parquet_region( + path, + x=self.settings.transcripts.x, + y=self.settings.transcripts.y, + bounds=bounds, + extra_columns=self.settings.transcripts.columns, + ) + transcripts = utils.filter_transcripts( + transcripts, + self.settings.transcripts.label, + self.settings.transcripts.filter_substrings, + min_qv, + ) + + # Only set object properties once everything finishes successfully + self.transcripts = transcripts + + + def _load_boundaries(self, path: os.PathLike): + """ + Loads and filters the boundaries dataframe for the dataset. + + Parameters + ---------- + path : os.PathLike + The file path to the boundaries parquet file. + + Raises + ------ + ValueError + If the boundaries dataframe cannot be loaded or filtered. + """ + # Load and filter boundaries dataframe + outset = self.extents.buffer(self.margin, join_style='mitre') + boundaries = utils.read_parquet_region( + path, + x=self.settings.boundaries.x, + y=self.settings.boundaries.y, + bounds=outset, + extra_columns=self.settings.boundaries.columns, + ) + boundaries = utils.filter_boundaries( + boundaries, + inset=self.extents, + outset=outset, + x=self.settings.boundaries.x, + y=self.settings.boundaries.y, + label=self.settings.boundaries.label, + ) + self.boundaries = boundaries + + + def _get_rectangular_tile_bounds( + self, + tile_width: float, + tile_height: float, + ) -> List[shapely.Polygon]: + """ + Generates rectangular tiles for the dataset based on the extents. + + Parameters + ---------- + tile_width : float + The width of each tile. + tile_height : float + The height of each tile. + + Returns + ------- + List[shapely.Polygon] + A list of polygons representing the rectangular tiles. + """ + # Generate the x and y coordinates for the tile boundaries + x_min, y_min, x_max, y_max = self.extents.bounds + x_coords = np.arange(x_min, x_max, tile_width) + x_coords = np.append(x_coords, x_max) + y_coords = np.arange(y_min, y_max, tile_height) + y_coords = np.append(y_coords, y_max) + + # Generate tiles from grid points + tiles = [] + for x_min, x_max in zip(x_coords[:-1], x_coords[1:]): + for y_min, y_max in zip(y_coords[:-1], y_coords[1:]): + tiles.append(shapely.box(x_min, y_min, x_max, y_max)) + + return tiles + + + def _get_balanced_tile_bounds( + self, + max_size: Optional[int], + ) -> List[shapely.Polygon]: + """ + Generates spatially balanced tiles based on KDTree partitioning. + + Parameters + ---------- + max_size : Optional[int] + The maximum number of points in each tile. + + Returns + ------- + List[shapely.Polygon] + A list of polygons representing balanced tile bounds. + + Raises + ------ + ValueError + If `max_size` is smaller than the KDTree's leaf size. + """ + # Can only request up to brute force resolution of KDTree + leafsize = self.kdtree_tx.leafsize + if max_size < leafsize: + msg = f"Arg 'max_size' less than KDTree 'leafsize', {leafsize}." 
+ raise ValueError(msg) + + # DFS search to construct tile bounds + def recurse(node, bounds): + if node.children <= max_size: + bounds = shapely.box(*bounds.mins, *bounds.maxes) + return [bounds] + lb, gb = bounds.split(node.split_dim, node.split) + return recurse(node.less, lb) + recurse(node.greater, gb) + + node = self.kdtree_tx.tree + bounds = Rectangle(self.kdtree_tx.mins, self.kdtree_tx.maxes) + return recurse(node, bounds) + + + def _tile(self, + width: Optional[float] = None, + height: Optional[float] = None, + max_size: Optional[int] = None, + ) -> List[shapely.Polygon]: + """ + Generates tiles based on either fixed dimensions or balanced + partitioning. + + Parameters + ---------- + width : Optional[float] + The width of each tile. Required if `max_size` is not provided. + height : Optional[float] + The height of each tile. Required if `max_size` is not provided. + max_size : Optional[int] + The maximum number of points in each tile. Required if `width` and + `height` are not provided. + + Returns + ------- + List[shapely.Polygon] + A list of polygons representing the tiles. + + Raises + ------ + ValueError + If both `width`/`height` and `max_size` are provided or none are + provided. + """ + # Square tiling kwargs provided + if not max_size and (width and height): + return self._get_rectangular_tile_bounds(width, height) + # Balanced tiling kwargs provided or None + elif not (width or height): + return self._get_balanced_tile_bounds(max_size) + # Bad set of kwargs + else: + args = list(compress(locals().keys(), locals().values())) + args.remove('self') + msg = ( + "Function requires either 'max_size' or both " + f"'width' and 'height'. Found: {', '.join(args)}." + ) + logging.error(msg) + raise ValueError + + +# TODO: Add documentation for settings +class STTile: + """ + A class representing a tile of a ST sample. + + Attributes + ---------- + dataset : STInMemoryDataset + The ST dataset containing data. + extents : shapely.Polygon + The extents of the tile in the sample. + boundaries : pd.DataFrame + Filtered boundaries within the tile extents. + transcripts : pd.DataFrame + Filtered transcripts within the tile extents. + """ + + def __init__( + self, + dataset: STInMemoryDataset, + extents: shapely.Polygon, + ): + """ + Initializes a STTile instance. + + Parameters + ---------- + dataset : STInMemoryDataset + The ST dataset containing data. + extents : shapely.Polygon + The extents of the tile in the sample. + + Notes + ----- + The `boundaries` and `transcripts` attributes are cached to avoid the + overhead of filtering when tiles are instantiated. This is particularly + useful in multiprocessing settings where generating tiles in parallel + could lead to high overhead. + + Internal Attributes + -------------------- + _boundaries : pd.DataFrame, optional + Cached DataFrame of filtered boundaries. Initially set to None. + _transcripts : pd.DataFrame, optional + Cached DataFrame of filtered transcripts. Initially set to None. + """ + self.dataset = dataset + self.extents = extents + self.margin = dataset.margin + self.settings = self.dataset.settings + + # Internal caches for filtered data + self._boundaries = None + self._transcripts = None + + + @property + def uid(self) -> str: + """ + Generates a unique identifier for the tile based on its extents. This + UID is particularly useful for saving or indexing tiles in distributed + processing environments. 
+ + The UID is constructed using the minimum and maximum x and y coordinates + of the tile's bounding box, representing its position and size in the + sample. + + Returns + ------- + str + A unique identifier string in the format + 'x=_y=_w=_h=' where: + - ``: Minimum x-coordinate of the tile's extents. + - ``: Minimum y-coordinate of the tile's extents. + - ``: Width of the tile. + - ``: Height of the tile. + + Example + ------- + If the tile's extents are bounded by (x_min, y_min) = (100, 200) and + (x_max, y_max) = (150, 250), the generated UID would be: + 'x=100_y=200_w=50_h=50' + """ + x_min, y_min, x_max, y_max = map(int, self.extents.bounds) + uid = f'x={x_min}_y={y_min}_w={x_max-x_min}_h={y_max-y_min}' + return uid + + + @cached_property + def boundaries(self) -> pd.DataFrame: + """ + Returns the filtered boundaries within the tile extents, cached for + efficiency. + + The boundaries are computed only once and cached. If the boundaries + have not been computed yet, they are computed using + `get_filtered_boundaries()`. + + Returns + ------- + pd.DataFrame + A DataFrame containing the filtered boundaries within the tile + extents. + """ + if self._boundaries is None: + self._boundaries = self.get_filtered_boundaries() + return self._boundaries + + + @cached_property + def transcripts(self) -> pd.DataFrame: + """ + Returns the filtered transcripts within the tile extents, cached for + efficiency. + + The transcripts are computed only once and cached. If the transcripts + have not been computed yet, they are computed using + `get_filtered_transcripts()`. + + Returns + ------- + pd.DataFrame + A DataFrame containing the filtered transcripts within the tile + extents. + """ + if self._transcripts is None: + self._transcripts = self.get_filtered_transcripts() + return self._transcripts + + + def get_filtered_boundaries(self) -> pd.DataFrame: + """ + Filters the boundaries in the sample to include only those within + the specified tile extents. + + Returns + ------- + pd.DataFrame + A DataFrame containing the filtered boundaries within the tile + extents. + """ + filtered_boundaries = utils.filter_boundaries( + boundaries=self.dataset.boundaries, + inset=self.extents, + outset=self.extents.buffer(self.margin, join_style='mitre'), + x=self.settings.boundaries.x, + y=self.settings.boundaries.y, + label=self.settings.boundaries.label, + ) + return filtered_boundaries + + + def get_filtered_transcripts(self) -> pd.DataFrame: + """ + Filters the transcripts in the sample to include only those within + the specified tile extents. + + Returns + ------- + pd.DataFrame + A DataFrame containing the filtered transcripts within the tile + extents. + """ + + # Buffer tile bounds to include transcripts around boundary + outset = self.extents.buffer(self.margin, join_style='mitre') + xmin, ymin, xmax, ymax = outset.bounds + + # Get transcripts inside buffered region + x, y = self.settings.transcripts.xy + mask = self.dataset.transcripts[x].between(xmin, xmax) + mask &= self.dataset.transcripts[y].between(ymin, ymax) + filtered_transcripts = self.dataset.transcripts[mask] + + return filtered_transcripts + + + def get_transcript_props(self) -> torch.Tensor: + """ + Encodes transcript features in a sparse format. + + Returns + ------- + props : torch.Tensor + A sparse tensor containing the encoded transcript features. + + Notes + ----- + The intention is for this function to simplify testing new strategies + for 'tx' node representations. 
For example, the encoder can be any type + of encoder that transforms the transcript labels into a numerical + matrix (in sparse format). + """ + # Encode transcript features in sparse format + embedding = self.dataset.sample._transcript_embedding + label = self.settings.transcripts.label + props = embedding.embed(self.transcripts[label]) + + return props + + + @staticmethod + def get_polygon_props( + polygons: gpd.GeoSeries, + area: bool = True, + convexity: bool = True, + elongation: bool = True, + circularity: bool = True, + ) -> pd.DataFrame: + """ + Computes geometric properties of polygons. + + Parameters + ---------- + polygons : gpd.GeoSeries + A GeoSeries containing polygon geometries. + area : bool, optional + If True, compute the area of each polygon (default is True). + convexity : bool, optional + If True, compute the convexity of each polygon (default is True). + elongation : bool, optional + If True, compute the elongation of each polygon (default is True). + circularity : bool, optional + If True, compute the circularity of each polygon (default is True). + + Returns + ------- + props : pd.DataFrame + A DataFrame containing the computed properties for each polygon. + """ + props = pd.DataFrame(index=polygons.index, dtype=float) + if area: + props['area'] = polygons.area + if convexity: + props['convexity'] = polygons.convex_hull.area / polygons.area + if elongation: + rects = polygons.minimum_rotated_rectangle() + props['elongation'] = rects.area / polygons.envelope.area + if circularity: + r = polygons.minimum_bounding_radius() + props["circularity"] = polygons.area / r ** 2 + + return props + + + @staticmethod + def get_kdtree_edge_index( + index_coords: np.ndarray, + query_coords: np.ndarray, + k: int, + max_distance: float, + ): + """ + Computes the k-nearest neighbor edge indices using a KDTree. + + Parameters + ---------- + index_coords : np.ndarray + An array of shape (n_samples, n_features) representing the + coordinates of the points to be indexed. + query_coords : np.ndarray + An array of shape (m_samples, n_features) representing the + coordinates of the query points. + k : int + The number of nearest neighbors to find for each query point. + max_distance : float + The maximum distance to consider for neighbors. + + Returns + ------- + torch.Tensor + An array of shape (2, n_edges) containing the edge indices. Each + column represents an edge between two points, where the first row + contains the source indices and the second row contains the target + indices. + """ + # KDTree search + tree = KDTree(index_coords) + dist, idx = tree.query(query_coords, k, max_distance) + + # To sparse adjacency + edge_index = np.argwhere(dist != np.inf).T + edge_index[1] = idx[dist != np.inf] + edge_index = torch.tensor(edge_index, dtype=torch.long).contiguous() + + return edge_index + + + def get_boundary_props( + self, + area: bool = True, + convexity: bool = True, + elongation: bool = True, + circularity: bool = True, + ) -> torch.Tensor: + """ + Computes geometric properties of boundary polygons. + + Parameters + ---------- + area : bool, optional + If True, compute the area of each boundary polygon (default is + True). + convexity : bool, optional + If True, compute the convexity of each boundary polygon (default is + True). + elongation : bool, optional + If True, compute the elongation of each boundary polygon (default is + True). + circularity : bool, optional + If True, compute the circularity of each boundary polygon (default + is True). 
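# [Illustrative sketch — not part of the patch] What get_kdtree_edge_index above computes:
# query k nearest neighbours within a radius and keep only the finite hits, giving a
# (2, n_edges) COO edge index. Coordinates are toy values; the radius is passed as
# distance_upper_bound so misses come back as inf and are dropped.
import numpy as np
import torch
from scipy.spatial import KDTree

index_coords = np.array([[0.0, 0.0], [1.0, 0.0], [10.0, 10.0]])
query_coords = np.array([[0.1, 0.0], [9.5, 10.0]])

tree = KDTree(index_coords)
dist, idx = tree.query(query_coords, k=2, distance_upper_bound=2.0)

edge_index = np.argwhere(dist != np.inf).T   # row 0: query (source) indices
edge_index[1] = idx[dist != np.inf]          # row 1: neighbour (target) indices
edge_index = torch.tensor(edge_index, dtype=torch.long)
print(edge_index)  # tensor([[0, 0, 1], [0, 1, 2]])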
+ + Returns + ------- + torch.Tensor + A tensor containing the computed properties for each boundary + polygon. + + Notes + ----- + The intention is for this function to simplify testing new strategies + for 'bd' node representations. You can just change the function body to + return another torch.Tensor without worrying about changes to the rest + of the code. + """ + # Get polygons from coordinates + polygons = utils.get_polygons_from_xy( + self.boundaries, + x=self.settings.boundaries.x, + y=self.settings.boundaries.y, + label=self.settings.boundaries.label, + ) + # Geometric properties of polygons + props = self.get_polygon_props(polygons) + props = torch.as_tensor(props.values).float() + + return props + + + def to_pyg_dataset( + self, + #train: bool, + neg_sampling_ratio: float = 5, + k_bd: int = 3, + dist_bd: float = 15, + k_tx: int = 3, + dist_tx: float = 5, + area: bool = True, + convexity: bool = True, + elongation: bool = True, + circularity: bool = True, + ) -> HeteroData: + """ + Converts the sample data to a PyG HeteroData object (more information + on the structure of the object below). + + Parameters + ---------- + train: bool + Whether a sample is part of the training dataset. If True, add + negative edges to dataset. + k_bd : int, optional + The number of nearest neighbors for the 'bd' nodes (default is 4). + dist_bd : float, optional + The maximum distance for neighbors of 'bd' nodes (default is 20). + k_tx : int, optional + The number of nearest neighbors for the 'tx' nodes (default is 4). + dist_tx : float, optional + The maximum distance for neighbors of 'tx' nodes (default is 20). + area : bool, optional + If True, compute the area of each polygon (default is True). + convexity : bool, optional + If True, compute the convexity of each polygon (default is True). + elongation : bool, optional + If True, compute the elongation of each polygon (default is True). + circularity : bool, optional + If True, compute the circularity of each polygon (default is True). + + Returns + ------- + data : HeteroData + A PyG HeteroData object containing the sample data. + + Segger PyG HeteroData Spec + -------------------------- + A heterogenous graph with two node types and two edge types. + + Node Types + ---------- + 1. Boundary ("bd") + Represents boundaries (typically cells) in the ST dataset. + + Attributes + ---------- + id : str + Cell ID originating from the ST sample. + pos : np.ndarray + X, Y coordinates of the centroid of the polygon boundary. + x : torch.tensor + May include area, convexity, elongation, and circularity + of the polygon boundary (user-specified). + + 2. Transcript ("tx") + Represents transcripts in the ST dataset. + + Attributes + ---------- + id : int64 + Transcript ID originating from ST sample. + pos : np.ndarray + X, Y, Z coordinates of the transcript. + x : torch.tensor + Sparse one-hot encoding of the transcript gene name. + + Edge Types + ---------- + 1. ("tx", "belongs", "bd") + Represents the relationship where a transcript is contained within + a boundary. + + Attributes + ---------- + edge_index : torch.Tensor + Edge indices in COO format between transcripts and nuclei + + 2. ("tx", "neighbors", "bd") + Represents the relationship where a transcript is nearby but not + within a boundary. + + Attributes + ---------- + edge_index : torch.Tensor + Edge indices in COO format between transcripts and boundaries + + 3. ("tx", "neighbors", "tx") + Represents the relationship where a transcript is nearby another + transcript. 
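# [Illustrative sketch — not part of the patch] A minimal HeteroData laid out like the
# spec described above: 'bd' and 'tx' node stores plus the three edge types. All shapes,
# IDs and feature values are toy placeholders; real tiles fill them from the filtered
# boundaries and transcripts.
import numpy as np
import torch
from torch_geometric.data import HeteroData

data = HeteroData()
data['bd'].id = np.array(['cell_a', 'cell_b'])
data['bd'].pos = torch.rand(2, 2)    # boundary centroids (x, y)
data['bd'].x = torch.rand(2, 4)      # e.g. area, convexity, elongation, circularity
data['tx'].id = torch.arange(5)
data['tx'].pos = torch.rand(5, 3)    # transcript (x, y, z)
data['tx'].x = torch.rand(5, 8)      # gene embedding (one-hot or weighted)
data['tx', 'belongs', 'bd'].edge_index = torch.tensor([[0, 1, 3], [0, 0, 1]])
data['tx', 'neighbors', 'bd'].edge_index = torch.tensor([[0, 1, 2, 3, 4], [0, 0, 1, 1, 1]])
data['tx', 'neighbors', 'tx'].edge_index = torch.tensor([[0, 1, 2], [1, 2, 3]])
print(data)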
+ + Attributes + ---------- + edge_index : torch.Tensor + Edge indices in COO format between transcripts and transcripts + """ + # Initialize an empty HeteroData object + pyg_data = HeteroData() + + # Set up Boundary nodes + polygons = utils.get_polygons_from_xy( + self.boundaries, + self.settings.boundaries.x, + self.settings.boundaries.y, + self.settings.boundaries.label, + ) + centroids = polygons.centroid.get_coordinates() + pyg_data['bd'].id = polygons.index.to_numpy() + pyg_data['bd'].pos = centroids.values + pyg_data['bd'].x = self.get_boundary_props( + area, convexity, elongation, circularity + ) + + # Set up Transcript nodes + pyg_data['tx'].id = self.transcripts[ + self.settings.transcripts.id + ].values + pyg_data['tx'].pos = self.transcripts[ + self.settings.transcripts.xyz + ].values + pyg_data['tx'].x = self.get_transcript_props() + + # Set up Boundary-Transcript neighbor edges + dist = np.sqrt(polygons.area.max()) * 10 # heuristic distance + nbrs_edge_idx = self.get_kdtree_edge_index( + centroids, + self.transcripts[self.settings.transcripts.xy], + k=k_bd, + max_distance=dist, + ) + pyg_data["tx", "neighbors", "bd"].edge_index = nbrs_edge_idx + + # Set up Transcript-Transcript neighbor edges + nbrs_edge_idx = self.get_kdtree_edge_index( + self.transcripts[self.settings.transcripts.xy], + self.transcripts[self.settings.transcripts.xy], + k=k_tx, + max_distance=dist_tx, + ) + pyg_data["tx", "neighbors", "tx"].edge_index = nbrs_edge_idx + + # Find nuclear transcripts + tx_cell_ids = self.transcripts[self.settings.boundaries.id] + cell_ids_map = {idx: i for (i, idx) in enumerate(polygons.index)} + is_nuclear = self.transcripts[ + self.settings.transcripts.nuclear + ].astype(bool) + is_nuclear &= tx_cell_ids.isin(polygons.index) + + # Set up overlap edges + row_idx = np.where(is_nuclear)[0] + col_idx = tx_cell_ids.iloc[row_idx].map(cell_ids_map) + blng_edge_idx = torch.tensor(np.stack([row_idx, col_idx])).long() + pyg_data["tx", "belongs", "bd"].edge_index = blng_edge_idx + + # Add negative edges for training + # Need more time-efficient solution than this + edge_type = ('tx', 'belongs', 'bd') + transform = RandomLinkSplit( + num_val=0, + num_test=0, + is_undirected=True, + edge_types=[edge_type], + neg_sampling_ratio=neg_sampling_ratio, + ) + pyg_data, _, _ = transform(pyg_data) + + # Refilter negative edges to include only transcripts in the + # original positive edges (still need a memory-efficient solution) + edges = pyg_data[edge_type] + mask = edges.edge_label_index[0].unsqueeze(1) == \ + edges.edge_index[0].unsqueeze(0) + mask = torch.nonzero(torch.any(mask, 1)).squeeze() + edges.edge_label_index = edges.edge_label_index[:, mask] + edges.edge_label = edges.edge_label[mask] + + return pyg_data diff --git a/src/segger/data/parquet/transcript_embedding.py b/src/segger/data/parquet/transcript_embedding.py new file mode 100644 index 0000000..2f8085c --- /dev/null +++ b/src/segger/data/parquet/transcript_embedding.py @@ -0,0 +1,71 @@ +import torch.nn +import torch.nn.functional as F +from torch import Tensor, LongTensor +from sklearn.preprocessing import LabelEncoder +from typing import Optional, Union +from numpy.typing import ArrayLike +import pandas as pd + +# TODO: Add documentation +class TranscriptEmbedding(torch.nn.Module): + ''' + Utility class to handle transcript embeddings in PyTorch so that they are + optionally learnable in the future. + + Default behavior is to use the index of gene names. 
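# [Illustrative sketch — not part of the patch] What TranscriptEmbedding.embed boils down
# to: gene names are label-encoded; without a weight table the integer indices (or their
# one-hot form) are used directly, otherwise rows of the weight matrix are looked up with
# F.embedding. The gene names and the 4-dimensional weight table are toy assumptions.
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder

genes = ['ACTB', 'CD3E', 'EPCAM']
weights = pd.DataFrame(torch.rand(3, 4).numpy(), index=genes)  # e.g. celltype abundances

encoder = LabelEncoder().fit(genes)
observed = ['EPCAM', 'ACTB', 'EPCAM']
indices = torch.as_tensor(encoder.transform(observed), dtype=torch.long)

one_hot = F.one_hot(indices, num_classes=len(encoder.classes_))
weighted = F.embedding(indices, torch.tensor(weights.loc[genes].values, dtype=torch.float32))
print(one_hot.shape, weighted.shape)  # torch.Size([3, 3]) torch.Size([3, 4])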
+ ''' + + # TODO: Add documentation + @staticmethod + def _check_inputs( + classes: ArrayLike, + weights: Union[pd.DataFrame, None], + ): + # Classes is a 1D array + if len(classes.shape) > 1: + msg = ( + "'classes' should be a 1D array, got an array of shape " + f"{classes.shape} instead." + ) + raise ValueError(msg) + # Items appear exactly once + if len(classes) != len(set(classes)): + msg = ( + "All embedding classes must be unique. One or more items in " + "'classes' appears twice." + ) + raise ValueError(msg) + # All classes have an entry in weights + elif weights is not None: + missing = set(classes).difference(weights.index) + if len(missing) > 0: + msg = ( + f"Index of 'weights' DataFrame is missing {len(missing)} " + "entries compared to classes." + ) + raise ValueError(msg) + + # TODO: Add documentation + def __init__( + self, + classes: ArrayLike, + weights: Optional[pd.DataFrame] = None, + ): + # check input arguments + TranscriptEmbedding._check_inputs(classes, weights) + # Setup as PyTorch module + super(TranscriptEmbedding, self).__init__() + self._encoder = LabelEncoder().fit(classes) + if weights is None: + self._weights = None + else: + self._weights = Tensor(weights.loc[classes].values) + + # TODO: Add documentation + def embed(self, classes: ArrayLike): + indices = LongTensor(self._encoder.transform(classes)) + # Default, one-hot encoding + if self._weights is None: + return indices #F.one_hot(indices, len(self._encoder.classes_)) + else: + return F.embedding(indices, self._weights) From 49755cff75163970fcefaa368699021100ee17b7 Mon Sep 17 00:00:00 2001 From: andrewmoorman Date: Fri, 27 Sep 2024 17:06:27 -0400 Subject: [PATCH 059/156] Bug fixes in parquet-based data creation to match new prediction code --- src/segger/data/parquet/pyg_dataset.py | 3 +- src/segger/data/parquet/sample.py | 41 +++++++++++++---------- src/segger/data/utils.py | 6 +++- src/segger/prediction/predict.py | 4 +-- src/segger/training/segger_data_module.py | 14 +++----- 5 files changed, 35 insertions(+), 33 deletions(-) diff --git a/src/segger/data/parquet/pyg_dataset.py b/src/segger/data/parquet/pyg_dataset.py index cfc95a5..a464e72 100644 --- a/src/segger/data/parquet/pyg_dataset.py +++ b/src/segger/data/parquet/pyg_dataset.py @@ -18,7 +18,6 @@ def __init__( pre_filter: Optional[Callable] = None ): super().__init__(root, transform, pre_transform, pre_filter) - os.makedirs(os.path.join(self.processed_dir, 'raw'), exist_ok=True) @property def raw_file_names(self) -> List[str]: @@ -38,7 +37,7 @@ def processed_file_names(self) -> List[str]: Returns: List[str]: List of processed file names. """ - paths = glob.glob(f'{self.processed_dir}/*.pt') + paths = glob.glob(f'{self.processed_dir}/x=*_y=*_w=*_h=*.pt') file_names = list(map(os.path.basename, paths)) return file_names diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index 314573f..4c79e20 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -280,11 +280,11 @@ def _setup_directory( Directory structure created: ---------------------------- data_dir/ - ├── train/ + ├── train_tiles/ │ └── processed/ - ├── test/ + ├── test_tiles/ │ └── processed/ - └── val/ + └── val_tiles/ └── processed/ Parameters @@ -298,12 +298,13 @@ def _setup_directory( If any of the 'processed' directories already contain files. 
""" data_dir = Path(data_dir) # by default, convert to Path object - for dt in ['train', 'test', 'val']: - tile_dir = data_dir / dt / 'processed' - tile_dir.mkdir(parents=True, exist_ok=True) - if os.listdir(tile_dir): - msg = f"Directory '{tile_dir}' must be empty." - raise AssertionError(msg) + for tile_type in ['train_tiles', 'test_tiles', 'val_tiles']: + for stage in ['raw', 'processed']: + tile_dir = data_dir / tile_type / stage + tile_dir.mkdir(parents=True, exist_ok=True) + if os.listdir(tile_dir): + msg = f"Directory '{tile_dir}' must be empty." + raise AssertionError(msg) def set_transcript_embedding(self, weights: pd.DataFrame): @@ -410,7 +411,7 @@ def func(region): for tile in tiles: # Choose training, test, or validation datasets data_type = np.random.choice( - a=['train', 'test', 'val'], + a=['train_tiles', 'test_tiles', 'val_tiles'], p=[1 - (test_prob + val_prob), test_prob, val_prob], ) xt = STTile(dataset=xm, extents=tile) @@ -426,7 +427,9 @@ def func(region): # TODO: Add Dask backend regions = self._get_balanced_regions() - pqdm(regions, func, n_jobs=self.n_workers) + for region in regions: + func(region) + #pqdm(regions, func, n_jobs=self.n_workers) # TODO: Add documentation for settings @@ -1156,18 +1159,20 @@ def to_pyg_dataset( ) centroids = polygons.centroid.get_coordinates() pyg_data['bd'].id = polygons.index.to_numpy() - pyg_data['bd'].pos = centroids.values + pyg_data['bd'].pos = torch.tensor(centroids.values, dtype=torch.float32) pyg_data['bd'].x = self.get_boundary_props( area, convexity, elongation, circularity ) # Set up Transcript nodes - pyg_data['tx'].id = self.transcripts[ - self.settings.transcripts.id - ].values - pyg_data['tx'].pos = self.transcripts[ - self.settings.transcripts.xyz - ].values + pyg_data['tx'].id = torch.tensor( + self.transcripts[self.settings.transcripts.id].values.astype(int), + dtype=torch.int, + ) + pyg_data['tx'].pos = torch.tensor( + self.transcripts[self.settings.transcripts.xyz].values, + dtype=torch.float32, + ) pyg_data['tx'].x = self.get_transcript_props() # Set up Boundary-Transcript neighbor edges diff --git a/src/segger/data/utils.py b/src/segger/data/utils.py index a847f5c..ad5cdf4 100644 --- a/src/segger/data/utils.py +++ b/src/segger/data/utils.py @@ -285,7 +285,11 @@ def get_edge_index(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: pass # return get_edge_index_cuda(coords_1, coords_2, k=k, dist=dist) else: - raise ValueError(f"Unknown method {method}") + msg = ( + f"Unknown method {method}. Valid methods include: 'kd_tree', " + "'faiss', and 'cuda'." + ) + raise ValueError() diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index 90d85d2..2fbd9ad 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -297,7 +297,7 @@ def segment( use_cc: bool = True, file_format: str = 'anndata', receptive_field: dict = {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}, - knn_method: str = 'cuda', + knn_method: str = 'kd_tree', verbose: bool = False, **anndata_kwargs ) -> None: @@ -323,7 +323,7 @@ def segment( use_cc : bool, optional If to further re-group transcripts that have not been assigned to any nucleus. knn_method : str, optional - The method to use for nearest neighbors ('cuda' by default). + The method to use for nearest neighbors ('kd_tree' by default). **anndata_kwargs : dict, optional Additional keyword arguments passed to the create_anndata function. 
diff --git a/src/segger/training/segger_data_module.py b/src/segger/training/segger_data_module.py index 678194e..c1be43d 100644 --- a/src/segger/training/segger_data_module.py +++ b/src/segger/training/segger_data_module.py @@ -2,7 +2,7 @@ from torch_geometric.loader import DataLoader import os from pathlib import Path -from segger.data.io import SpatialTranscriptomicsDataset +from segger.data.parquet.pyg_dataset import STPyGDataset # TODO: Add documentation @@ -21,15 +21,9 @@ def __init__( # TODO: Add documentation def setup(self, stage=None): - self.train = SpatialTranscriptomicsDataset( - root=self.data_dir / 'train_tiles' - ) - self.test = SpatialTranscriptomicsDataset( - root=self.data_dir / 'test_tiles' - ) - self.val = SpatialTranscriptomicsDataset( - root=self.data_dir / 'val_tiles' - ) + self.train = STPyGDataset(root=self.data_dir / 'train_tiles') + self.test = STPyGDataset(root=self.data_dir / 'test_tiles') + self.val = STPyGDataset(root=self.data_dir / 'val_tiles') self.loader_kwargs = dict( batch_size=self.batch_size, num_workers=self.num_workers, From 0c03cc97cd11c7beb6352032c8aa1d1150abd9a4 Mon Sep 17 00:00:00 2001 From: andrewmoorman Date: Fri, 27 Sep 2024 17:20:31 -0400 Subject: [PATCH 060/156] Re-added pqdm for parallelization :/ --- src/segger/data/parquet/sample.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index 4c79e20..970f17f 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -427,9 +427,7 @@ def func(region): # TODO: Add Dask backend regions = self._get_balanced_regions() - for region in regions: - func(region) - #pqdm(regions, func, n_jobs=self.n_workers) + pqdm(regions, func, n_jobs=self.n_workers) # TODO: Add documentation for settings From e5eba4e4b3ccb39831c0e4e8d65e85305e6d3dc3 Mon Sep 17 00:00:00 2001 From: "daniel.unyi.42" Date: Sun, 6 Oct 2024 10:24:57 +0000 Subject: [PATCH 061/156] Synchronize tile naming conventions --- src/segger/data/io.py | 2 +- src/segger/data/parquet/pyg_dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/segger/data/io.py b/src/segger/data/io.py index 3d45afd..1f00431 100644 --- a/src/segger/data/io.py +++ b/src/segger/data/io.py @@ -815,7 +815,7 @@ def _process_tile(self, tile_params: Tuple) -> None: # Save the tile data to the appropriate directory based on split if self.verbose: print(f"Saving data for tile at (x_min: {x_loc}, y_min: {y_loc})...") - filename = f"tiles_x{x_loc}_y{y_loc}_{x_size}_{y_size}.pt" + filename = f"tiles_x={x_loc}_y={y_loc}_w={x_size}_h={y_size}.pt" if prob > val_prob + test_prob: torch.save(data, processed_dir / 'train_tiles' / 'processed' / filename) elif prob > test_prob: diff --git a/src/segger/data/parquet/pyg_dataset.py b/src/segger/data/parquet/pyg_dataset.py index a464e72..3d3e117 100644 --- a/src/segger/data/parquet/pyg_dataset.py +++ b/src/segger/data/parquet/pyg_dataset.py @@ -37,7 +37,7 @@ def processed_file_names(self) -> List[str]: Returns: List[str]: List of processed file names. 
""" - paths = glob.glob(f'{self.processed_dir}/x=*_y=*_w=*_h=*.pt') + paths = glob.glob(f'{self.processed_dir}/tiles_x=*_y=*_w=*_h=*.pt') file_names = list(map(os.path.basename, paths)) return file_names From 6abc9e951ffdf29ff345e62b37d17a39fcf7b7f4 Mon Sep 17 00:00:00 2001 From: "daniel.unyi.42" Date: Mon, 7 Oct 2024 11:45:18 +0000 Subject: [PATCH 062/156] Fix pytorch lightning import in model training script --- scripts/train_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/train_model.py b/scripts/train_model.py index 4e1e361..7c25bc3 100644 --- a/scripts/train_model.py +++ b/scripts/train_model.py @@ -3,7 +3,7 @@ import argparse from pathlib import Path import torch -import lightning as L +from pytorch_lightning import Trainer from torch_geometric.loader import DataLoader from segger.data.utils import SpatialTranscriptomicsDataset # Updated dataset class from segger.models.segger_model import Segger @@ -57,7 +57,7 @@ def main(args): litsegger = LitSegger(model=model) # Initialize the PyTorch Lightning trainer - trainer = L.Trainer( + trainer = Trainer( accelerator=args.accelerator, strategy=args.strategy, precision=args.precision, From 228258bdb1a2989c9f9dd9e53ae3664d168806e7 Mon Sep 17 00:00:00 2001 From: "daniel.unyi.42" Date: Mon, 7 Oct 2024 16:18:08 +0000 Subject: [PATCH 063/156] Synchronize tile naming in STSampleParquet pipeline --- src/segger/data/parquet/sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index 970f17f..9407ced 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -785,7 +785,7 @@ def uid(self) -> str: 'x=100_y=200_w=50_h=50' """ x_min, y_min, x_max, y_max = map(int, self.extents.bounds) - uid = f'x={x_min}_y={y_min}_w={x_max-x_min}_h={y_max-y_min}' + uid = f'tiles_x={x_min}_y={y_min}_w={x_max-x_min}_h={y_max-y_min}' return uid From e4508573ddbc54284213ed91f35c63f0e1d77c9a Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 30 Sep 2024 01:13:57 +0200 Subject: [PATCH 064/156] towards faster prediction, still ongoing --- scripts/create_data_sample.py | 63 ++++++ scripts/predict_model_cuda.py | 170 ++++++++++++++ scripts/predict_model_sample.py | 80 +++++-- src/segger/data/io.py | 15 +- src/segger/data/utils.py | 10 +- src/segger/prediction/predict.py | 371 +++++++++++++++++-------------- 6 files changed, 513 insertions(+), 196 deletions(-) create mode 100644 scripts/create_data_sample.py create mode 100644 scripts/predict_model_cuda.py diff --git a/scripts/create_data_sample.py b/scripts/create_data_sample.py new file mode 100644 index 0000000..c4bc1ff --- /dev/null +++ b/scripts/create_data_sample.py @@ -0,0 +1,63 @@ +from segger.data.io import XeniumSample +from segger.training.train import LitSegger +from segger.training.segger_data_module import SeggerDataModule +from segger.prediction.predict import predict, load_model +from lightning.pytorch.loggers import CSVLogger +from pytorch_lightning import Trainer +from pathlib import Path +from lightning.pytorch.plugins.environments import LightningEnvironment +from matplotlib import pyplot as plt +import seaborn as sns +# import pandas as pd +from segger.data.utils import calculate_gene_celltype_abundance_embedding +import scanpy as sc +import os + +os.environ['DASK_DAEMON'] = 'False' + +xenium_data_dir = Path('./data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1') +segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_0926') +models_dir = 
Path('./models/bc_embedding_0926') + +scRNAseq_path = '/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad' + +scRNAseq = sc.read(scRNAseq_path) + +sc.pp.subsample(scRNAseq, 0.1) + +# Step 1: Calculate the gene cell type abundance embedding +celltype_column = 'celltype_minor' +gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(scRNAseq, celltype_column) + + + + +# Setup Xenium sample to create dataset +xs = XeniumSample(verbose=False, embedding_df=gene_celltype_abundance_embedding) # , embedding_df=gene_celltype_abundance_embedding) +xs.set_file_paths( + transcripts_path=xenium_data_dir / 'transcripts.parquet', + boundaries_path=xenium_data_dir / 'nucleus_boundaries.parquet', +) +xs.set_metadata() +# xs.x_max = 1000 +# xs.y_max = 1000 + +try: + xs.save_dataset_for_segger( + processed_dir=segger_data_dir, + x_size=120, + y_size=120, + d_x=100, + d_y=100, + margin_x=10, + margin_y=10, + compute_labels=True, # Set to True if you need to compute labels + r_tx=5, + k_tx=5, + val_prob=0.4, + test_prob=0.1, + num_workers=6 + ) +except AssertionError as err: + print(f'Dataset already exists at {segger_data_dir}') + diff --git a/scripts/predict_model_cuda.py b/scripts/predict_model_cuda.py new file mode 100644 index 0000000..302da40 --- /dev/null +++ b/scripts/predict_model_cuda.py @@ -0,0 +1,170 @@ +import torch +import cupy as cp +import dask.array as da +from dask.distributed import Client, wait +from dask_cuda import LocalCUDACluster +from torch.utils.dlpack import to_dlpack, from_dlpack # DLPack conversion +from segger.training.segger_data_module import SeggerDataModule +from segger.prediction.predict import load_model +from cupyx.scipy.sparse import coo_matrix +import torch.distributed as dist +from pathlib import Path +from cuvs.neighbors import cagra +import os +import cuvs +import rmm + +# Initialize RMM +# from rmm.allocators.cupy import rmm_cupy_allocator + +# Initialize RMM with a pool allocator +rmm.reinitialize( + pool_allocator=True, # Enable memory pool + initial_pool_size=2**30 # Set 1GB initial pool size, adjust as needed +) + +# Set RMM as the allocator for CuPy +# cp.cuda.set_allocator(rmm_cupy_allocator) + + + +# Function to compute edge indices using spatial locations +def get_edge_index_cuda(coords_1: torch.Tensor, coords_2: torch.Tensor, k: int = 10, dist: float = 10.0) -> torch.Tensor: + def cupy_to_torch(cupy_array): + return torch.from_dlpack((cupy_array.toDlpack())) + def torch_to_cupy(tensor): + if not tensor.is_contiguous(): + tensor = tensor.contiguous() # Ensure tensor is contiguous + return cp.fromDlpack(to_dlpack(tensor)) # Convert PyTorch tensor to CuPy + print("Converting tensors to CuPy...") # Debug log + cp_coords_1 = torch_to_cupy(coords_1) + cp_coords_2 = torch_to_cupy(coords_2) + cp_dist = cp.float32(dist) + print("Building index...") # Debug log + index_params = cagra.IndexParams() + search_params = cagra.SearchParams() + try: + # Build index and search for nearest neighbors + index = cagra.build_index(index_params, cp_coords_1) + D, I = cagra.search(search_params, index, cp_coords_2, k) + except cuvs.common.exceptions.CuvsException as e: + print(f"cuVS Exception: {e}") + raise + print("Processing search results...") # Debug log + valid_mask = cp.asarray(D < cp_dist ** 2) + repeats = valid_mask.sum(axis=1).tolist() + row_indices = cp.repeat(cp.arange(len(cp_coords_2)), repeats) + valid_indices = cp.asarray(I)[cp.where(valid_mask)] + edges = cp.vstack((row_indices, valid_indices)).T + edge_index = 
cupy_to_torch(edges).long().contiguous() + return edge_index + +# Set up a Dask cluster with local GPUs +cluster = LocalCUDACluster(rmm_pool_size="5GB", scheduler_port=8786, dashboard_address=":8787", worker_port=(9000, 9100)) +client = Client(cluster, timeout='500s') + + + +def initialize_distributed(rank, world_size): + os.environ['RANK'] = str(rank) + os.environ['WORLD_SIZE'] = str(world_size) + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '29500' # Any free port can be used here + dist.init_process_group(backend='nccl', rank=rank, world_size=world_size) + + + +# Initialize the PyTorch distributed environment (NCCL backend) +# dist.init_process_group(backend='nccl') + +# Load the model (only done once on the main process) +model_version = 2 +models_dir = Path('./models/bc_embedding_0919') +model_path = models_dir / 'lightning_logs' / f'version_{model_version}' +model = load_model(model_path / 'checkpoints') + +# Scatter the model to all workers (GPUs) +# scattered_model = client.scatter(model) + +# Define the sparse_multiply function +def sparse_multiply(mat1, mat2_T, edge_index, block_info=None): + mat1 = cp.asarray(mat1) + mat2_T = cp.asarray(mat2_T) + # If block_info is provided, we adjust the edge indices for the local chunk + if block_info is not None: + row_block_start, row_block_end = block_info[0]['array-location'][0] + col_block_start, col_block_end = block_info[1]['array-location'][1] + rows, cols = edge_index + row_mask = (rows >= row_block_start) & (rows < row_block_end) + col_mask = (cols >= col_block_start) & (cols < col_block_end) + mask = row_mask & col_mask + # Adjust to local chunk indices for rows and columns + rows = rows[mask] - row_block_start + cols = cols[mask] - col_block_start + else: + # If block_info is None, assume we use the entire matrix + rows, cols = edge_index + # Perform dense multiplication for the current chunk or the full matrix + dense_result = cp.dot(mat1, mat2_T) + # Create the sparse result using the provided edge index + sparse_result = coo_matrix((dense_result[rows, cols], (rows, cols)), shape=dense_result.shape) + # Free GPU memory after each chunk computation + cp.cuda.Stream.null.synchronize() + cp.get_default_memory_pool().free_all_blocks() + return sparse_result + + +def inference_with_sparse_multiply(batch, model, rank, world_size, k=10, dist_r=10.0): + # Initialize the distributed process group inside the worker + if not dist.is_initialized(): + initialize_distributed(rank, world_size) + batch.to(f'cuda:{rank}') + # Load model inside the function to avoid pickling it + model = model.to(f'cuda:{rank}') # Make sure the model is on the correct GPU + # Perform inference using the loaded model + with torch.no_grad(): + output = model.model(batch.x_dict, batch.edge_index_dict) + # Convert output to CuPy arrays using dlpack for further operations + mat1 = cp.fromDlpack(to_dlpack(output['tx'])) + mat2 = cp.fromDlpack(to_dlpack(output['tx'])) + # Transpose mat2 for matrix multiplication + mat2_T = cp.transpose(mat2) + # Compute edge_index based on the 2D positions of tx nodes + coords_1 = batch['tx'].pos[:, :2] # Extract 2D positions + coords_2 = batch['tx'].pos[:, :2] # Assuming the same set of coordinates for the example + edge_index = get_edge_index_cuda(coords_1, coords_2, k=k, dist=dist_r) + # Perform sparse multiplication using the function + result = sparse_multiply(mat1, mat2_T, edge_index) + return result + +# Initialize DataLoader +segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_0919') +dm = 
SeggerDataModule( + data_dir=segger_data_dir, + batch_size=1, + num_workers=1, +) +dm.setup() + + +world_size = 1 # Adjust based on number of GPUs + +futures = [] +for i, batch in enumerate(dm.train_dataloader()): + # Scatter the batch to each GPU worker + scattered_batch = client.scatter(batch) + for rank in range(world_size): + futures.append(client.submit(inference_with_sparse_multiply, scattered_batch, model, rank, world_size, k=10, dist_r=3, retries=3)) + # Gather results from all GPUs + print(f"Batch {i} processed with dynamic edge index and sparse multiplication.") + +with ProgressBar(): + results = client.gather(futures) + + +# Call the function and get results in memory +all_results = process_all_batches() +print("All batches processed.") + +# Clean up NCCL +dist.destroy_process_group() diff --git a/scripts/predict_model_sample.py b/scripts/predict_model_sample.py index d5b0a6d..82b6787 100644 --- a/scripts/predict_model_sample.py +++ b/scripts/predict_model_sample.py @@ -1,5 +1,5 @@ from segger.training.segger_data_module import SeggerDataModule -from segger.prediction.predict import segment, load_model +from segger.prediction.predict import segment, get_similarity_scores, load_model, predict_batch from pathlib import Path from matplotlib import pyplot as plt import seaborn as sns @@ -8,7 +8,7 @@ import dask.dataframe as dd import pandas as pd from pathlib import Path - +os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_0919') models_dir = Path('./models/bc_embedding_0919') @@ -29,20 +29,64 @@ # Load in latest checkpoint model_path = models_dir / 'lightning_logs' / f'version_{model_version}' model = load_model(model_path / 'checkpoints') -dm.setup() -receptive_field = {'k_bd': 4, 'dist_bd': 15,'k_tx': 5, 'dist_tx': 3} - -segment( - model, - dm, - save_dir=benchmarks_dir, - seg_tag='test_segger_segment', - transcript_file=transcripts_file, - file_format='anndata', - receptive_field = receptive_field, - min_transcripts=10, - max_transcripts=1000, - cell_id_col='segger_cell_id', - knn_method='kd_tree' -) \ No newline at end of file +# batch = next(iter(dm.train_dataloader())).to('cuda') +# print(batch) +# outs = model.model(batch.x_dict, batch.edge_index_dict) + + +# def get_similarity_scores( +# model: torch.nn.Module, +# batch: Batch, +# from_type: str, +# to_type: str, +# receptive_field: dict +# ) + + +# def predict_batch( +# lit_segger: torch.nn.Module, +# batch: Batch, +# score_cut: float, +# receptive_field: Dict[str, float], +# use_cc: bool = True, +# knn_method: str = 'cuda' +# ) -> pd.DataFrame: + +for batch in dm.train_dataloader(): + batch = batch.to('cuda') + # outs = get_similarity_scores( + # model= model.model + # batch=batch, + # from_type='tx', + # to_type='bd', + # receptive_field={'k_bd': 4, 'dist_bd': 10,'k_tx': 5, 'dist_tx': 3} + # ) + outs = predict_batch( + lit_segger=model, + batch=batch, + score_cut=.5, + receptive_field={'k_bd': 4, 'dist_bd': 10,'k_tx': 5, 'dist_tx': 3}, + use_cc = False, + knn_method= 'cuda' + ) + print(outs) + +# dm.setup() + +# receptive_field = {'k_bd': 4, 'dist_bd': 10,'k_tx': 5, 'dist_tx': 3} + +# segment( +# model, +# dm, +# save_dir=benchmarks_dir, +# seg_tag='test_segger_segment', +# transcript_file=transcripts_file, +# file_format='anndata', +# receptive_field = receptive_field, +# min_transcripts=10, +# max_transcripts=1000, +# cell_id_col='segger_cell_id', +# use_cc=False, +# knn_method='kd_tree' +# ) \ No newline at end of file diff --git 
a/src/segger/data/io.py b/src/segger/data/io.py index 1f00431..fe855ed 100644 --- a/src/segger/data/io.py +++ b/src/segger/data/io.py @@ -43,6 +43,7 @@ def __init__( transcripts_df: pd.DataFrame = None, transcripts_radius: int = 10, boundaries_graph: bool = False, + embedding_df: pd.DataFrame = None, keys: Dict = None, verbose: bool = True, ): @@ -261,8 +262,7 @@ def load_boundaries( # Convert the cell IDs to strings lazily boundaries_df[self.keys.CELL_ID.value] = boundaries_df[self.keys.CELL_ID.value].apply( - lambda x: str(x) if pd.notnull(x) else None, - meta=(self.keys.CELL_ID.value, 'str') + lambda x: str(x) if pd.notnull(x) else None ) if self.verbose: print(f"Loaded boundaries from '{path}' within bounding box ({x_min}, {x_max}, {y_min}, {y_max}).") @@ -956,8 +956,8 @@ def build_pyg_data_from_tile( class XeniumSample(SpatialTranscriptomicsSample): - def __init__(self, transcripts_df: dd.DataFrame = None, transcripts_radius: int = 10, boundaries_graph: bool = False, verbose: bool = True): - super().__init__(transcripts_df, transcripts_radius, boundaries_graph, XeniumKeys, verbose=verbose) + def __init__(self, transcripts_df: dd.DataFrame = None, transcripts_radius: int = 10, boundaries_graph: bool = False, embedding_df: pd.DataFrame = None, verbose: bool = True): + super().__init__(transcripts_df, transcripts_radius, boundaries_graph, embedding_df, XeniumKeys, verbose=verbose) def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0) -> dd.DataFrame: """ @@ -983,8 +983,7 @@ def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0) # Handle potential bytes to string conversion for Dask DataFrame if pd.api.types.is_object_dtype(transcripts_df[self.keys.FEATURE_NAME.value]): transcripts_df[self.keys.FEATURE_NAME.value] = transcripts_df[self.keys.FEATURE_NAME.value].apply( - lambda x: x.decode('utf-8') if isinstance(x, bytes) else x, - meta=('feature_name', 'str'), + lambda x: x.decode('utf-8') if isinstance(x, bytes) else x ) # Apply the quality value filter using Dask @@ -1001,8 +1000,8 @@ def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0) class MerscopeSample(SpatialTranscriptomicsSample): - def __init__(self, transcripts_df: dd.DataFrame = None, transcripts_radius: int = 10, boundaries_graph: bool = False): - super().__init__(transcripts_df, transcripts_radius, boundaries_graph, MerscopeKeys) + def __init__(self, transcripts_df: dd.DataFrame = None, transcripts_radius: int = 10, boundaries_graph: bool = False, embedding_df: pd.DataFrame = None, verbose: bool = True): + super().__init__(transcripts_df, transcripts_radius, boundaries_graph, embedding_df, MerscopeKeys) def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0) -> dd.DataFrame: """ diff --git a/src/segger/data/utils.py b/src/segger/data/utils.py index ad5cdf4..8c42636 100644 --- a/src/segger/data/utils.py +++ b/src/segger/data/utils.py @@ -282,8 +282,8 @@ def get_edge_index(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: elif method == 'faiss': return get_edge_index_faiss(coords_1, coords_2, k=k, dist=dist, gpu=gpu) elif method == 'cuda': - pass - # return get_edge_index_cuda(coords_1, coords_2, k=k, dist=dist) + # pass + return get_edge_index_cuda(coords_1, coords_2, k=k, dist=dist) else: msg = ( f"Unknown method {method}. 
Valid methods include: 'kd_tree', " @@ -387,12 +387,12 @@ def cupy_to_torch(cupy_array): def torch_to_cupy(tensor): return cp.fromDlpack(dlpack.to_dlpack(tensor)) # Convert PyTorch tensors (CUDA) to CuPy arrays using DLPack - cp_coords_1 = torch_to_cupy(coords_1) - cp_coords_2 = torch_to_cupy(coords_2) + cp_coords_1 = cp.float32(torch_to_cupy(coords_1)) + cp_coords_2 = cp.float32(torch_to_cupy(coords_2)) # Define the distance threshold in CuPy cp_dist = cp.float32(dist) # IndexParams and SearchParams for cagra - index_params = cagra.IndexParams() + index_params = cagra.IndexParams(nn_descent_niter=100) search_params = cagra.SearchParams() # Build index using CuPy coords index = cagra.build_index(index_params, cp_coords_1) diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index 2fbd9ad..d8bb001 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -1,37 +1,35 @@ import os import torch +import cupy as cp import pandas as pd import numpy as np import torch.nn.functional as F import torch._dynamo +import gc +import rmm +import re +import glob +from pathlib import Path from torch_geometric.loader import DataLoader from torch_geometric.data import Batch -from scipy.sparse.csgraph import connected_components as cc from segger.data.utils import ( + get_edge_index_cuda, get_edge_index, - coo_to_dense_adj, - create_anndata, - format_time + format_time, + create_anndata ) -from segger.data.io import XeniumSample -from segger.models.segger_model import Segger from segger.training.train import LitSegger from segger.training.segger_data_module import SeggerDataModule -import random -import string -from pathlib import Path -import glob -from typing import Union -import re -from tqdm import tqdm +from scipy.sparse.csgraph import connected_components as cc +from typing import Union, Dict import dask.dataframe as dd -import dask from dask import delayed -from dask.array import from_array from dask.diagnostics import ProgressBar -from pqdm.threads import pqdm -import anndata as ad import time +import dask +from rmm.allocators.cupy import rmm_cupy_allocator +from cupyx.scipy.sparse import coo_matrix +from torch.utils.dlpack import to_dlpack, from_dlpack # CONFIG @@ -39,15 +37,16 @@ os.environ["PYTORCH_USE_CUDA_DSA"] = "1" os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + def load_model(checkpoint_path: str) -> LitSegger: """ Load a LitSegger model from a checkpoint. Parameters ---------- - checkpoint_path : os.Pathlike - Specific checkpoint file to load, or directory where the model - checkpoints are stored. If directory, the latest checkpoint is loaded. + checkpoint_path : str + Specific checkpoint file to load, or directory where the model checkpoints are stored. + If directory, the latest checkpoint is loaded. Returns ------- @@ -59,16 +58,15 @@ def load_model(checkpoint_path: str) -> LitSegger: FileNotFoundError If the specified checkpoint file does not exist. """ - # Get last checkpoint if directory provided checkpoint_path = Path(checkpoint_path) - msg = ( - f"No checkpoint found at {checkpoint_path}. Please make sure " - "you've provided the correct path." - ) + msg = f"No checkpoint found at {checkpoint_path}. Please make sure you've provided the correct path." 
+ + # Get last checkpoint if directory is provided if os.path.isdir(checkpoint_path): checkpoints = glob.glob(str(checkpoint_path / '*.ckpt')) if len(checkpoints) == 0: raise FileNotFoundError(msg) + # Sort checkpoints by epoch and step def sort_order(c): match = re.match(r'.*epoch=(\d+)-step=(\d+).ckpt', c) return int(match[1]), int(match[2]) @@ -76,157 +74,176 @@ def sort_order(c): elif not checkpoint_path.exists(): raise FileExistsError(msg) - # Load model + # Load model from checkpoint lit_segger = LitSegger.load_from_checkpoint( checkpoint_path=checkpoint_path, - #map_location=torch.device("cuda"), ) return lit_segger def get_similarity_scores( - model: Segger, + model: torch.nn.Module, batch: Batch, from_type: str, to_type: str, -): + receptive_field: dict +) -> coo_matrix: """ - Compute similarity scores between 'from_type' and 'to_type' embeddings - within a batch. + Compute similarity scores between embeddings for 'from_type' and 'to_type' nodes + using sparse matrix multiplication with CuPy and the 'sees' edge relation. - Parameters - ---------- - model : Segger - The segmentation model used to generate embeddings. - batch : Batch - A batch of data containing input features and edge indices. - from_type : str - The type of node from which the similarity is computed. - to_type : str - The type of node to which the similarity is computed. + Args: + model (torch.nn.Module): The segmentation model used to generate embeddings. + batch (Batch): A batch of data containing input features and edge indices. + from_type (str): The type of node from which the similarity is computed. + to_type (str): The type of node to which the similarity is computed. - Returns - ------- - torch.Tensor - A dense tensor containing the similarity scores between 'from_type' - and 'to_type' nodes. + Returns: + coo_matrix: A sparse matrix containing the similarity scores between + 'from_type' and 'to_type' nodes. 
""" - # Get embedding spaces from model + # Step 1: Get embeddings from the model batch = batch.to("cuda") - y_hat = model(batch.x_dict, batch.edge_index_dict) - - # Similarity of each 'from_type' to 'to_type' neighbors in embedding - nbr_idx = batch[from_type][f'{to_type}_field'] - m = torch.nn.ZeroPad2d((0, 0, 0, 1)) # pad bottom with zeros - similarity = torch.bmm( - m(y_hat[to_type])[nbr_idx], # 'to' x 'from' neighbors x embed - y_hat[from_type].unsqueeze(-1) # 'to' x embed x 1 - ) # -> 'to' x 'from' neighbors x 1 - - # Sigmoid to get most similar 'to_type' neighbor - similarity[similarity == 0] = -torch.inf # ensure zero stays zero - similarity = F.sigmoid(similarity) - - # Neighbor-filtered similarity scores - shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] - indices = torch.argwhere(nbr_idx != -1).T - indices[1] = nbr_idx[nbr_idx != -1] - values = similarity[nbr_idx != -1].flatten() - sparse_sim = torch.sparse_coo_tensor(indices, values, shape) - - # Return in dense format for backwards compatibility - scores = sparse_sim.to_dense().detach().cpu() + embeddings = model(batch.x_dict, batch.edge_index_dict) + + # Convert PyTorch embeddings to CuPy using DLPack + mat1 = cp.fromDlpack(to_dlpack(embeddings[from_type])) + mat2_T = cp.fromDlpack(to_dlpack(embeddings[to_type])) + + # Step 2: Get edge indices using the 'sees' relationship from the batch + edge_index = get_edge_index( + batch[to_type].pos[:, :2], # 'tx' positions + batch[from_type].pos[:, :2], # 'bd' positions + k=receptive_field['k_bd'], + dist=receptive_field['dist_bd'], + method='cuda' + ) + + print(edge_index) + + # Step 3: Perform sparse matrix multiplication using CuPy + def sparse_multiply(mat1, mat2_T, edge_index) -> coo_matrix: + rows, cols = edge_index + print(rows) + print(cols) + dense_result = cp.dot(mat1, mat2_T.T) + + # Create sparse result matrix + sparse_result = coo_matrix((dense_result[rows, cols], (rows, cols)), shape=dense_result.shape) + + # Free GPU memory after computation + cp.cuda.Stream.null.synchronize() + cp.get_default_memory_pool().free_all_blocks() + + return sparse_result + + # Call the sparse multiply function + sparse_similarity = sparse_multiply(mat1, mat2_T, edge_index.T) + + + scores = torch.from_dlpack(sparse_similarity.toarray().toDlpack()).to("cuda") + + torch.cuda.empty_cache() + cp.get_default_memory_pool().free_all_blocks() return scores + + def predict_batch( - lit_segger: LitSegger, - batch: object, + lit_segger: torch.nn.Module, + batch: Batch, score_cut: float, - receptive_field: dict, + receptive_field: Dict[str, float], use_cc: bool = True, - knn_method: str = 'kd_tree' + knn_method: str = 'cuda' ) -> pd.DataFrame: """ Predict cell assignments for a batch of transcript data using a segmentation model. + Adds a 'bound' column to indicate if the transcript is assigned to a cell (bound=1) + or unassigned (bound=0). + + Args: + lit_segger (torch.nn.Module): The lightning module wrapping the segmentation model. + batch (Batch): A batch of transcript and cell data. + score_cut (float): The threshold for assigning transcripts to cells based on similarity scores. + receptive_field (Dict[str, float]): Dictionary defining the receptive field for transcript-cell + and transcript-transcript relations. + use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. + Defaults to True. + knn_method (str, optional): The method to use for nearest neighbors. Defaults to 'cuda'. 
- Parameters - ---------- - lit_segger : LitSegger - The lightning module wrapping the segmentation model. - batch : object - A batch of transcript and cell data. - score_cut : float - The threshold for assigning transcripts to cells based on similarity scores. - receptive_field : dict - Dictionary defining the receptive field for transcript-cell and transcript-transcript relations. - use_cc : bool, optional - If True, perform connected components analysis for unassigned transcripts. - knn_method : str, optional - The method to use for nearest neighbors ('cuda' by default). - - Returns - ------- - pd.DataFrame - A DataFrame containing the transcript IDs, similarity scores, and assigned cell IDs. + Returns: + pd.DataFrame: A DataFrame containing the transcript IDs, similarity scores, + assigned cell IDs, and 'bound' column. """ - # Get random Xenium-style ID def _get_id(): - id_chars = random.choices(string.ascii_lowercase, k=8) - return ''.join(id_chars) + '-nx' + """Generate a random Xenium-style ID.""" + return ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), 8)) + '-nx' with torch.no_grad(): + # Move batch to GPU batch = batch.to("cuda") + print(batch) - # Assignments of cells to nuclei + # Initialize assignments DataFrame transcript_id = batch['tx'].id.cpu().numpy() assignments = pd.DataFrame({'transcript_id': transcript_id}) if len(batch['bd'].id[0]) > 0: - # Step 2.1: Calculate edge index lazily - edge_index = get_edge_index( - batch['bd'].pos[:, :2].cpu(), - batch['tx'].pos[:, :2].cpu(), - k=receptive_field['k_bd'], - dist=receptive_field['dist_bd'], - method=knn_method, - ).T - - # Step 2.2: Compute dense adjacency matrix - batch['tx']['bd_field'] = coo_to_dense_adj( - edge_index, - num_nodes=batch['tx'].id.shape[0], - num_nbrs=receptive_field['k_bd'], - ) - scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd") - # 1. 
Get direct assignments from similarity matrix - belongs = scores.max(1) + # Compute edge index using get_edge_index_cuda for 'tx' sees 'bd' + # batch['tx', 'sees', 'bd'].edge_index = get_edge_index( + # batch['tx'].pos[:, :2], # 'tx' positions + # batch['bd'].pos[:, :2], # 'bd' positions + # k=receptive_field['k_bd'], + # dist=receptive_field['dist_bd'], + # method='cuda' + # ) + + # Compute similarity scores between 'tx' and 'bd' + scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd", receptive_field) + + # Get direct assignments from similarity matrix + belongs = scores.max(axis=1) assignments['score'] = belongs.values.cpu() mask = assignments['score'] > score_cut - all_ids = np.concatenate(batch['bd'].id)[belongs.indices.cpu()] + + all_ids = np.concatenate(batch['bd'].id)[np.argmax(scores.cpu().numpy(), axis=1)] assignments.loc[mask, 'segger_cell_id'] = all_ids[mask] + # Add 'bound' column (1 for assigned, 0 for unassigned) + assignments['bound'] = 0 + assignments.loc[mask, 'bound'] = 1 + if use_cc: - # Transcript-transcript similarity scores, filtered by neighbors - edge_index = batch['tx', 'neighbors', 'tx'].edge_index - batch['tx']['tx_field'] = coo_to_dense_adj( - edge_index, - num_nodes=batch['tx'].id.shape[0], + # Compute edge index for 'tx' sees 'tx' using get_edge_index + batch['tx', 'sees', 'tx'].edge_index = get_edge_index( + batch['tx'].pos[:, :2], # 'tx' positions + batch['tx'].pos[:, :2], # 'tx' positions (self-reference) + k=receptive_field['k_tx'], + dist=receptive_field['dist_tx'], + method='cuda' ) - scores = get_similarity_scores(lit_segger.model, batch, "tx", "tx") - scores = scores.fill_diagonal_(0) # ignore self-similarity - # 2. Assign remainder using connected components + # Compute similarity scores between 'tx' and 'tx' + scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx") + scores_tx.setdiag(0) # Ignore self-similarity + + # Assign unassigned transcripts using connected components no_id = assignments['segger_cell_id'].isna().values - no_id_scores = scores[no_id][:, no_id] + no_id_scores = scores_tx[no_id][:, no_id] n, comps = cc(no_id_scores, connection="weak", directed=False) new_ids = np.array([_get_id() for _ in range(n)]) assignments.loc[no_id, 'segger_cell_id'] = new_ids[comps] - - return assignments # Ensure this is a pandas DataFrame + # Perform memory cleanup to avoid OOM issues + # rmm.reinitialize(pool_allocator=True) + torch.cuda.empty_cache() + gc.collect() + + return assignments def predict( lit_segger: LitSegger, @@ -239,52 +256,61 @@ def predict( """ Optimized prediction for multiple batches of transcript data using Dask and delayed processing with progress bar. - Parameters - ---------- - lit_segger : LitSegger - The lightning module wrapping the segmentation model. - data_loader : DataLoader - A data loader providing batches of transcript and cell data. - score_cut : float - The threshold for assigning transcripts to cells based on similarity scores. - receptive_field : dict - Dictionary defining the receptive field for transcript-cell and transcript-transcript relations. - use_cc : bool, optional - If True, perform connected components analysis for unassigned transcripts. - knn_method : str, optional - The method to use for nearest neighbors ('cuda' by default). - - Returns - ------- - dd.DataFrame - A Dask DataFrame containing the transcript IDs, similarity scores, and assigned cell IDs. + Args: + lit_segger (LitSegger): The lightning module wrapping the segmentation model. 
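# [Illustrative sketch — not part of the patch] The use_cc fallback above in isolation:
# transcripts with no assigned cell are linked through a sparse tx-tx similarity graph,
# weakly connected components are labelled, and each component receives one fresh
# Xenium-style '-nx' identifier. The 5x5 similarity matrix below is a toy example.
import numpy as np
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components

sim = coo_matrix(([0.9, 0.8], ([0, 2], [1, 3])), shape=(5, 5))  # links {0,1} and {2,3}
n_comps, labels = connected_components(sim, directed=False, connection='weak')

rng = np.random.default_rng(0)
def new_id():
    return ''.join(rng.choice(list('abcdefghijklmnopqrstuvwxyz'), 8)) + '-nx'

component_ids = np.array([new_id() for _ in range(n_comps)])
print(labels)                 # [0 0 1 1 2]
print(component_ids[labels])  # one shared ID per connected group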
+ data_loader (DataLoader): A data loader providing batches of transcript and cell data. + score_cut (float): The threshold for assigning transcripts to cells based on similarity scores. + receptive_field (dict): Dictionary defining the receptive field for transcript-cell and transcript-transcript relations. + use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. Defaults to True. + knn_method (str, optional): The method to use for nearest neighbors ('cuda' by default). Defaults to 'cuda'. + + Returns: + dd.DataFrame: A Dask DataFrame containing the transcript IDs, similarity scores, and assigned cell IDs. """ if len(data_loader) == 0: return None - # Create meta for the DataFrame + # Create a meta DataFrame for the Dask DataFrame meta = pd.DataFrame({ 'transcript_id': pd.Series(dtype='int64'), - 'score': pd.Series(dtype='float64'), - 'segger_cell_id': pd.Series(dtype='object') + 'score': pd.Series(dtype='float32'), + 'segger_cell_id': pd.Series(dtype='object'), + 'bound': pd.Series(dtype='int64') }) # Convert the entire data loader to delayed predictions - delayed_assignments = [delayed(predict_batch)(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) - for batch in data_loader] + delayed_assignments = [ + delayed(predict_batch)(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) + for batch in data_loader + ] - # Pass the meta to from_delayed + # Build the Dask DataFrame from the delayed assignments assignments_dd = dd.from_delayed(delayed_assignments, meta=meta) - # Modify the logic to compute idxmax within each partition using map_partitions + # Max score selection logic, with fallback to unbound scores if no bound=1 def select_max_score_partition(df): - idx = df.groupby('transcript_id')['score'].idxmax() # Compute idxmax within each partition - return df.loc[idx].reset_index(drop=True) + max_bound_idx = df[df['bound'] == 1].groupby('transcript_id')['score'].idxmax() + max_unbound_idx = df[df['bound'] == 0].groupby('transcript_id')['score'].idxmax() + + # Combine indices, prioritizing bound=1 scores + final_idx = max_bound_idx.combine_first(max_unbound_idx) + result = df.loc[final_idx].reset_index(drop=True) + # Handle cases where there's only one entry per 'segger_cell_id' + single_entry_mask = result.groupby('segger_cell_id').size() == 1 + result.loc[single_entry_mask, 'segger_cell_id'] = 'floating' + + return result + + # Map the logic over each partition using Dask final_assignments = assignments_dd.map_partitions(select_max_score_partition, meta=meta) - return final_assignments + # Trigger garbage collection and free GPU memory + # rmm.reinitialize(pool_allocator=True) + torch.cuda.empty_cache() + gc.collect() + return final_assignments def segment( @@ -323,15 +349,16 @@ def segment( use_cc : bool, optional If to further re-group transcripts that have not been assigned to any nucleus. knn_method : str, optional - The method to use for nearest neighbors ('kd_tree' by default). + The method to use for nearest neighbors ('cuda' by default). **anndata_kwargs : dict, optional Additional keyword arguments passed to the create_anndata function. 
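# [Illustrative sketch — not part of the patch] The delayed-batches pattern used by
# predict() above: each batch prediction becomes a dask.delayed task returning a pandas
# DataFrame matching `meta`, and dd.from_delayed stitches them into one lazy frame that
# is only materialised at the end. fake_predict is a stand-in for predict_batch.
import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd

meta = pd.DataFrame({
    'transcript_id': pd.Series(dtype='int64'),
    'score': pd.Series(dtype='float32'),
    'segger_cell_id': pd.Series(dtype='object'),
    'bound': pd.Series(dtype='int64'),
})

def fake_predict(batch_no):  # stand-in for predict_batch(lit_segger, batch, ...)
    return pd.DataFrame({
        'transcript_id': [10 * batch_no, 10 * batch_no + 1],
        'score': np.array([0.9, 0.4], dtype='float32'),
        'segger_cell_id': ['cell_a', None],
        'bound': [1, 0],
    })

parts = [dask.delayed(fake_predict)(i) for i in range(3)]
assignments = dd.from_delayed(parts, meta=meta)
print(assignments.compute())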
     Returns:
-    -------
-        None
+        None
     """
     start_time = time.time()
+    # rmm.reinitialize(pool_allocator=True, initial_pool_size=2**26, maximum_pool_size=2**30)
+    # cp.cuda.set_allocator(rmm_cupy_allocator)

     # Ensure the save directory exists
     save_dir = Path(save_dir)
@@ -347,28 +374,36 @@ def segment(
     test_dataloader = dm.test_dataloader()
     val_dataloader = dm.val_dataloader()

-    delayed_train = predict(model, train_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method)
-    delayed_val = predict(model, val_dataloader , score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method)
+    # delayed_train = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method)
+    # delayed_val = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method)
     delayed_test = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method)

-    segmentation_train, segmentation_val, segmentation_test = dask.compute(delayed_train, delayed_val, delayed_test)
+    delayed_test = delayed_test.compute()
+    # Compute all predictions at once using Dask
+    # with ProgressBar():
+    #     segmentation_train, segmentation_val, segmentation_test = dask.compute(delayed_train, delayed_val, delayed_test)

     if verbose:
         elapsed_time = format_time(time.time() - step_start_time)
         print(f"Predictions completed in {elapsed_time}.")

-    # Step 2: Combine and group by transcript_id (Use Dask to handle large dataframes)
+    # Step 2: Combine and group by transcript_id
     step_start_time = time.time()

+    # Combine the segmentation data
     seg_combined = dd.concat([segmentation_train, segmentation_val, segmentation_test])
-    seg_final = seg_combined.loc[seg_combined.groupby('transcript_id')['score'].idxmax()].compute()
+
+    # No need to handle max score logic here, as it's done inside the `predict` function
+    seg_final = seg_combined.compute()
+
+    # Drop any unassigned rows
     seg_final = seg_final.dropna(subset=['segger_cell_id']).reset_index(drop=True)

     if verbose:
         elapsed_time = format_time(time.time() - step_start_time)
         print(f"Segmentation results processed in {elapsed_time}.")

-    # Step 3: Load transcripts and merge (using Dask)
+    # Step 3: Load transcripts and merge
     step_start_time = time.time()

     transcripts_df = dd.read_parquet(transcript_file)
@@ -376,6 +411,7 @@ def segment(
     if verbose:
         print("Merging segmentation results with transcripts...")

+    # Merge the segmentation results with the transcript data
     seg_final_dd = dd.from_pandas(seg_final, npartitions=transcripts_df.npartitions)
     transcripts_df_filtered = transcripts_df.merge(seg_final_dd, on='transcript_id', how='inner').compute()

@@ -410,3 +446,8 @@ def segment(
     if verbose:
         total_time = format_time(time.time() - start_time)
         print(f"Total segmentation process completed in {total_time}.")
+
+    # Step 5: Garbage collection and memory cleanup
+    # rmm.reinitialize(pool_allocator=True)
+    # torch.cuda.empty_cache()
+    gc.collect()

From 99c6b8f631f43818df14d7526db2accb73fd8d79 Mon Sep 17 00:00:00 2001
From: Elihei2
Date: Mon, 7 Oct 2024 20:00:29 +0200
Subject: [PATCH 065/156] fixes #17, better memory usage, needs parallelization for fast prediction still

---
 docs/notebooks/benchmark_bc.py    |  13 +-
 requirements.txt                  |  22 ++
 scripts/create_data_sample.py     |  33 +-
 scripts/create_dataset_cli.sh     |  82 ----
 scripts/create_tile_parquet.py    |  52 ---
 scripts/create_toy_datasets.py    |  85 -----
 scripts/predict.py                |  31 --
 scripts/predict_model_cuda.py     | 170 ---------
 scripts/predict_model_sample.py   |  88 ++---
 scripts/train_model.sh            |  22 --
 scripts/train_model_sample.py     |   8 +-
 setup.py                          |  90 +++++
 src/segger/data/io.py             |   7 +-
 src/segger/data/utils.py          |  89 +++--
 src/segger/models/segger_model.py |   2 +-
 src/segger/prediction/predict.py  | 609 ++++++++++++++++++++++--------
 src/segger/validation/utils.py    |  56 ++-
 17 files changed, 722 insertions(+), 737 deletions(-)
 create mode 100644 requirements.txt
 delete mode 100644 scripts/create_dataset_cli.sh
 delete mode 100644 scripts/create_tile_parquet.py
 delete mode 100644 scripts/create_toy_datasets.py
 delete mode 100644 scripts/predict.py
 delete mode 100644 scripts/predict_model_cuda.py
 delete mode 100644 scripts/train_model.sh
 create mode 100644 setup.py

diff --git a/docs/notebooks/benchmark_bc.py b/docs/notebooks/benchmark_bc.py
index e0da617..31ac0bd 100644
--- a/docs/notebooks/benchmark_bc.py
+++ b/docs/notebooks/benchmark_bc.py
@@ -58,12 +58,13 @@
 markers = find_markers(scRNAseq_adata, cell_type_column='celltype_major', pos_percentile=30, neg_percentile=5)

 # Annotate spatial segmentations with scRNAseq reference data
-for method in segmentations_dict.keys():
-    segmentations_dict[method] = annotate_query_with_reference(
-        reference_adata=scRNAseq_adata,
-        query_adata=segmentations_dict[method],
-        transfer_column='celltype_major'
-    )
+for method in segmentation_paths.keys():
+    # segmentations_dict[method] = annotate_query_with_reference(
+    #     reference_adata=scRNAseq_adata,
+    #     query_adata=segmentations_dict[method],
+    #     transfer_column='celltype_major'
+    # )
+    segmentations_dict[method].write(segmentation_paths[method])

 # Find mutually exclusive genes based on scRNAseq data
 exclusive_gene_pairs = find_mutually_exclusive_genes(
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..13b2436
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,22 @@
+numpy>=1.21.0
+pandas>=1.3.0
+scipy>=1.7.0
+matplotlib>=3.4.0
+seaborn>=0.11.0
+tqdm>=4.61.0
+torch>=2.0.0
+torchvision>=0.10.0
+pytorch-lightning>=1.3.0
+torchmetrics>=0.5.0
+# scanpy>=1.8.0
+squidpy==1.2.0
+adjustText>=0.8
+scikit-learn>=0.24.0
+geopandas>=0.9.0
+shapely>=1.7.0
+scanpy>=1.9.3
+torch-geometric>=2.2.0
+# pyg_lib>=0.0.0
+torch_scatter>=2.1.2
+torch_sparse>=0.6.18
+torch_cluster>=1.6.3
\ No newline at end of file
diff --git a/scripts/create_data_sample.py b/scripts/create_data_sample.py
index c4bc1ff..8cdb137 100644
--- a/scripts/create_data_sample.py
+++ b/scripts/create_data_sample.py
@@ -12,32 +12,35 @@
 from segger.data.utils import calculate_gene_celltype_abundance_embedding
 import scanpy as sc
 import os
+# import Dask.DataFrame as dd

 os.environ['DASK_DAEMON'] = 'False'

-xenium_data_dir = Path('./data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1')
-segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_0926')
-models_dir = Path('./models/bc_embedding_0926')
+xenium_data_dir = Path('/omics/odcf/analysis/OE0606_projects/oncolgy_data_exchange/20230831-pan-cns-TMA-Xenium/output-XETG00078__0010722__TMA_AKSI__20230831__151713/')
+segger_data_dir = Path('./data_tidy/pyg_datasets/pan_cns_AKSI')
+# models_dir = Path('./models/bc_embedding_1001')

-scRNAseq_path = '/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad'
+# scRNAseq_path = '/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad'

-scRNAseq = sc.read(scRNAseq_path)
+# scRNAseq = sc.read(scRNAseq_path)

-sc.pp.subsample(scRNAseq, 0.1)
+# 
sc.pp.subsample(scRNAseq, 0.1) -# Step 1: Calculate the gene cell type abundance embedding -celltype_column = 'celltype_minor' -gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(scRNAseq, celltype_column) +# # Step 1: Calculate the gene cell type abundance embedding +# celltype_column = 'celltype_minor' +# gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(scRNAseq, celltype_column) # Setup Xenium sample to create dataset -xs = XeniumSample(verbose=False, embedding_df=gene_celltype_abundance_embedding) # , embedding_df=gene_celltype_abundance_embedding) +xs = XeniumSample(verbose=False) # , embedding_df=gene_celltype_abundance_embedding) xs.set_file_paths( transcripts_path=xenium_data_dir / 'transcripts.parquet', boundaries_path=xenium_data_dir / 'nucleus_boundaries.parquet', ) + +# dd.read_parquet(transcripts_path[0]) xs.set_metadata() # xs.x_max = 1000 # xs.y_max = 1000 @@ -45,16 +48,16 @@ try: xs.save_dataset_for_segger( processed_dir=segger_data_dir, - x_size=120, - y_size=120, - d_x=100, - d_y=100, + x_size=220, + y_size=220, + d_x=200, + d_y=200, margin_x=10, margin_y=10, compute_labels=True, # Set to True if you need to compute labels r_tx=5, k_tx=5, - val_prob=0.4, + val_prob=0.3, test_prob=0.1, num_workers=6 ) diff --git a/scripts/create_dataset_cli.sh b/scripts/create_dataset_cli.sh deleted file mode 100644 index 9f44959..0000000 --- a/scripts/create_dataset_cli.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/bin/bash - -# Directory for processed data - -LOG_DIR="logs_cli" -SAMPLE_TAG="Xenium_FFPE_Human_Breast_Cancer_Rep1" -DATASET_DIR="data_raw/xenium" -TRANSCRIPTS_FILE="transcripts.parquet" -BOUNDARIES_FILE="${SAMPLE_TAG}/nucleus_boundaries.parquet" -DATA_DIR="data_tidy/pyg_datasets/${SAMPLE_TAG}" -GRID_SIZE=10 # 10x10 grid -OVERLAP=50 # Overlap between tiles (in the same units as your coordinates) - -# Create logs directory if not exists -mkdir -p $LOG_DIR - -# Load the dataset to calculate the bounds -python -c " -from segger.data.io import XeniumSample -from pathlib import Path -sample = XeniumSample() -sample.load_transcripts(base_path=Path('$DATASET_DIR'), sample='$SAMPLE_TAG', transcripts_filename='$TRANSCRIPTS_FILE', file_format='parquet') -sample.load_boundaries('$DATASET_DIR/$BOUNDARIES_FILE', file_format='parquet') -x_min, y_min, x_max, y_max = sample.x_min, sample.y_min, sample.x_max, sample.y_max -tile_width = (x_max - x_min) / $GRID_SIZE + $OVERLAP -tile_height = (y_max - y_min) / $GRID_SIZE + $OVERLAP -print(f'{x_min},{y_min},{x_max},{y_max},{tile_width},{tile_height}') -" > bounds.txt - - -# python -c " -# from segger.data.io import XeniumSample -# from pathlib import Path -# sample = XeniumSample() -# sample.load_transcripts(base_path=Path('data_raw/xenium'), sample='Xenium_FFPE_Human_Breast_Cancer_Rep1', transcripts_filename='transcripts.parquet', file_format='parquet') -# sample.load_boundaries('data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1/nucleus_boundaries.parquet', file_format='parquet') -# x_min, y_min, x_max, y_max = sample.x_min, sample.y_min, sample.x_max, sample.y_max -# tile_width = (x_max - x_min) / 10 + 50 -# tile_height = (y_max - y_min) / 10 + 50 -# print(f'{x_min},{y_min},{x_max},{y_max},{tile_width},{tile_height}') -# " - -# Read the calculated bounds -read X_MIN_GLOBAL Y_MIN_GLOBAL X_MAX_GLOBAL Y_MAX_GLOBAL TILE_WIDTH TILE_HEIGHT < <(cat bounds.txt) - -# Iterate over a GRID_SIZE x GRID_SIZE grid -for i in $(seq 0 $(($GRID_SIZE - 1))) -do - for j in $(seq 0 $(($GRID_SIZE - 1))) - do - # 
Calculate the bounding box for this tile - X_MIN=$(echo "$X_MIN_GLOBAL + $i * ($TILE_WIDTH - $OVERLAP)" | bc) - Y_MIN=$(echo "$Y_MIN_GLOBAL + $j * ($TILE_HEIGHT - $OVERLAP)" | bc) - X_MAX=$(echo "$X_MIN + $TILE_WIDTH" | bc) - Y_MAX=$(echo "$Y_MIN + $TILE_HEIGHT" | bc) - - # Ensure we don't exceed global bounds - X_MAX=$(echo "if($X_MAX > $X_MAX_GLOBAL) $X_MAX_GLOBAL else $X_MAX" | bc) - Y_MAX=$(echo "if($Y_MAX > $Y_MAX_GLOBAL) $Y_MAX_GLOBAL else $Y_MAX" | bc) - - # Create a job submission script for this tile - JOB_SCRIPT="jobs/job_${i}_${j}.sh" - echo "#!/bin/bash" > $JOB_SCRIPT - echo "source activate segger_dev" >> $JOB_SCRIPT # Activate your conda environment if needed - echo "python -m segger.cli.create_dataset \\" >> $JOB_SCRIPT - echo " --dataset_type xenium \\" >> $JOB_SCRIPT - echo " --sample_tag $SAMPLE_TAG \\" >> $JOB_SCRIPT - echo " --dataset_dir $DATASET_DIR \\" >> $JOB_SCRIPT - echo " --data_dir $DATA_DIR \\" >> $JOB_SCRIPT - echo " --transcripts_file $TRANSCRIPTS_FILE \\" >> $JOB_SCRIPT - echo " --boundaries_file $BOUNDARIES_FILE \\" >> $JOB_SCRIPT - echo " --method kd_tree \\" >> $JOB_SCRIPT - echo " --x_min $X_MIN --y_min $Y_MIN --x_max $X_MAX --y_max $Y_MAX" >> $JOB_SCRIPT - chmod +x $JOB_SCRIPT - - # Submit the job to the cluster - # bsub -R "rusage[mem=200G]" -q long -o "$LOG_DIR/job_${i}_${j}.log" < $JOB_SCRIPT - done -done - -# Clean up -rm bounds.txt diff --git a/scripts/create_tile_parquet.py b/scripts/create_tile_parquet.py deleted file mode 100644 index 904cc84..0000000 --- a/scripts/create_tile_parquet.py +++ /dev/null @@ -1,52 +0,0 @@ -import scanpy as sc -from segger.data.io import * -from segger.data.utils import * -from pathlib import Path -import time -import pandas as pd -import matplotlib.pyplot as plt -from segger.data import XeniumSample, SpatialTranscriptomicsSample -from dask import delayed -import geopandas as gpd - -# Paths for raw and processed data -raw_data_dir = Path('data_raw/xenium/') -processed_data_dir = Path('data_tidy/pyg_datasets') -sample_tag = "Xenium_FFPE_Human_Breast_Cancer_Rep1" - -# Ensure directories exist -raw_data_dir.mkdir(parents=True, exist_ok=True) -processed_data_dir.mkdir(parents=True, exist_ok=True) - -# Define paths for transcripts and nuclei data -transcripts_path = raw_data_dir / sample_tag / "transcripts.parquet" -nuclei_path = raw_data_dir / sample_tag / "nucleus_boundaries.parquet" - -# Step 1: Set paths for transcripts and boundaries -xenium_sample = XeniumSample() -xenium_sample.set_file_paths(transcripts_path=transcripts_path, boundaries_path=nuclei_path) -xenium_sample.set_metadata() - - - -# Step 2: Use save_dataset_for_segger directly to handle tiling, processing, and lazy reading of each tile -start_time = time.time() -xenium_sample.save_dataset_for_segger( - processed_dir=processed_data_dir / 'fixed_0911', - x_size=300, - y_size=300, - d_x=280, - d_y=280, - margin_x=10, - margin_y=10, - compute_labels=True, # Set to True if you need to compute labels - r_tx=5, - k_tx=5, - val_prob=0.1, - test_prob=0.2, - neg_sampling_ratio_approx=5, - sampling_rate=1, - num_workers=1, -) -end_time = time.time() -print(f"Time to save dataset: {end_time - start_time} seconds") \ No newline at end of file diff --git a/scripts/create_toy_datasets.py b/scripts/create_toy_datasets.py deleted file mode 100644 index 65b886d..0000000 --- a/scripts/create_toy_datasets.py +++ /dev/null @@ -1,85 +0,0 @@ -import dask.dataframe as dd -import tifffile as tiff # Use tifffile instead of PIL for OME-TIFF -import matplotlib.pyplot as plt -import 
matplotlib.patches as patches -from pathlib import Path - -# Define the paths to the input Parquet and TIFF files (update these paths to match your file locations) -transcripts_file = "data_raw/xenium/Xenium_V1_hPancreas_Cancer_Add_on_FFPE_outs/transcripts.parquet" -nuclei_file = "data_raw/xenium/Xenium_V1_hPancreas_Cancer_Add_on_FFPE_outs/nucleus_boundaries.parquet" -cell_boundaries_file = "data_raw/xenium/Xenium_V1_hPancreas_Cancer_Add_on_FFPE_outs/cell_boundaries.parquet" -morphology_tiff_file = "data_raw/xenium/Xenium_V1_hPancreas_Cancer_Add_on_FFPE_outs/morphology.ome.tif" - -# Define the output directory for the toy dataset -output_dir = Path("data_raw/package_toy_data/xenium_pancreas_cancer") -output_dir.mkdir(parents=True, exist_ok=True) - -def find_fovs_in_square(transcripts_file, square_size=3): - print(f"Loading transcripts from {transcripts_file} using Dask...") - transcripts_df = dd.read_parquet(transcripts_file) - fov_list = transcripts_df['fov_name'].drop_duplicates().compute().tolist() - sorted_fovs = sorted(fov_list) - middle_index = len(sorted_fovs) // 2 - half_size = square_size // 2 - start_index = max(middle_index - half_size, 0) - end_index = min(middle_index + half_size + 1, len(sorted_fovs)) - selected_fovs = sorted_fovs[start_index:end_index] - if len(selected_fovs) < square_size ** 2: - print("Warning: The selected square is smaller than expected due to slide boundaries.") - print(f"Selected FOVs: {selected_fovs}") - return selected_fovs, transcripts_df - -def filter_transcripts_by_fovs(transcripts_df, fovs): - print("Filtering transcripts based on selected FOVs...") - filtered_transcripts = transcripts_df[transcripts_df['fov_name'].isin(fovs)] - return filtered_transcripts.compute() - -def filter_boundaries_by_cells(file_path, cell_ids): - print(f"Loading boundaries from {file_path} using Dask...") - boundaries_df = dd.read_parquet(file_path) - filtered_boundaries = boundaries_df[boundaries_df['cell_id'].isin(cell_ids)] - return filtered_boundaries.compute() - -def save_to_parquet(df, output_file): - print(f"Saving data to {output_file}...") - df.to_parquet(output_file, index=False) - print(f"Data saved to {output_file}.") - -def visualize_fovs_on_tiff(fovs_in_square, filtered_fovs_df, tiff_image_file, output_image_file, fov_column='fov_name', x_column='x', y_column='y', width_column='width', height_column='height'): - print(f"Loading TIFF image from {tiff_image_file}...") - tiff_image = tiff.imread(tiff_image_file) - plt.figure(figsize=(10, 10)) - plt.imshow(tiff_image, cmap='gray') - for _, row in filtered_fovs_df.iterrows(): - x, y, width, height = row[x_column], row[y_column], row[width_column], row[height_column] - rect = patches.Rectangle((x, y), width, height, linewidth=1, edgecolor='r', facecolor='none') - plt.gca().add_patch(rect) - plt.title("Selected FOVs over Morphology TIFF Image") - plt.savefig(output_image_file) - plt.show() - -# Step 1: Get FOVs from the transcripts file for the middle square -square_size = 4 # Example: 4x4 FOVs -fovs_in_square, transcripts_df = find_fovs_in_square(transcripts_file, square_size) - -# Step 2: Filter transcripts for the selected FOVs -filtered_transcripts = filter_transcripts_by_fovs(transcripts_df, fovs_in_square) - -# Step 3: Get the cell_ids from the filtered transcripts -cell_ids_in_fovs = filtered_transcripts['cell_id'].unique() - -# Step 4: Process and save filtered cell boundaries for the selected FOVs -cell_boundaries_df = filter_boundaries_by_cells(cell_boundaries_file, cell_ids_in_fovs) 
-save_to_parquet(cell_boundaries_df, output_dir / f"cell_boundaries.parquet") - -# Step 5: Process and save filtered nuclei boundaries for the selected FOVs -nuclei_boundaries_df = filter_boundaries_by_cells(nuclei_file, cell_ids_in_fovs) -save_to_parquet(nuclei_boundaries_df, output_dir / f"nuclei_boundaries.parquet") - -# Step 6: Process and save filtered transcripts for the selected FOVs -save_to_parquet(filtered_transcripts, output_dir / f"transcripts.parquet") - -# Step 7: Visualize the selected FOVs as squares on top of the TIFF image and save the plot -visualize_fovs_on_tiff(fovs_in_square, filtered_transcripts, morphology_tiff_file, output_dir / "fovs_on_tiff.png") - -print("Toy dataset generation and visualization complete!") diff --git a/scripts/predict.py b/scripts/predict.py deleted file mode 100644 index 318d26c..0000000 --- a/scripts/predict.py +++ /dev/null @@ -1,31 +0,0 @@ -import argparse -from segger.prediction.predict import load_model, predict - -def main(args: argparse.Namespace) -> None: - """ - Main function to load the model and perform predictions. - - Args: - args (argparse.Namespace): Command line arguments. - """ - litsegger = load_model(args.checkpoint_path, args.init_emb, args.hidden_channels, args.out_channels, args.heads, args.aggr) - predict(litsegger, args.dataset_path, args.output_path, args.score_cut, args.k_nc, args.dist_nc, args.k_tx, args.dist_tx) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Predict using the Segger model") - parser.add_argument('--dataset_path', type=str, required=True, help='Path to the dataset directory') - parser.add_argument('--output_path', type=str, required=True, help='Path to save the predictions') - parser.add_argument('--checkpoint_path', type=str, required=True, help='Path to the model checkpoint') - parser.add_argument('--init_emb', type=int, default=8, help='Initial embedding size') - parser.add_argument('--hidden_channels', type=int, default=64, help='Number of hidden channels') - parser.add_argument('--out_channels', type=int, default=16, help='Number of output channels') - parser.add_argument('--heads', type=int, default=4, help='Number of attention heads') - parser.add_argument('--aggr', type=str, default='sum', help='Aggregation method') - parser.add_argument('--score_cut', type=float, default=0.5, help='Score cut-off for predictions') - parser.add_argument('--k_nc', type=int, default=4, help='Number of nearest neighbors for nuclei') - parser.add_argument('--dist_nc', type=int, default=20, help='Distance threshold for nuclei') - parser.add_argument('--k_tx', type=int, default=5, help='Number of nearest neighbors for transcripts') - parser.add_argument('--dist_tx', type=int, default=10, help='Distance threshold for transcripts') - - args = parser.parse_args() - main(args) diff --git a/scripts/predict_model_cuda.py b/scripts/predict_model_cuda.py deleted file mode 100644 index 302da40..0000000 --- a/scripts/predict_model_cuda.py +++ /dev/null @@ -1,170 +0,0 @@ -import torch -import cupy as cp -import dask.array as da -from dask.distributed import Client, wait -from dask_cuda import LocalCUDACluster -from torch.utils.dlpack import to_dlpack, from_dlpack # DLPack conversion -from segger.training.segger_data_module import SeggerDataModule -from segger.prediction.predict import load_model -from cupyx.scipy.sparse import coo_matrix -import torch.distributed as dist -from pathlib import Path -from cuvs.neighbors import cagra -import os -import cuvs -import rmm - -# Initialize RMM -# from 
rmm.allocators.cupy import rmm_cupy_allocator - -# Initialize RMM with a pool allocator -rmm.reinitialize( - pool_allocator=True, # Enable memory pool - initial_pool_size=2**30 # Set 1GB initial pool size, adjust as needed -) - -# Set RMM as the allocator for CuPy -# cp.cuda.set_allocator(rmm_cupy_allocator) - - - -# Function to compute edge indices using spatial locations -def get_edge_index_cuda(coords_1: torch.Tensor, coords_2: torch.Tensor, k: int = 10, dist: float = 10.0) -> torch.Tensor: - def cupy_to_torch(cupy_array): - return torch.from_dlpack((cupy_array.toDlpack())) - def torch_to_cupy(tensor): - if not tensor.is_contiguous(): - tensor = tensor.contiguous() # Ensure tensor is contiguous - return cp.fromDlpack(to_dlpack(tensor)) # Convert PyTorch tensor to CuPy - print("Converting tensors to CuPy...") # Debug log - cp_coords_1 = torch_to_cupy(coords_1) - cp_coords_2 = torch_to_cupy(coords_2) - cp_dist = cp.float32(dist) - print("Building index...") # Debug log - index_params = cagra.IndexParams() - search_params = cagra.SearchParams() - try: - # Build index and search for nearest neighbors - index = cagra.build_index(index_params, cp_coords_1) - D, I = cagra.search(search_params, index, cp_coords_2, k) - except cuvs.common.exceptions.CuvsException as e: - print(f"cuVS Exception: {e}") - raise - print("Processing search results...") # Debug log - valid_mask = cp.asarray(D < cp_dist ** 2) - repeats = valid_mask.sum(axis=1).tolist() - row_indices = cp.repeat(cp.arange(len(cp_coords_2)), repeats) - valid_indices = cp.asarray(I)[cp.where(valid_mask)] - edges = cp.vstack((row_indices, valid_indices)).T - edge_index = cupy_to_torch(edges).long().contiguous() - return edge_index - -# Set up a Dask cluster with local GPUs -cluster = LocalCUDACluster(rmm_pool_size="5GB", scheduler_port=8786, dashboard_address=":8787", worker_port=(9000, 9100)) -client = Client(cluster, timeout='500s') - - - -def initialize_distributed(rank, world_size): - os.environ['RANK'] = str(rank) - os.environ['WORLD_SIZE'] = str(world_size) - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '29500' # Any free port can be used here - dist.init_process_group(backend='nccl', rank=rank, world_size=world_size) - - - -# Initialize the PyTorch distributed environment (NCCL backend) -# dist.init_process_group(backend='nccl') - -# Load the model (only done once on the main process) -model_version = 2 -models_dir = Path('./models/bc_embedding_0919') -model_path = models_dir / 'lightning_logs' / f'version_{model_version}' -model = load_model(model_path / 'checkpoints') - -# Scatter the model to all workers (GPUs) -# scattered_model = client.scatter(model) - -# Define the sparse_multiply function -def sparse_multiply(mat1, mat2_T, edge_index, block_info=None): - mat1 = cp.asarray(mat1) - mat2_T = cp.asarray(mat2_T) - # If block_info is provided, we adjust the edge indices for the local chunk - if block_info is not None: - row_block_start, row_block_end = block_info[0]['array-location'][0] - col_block_start, col_block_end = block_info[1]['array-location'][1] - rows, cols = edge_index - row_mask = (rows >= row_block_start) & (rows < row_block_end) - col_mask = (cols >= col_block_start) & (cols < col_block_end) - mask = row_mask & col_mask - # Adjust to local chunk indices for rows and columns - rows = rows[mask] - row_block_start - cols = cols[mask] - col_block_start - else: - # If block_info is None, assume we use the entire matrix - rows, cols = edge_index - # Perform dense multiplication for the current 
chunk or the full matrix - dense_result = cp.dot(mat1, mat2_T) - # Create the sparse result using the provided edge index - sparse_result = coo_matrix((dense_result[rows, cols], (rows, cols)), shape=dense_result.shape) - # Free GPU memory after each chunk computation - cp.cuda.Stream.null.synchronize() - cp.get_default_memory_pool().free_all_blocks() - return sparse_result - - -def inference_with_sparse_multiply(batch, model, rank, world_size, k=10, dist_r=10.0): - # Initialize the distributed process group inside the worker - if not dist.is_initialized(): - initialize_distributed(rank, world_size) - batch.to(f'cuda:{rank}') - # Load model inside the function to avoid pickling it - model = model.to(f'cuda:{rank}') # Make sure the model is on the correct GPU - # Perform inference using the loaded model - with torch.no_grad(): - output = model.model(batch.x_dict, batch.edge_index_dict) - # Convert output to CuPy arrays using dlpack for further operations - mat1 = cp.fromDlpack(to_dlpack(output['tx'])) - mat2 = cp.fromDlpack(to_dlpack(output['tx'])) - # Transpose mat2 for matrix multiplication - mat2_T = cp.transpose(mat2) - # Compute edge_index based on the 2D positions of tx nodes - coords_1 = batch['tx'].pos[:, :2] # Extract 2D positions - coords_2 = batch['tx'].pos[:, :2] # Assuming the same set of coordinates for the example - edge_index = get_edge_index_cuda(coords_1, coords_2, k=k, dist=dist_r) - # Perform sparse multiplication using the function - result = sparse_multiply(mat1, mat2_T, edge_index) - return result - -# Initialize DataLoader -segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_0919') -dm = SeggerDataModule( - data_dir=segger_data_dir, - batch_size=1, - num_workers=1, -) -dm.setup() - - -world_size = 1 # Adjust based on number of GPUs - -futures = [] -for i, batch in enumerate(dm.train_dataloader()): - # Scatter the batch to each GPU worker - scattered_batch = client.scatter(batch) - for rank in range(world_size): - futures.append(client.submit(inference_with_sparse_multiply, scattered_batch, model, rank, world_size, k=10, dist_r=3, retries=3)) - # Gather results from all GPUs - print(f"Batch {i} processed with dynamic edge index and sparse multiplication.") - -with ProgressBar(): - results = client.gather(futures) - - -# Call the function and get results in memory -all_results = process_all_batches() -print("All batches processed.") - -# Clean up NCCL -dist.destroy_process_group() diff --git a/scripts/predict_model_sample.py b/scripts/predict_model_sample.py index 82b6787..f875c82 100644 --- a/scripts/predict_model_sample.py +++ b/scripts/predict_model_sample.py @@ -1,5 +1,5 @@ from segger.training.segger_data_module import SeggerDataModule -from segger.prediction.predict import segment, get_similarity_scores, load_model, predict_batch +from segger.prediction.predict import segment, get_similarity_scores, load_model, predict_batch, predict from pathlib import Path from matplotlib import pyplot as plt import seaborn as sns @@ -9,9 +9,13 @@ import pandas as pd from pathlib import Path os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' +import cupy as cp +from dask.distributed import Client, LocalCluster +from dask_cuda import LocalCUDACluster +import dask.dataframe as dd -segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_0919') -models_dir = Path('./models/bc_embedding_0919') +segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_1001') +models_dir = Path('./models/bc_embedding_1001_small') benchmarks_dir = 
Path('/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc') transcripts_file = 'data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1/transcripts.parquet' # Initialize the Lightning data module @@ -24,69 +28,25 @@ dm.setup() -model_version = 2 +model_version = 0 # Load in latest checkpoint model_path = models_dir / 'lightning_logs' / f'version_{model_version}' model = load_model(model_path / 'checkpoints') -# batch = next(iter(dm.train_dataloader())).to('cuda') -# print(batch) -# outs = model.model(batch.x_dict, batch.edge_index_dict) - - -# def get_similarity_scores( -# model: torch.nn.Module, -# batch: Batch, -# from_type: str, -# to_type: str, -# receptive_field: dict -# ) - - -# def predict_batch( -# lit_segger: torch.nn.Module, -# batch: Batch, -# score_cut: float, -# receptive_field: Dict[str, float], -# use_cc: bool = True, -# knn_method: str = 'cuda' -# ) -> pd.DataFrame: - -for batch in dm.train_dataloader(): - batch = batch.to('cuda') - # outs = get_similarity_scores( - # model= model.model - # batch=batch, - # from_type='tx', - # to_type='bd', - # receptive_field={'k_bd': 4, 'dist_bd': 10,'k_tx': 5, 'dist_tx': 3} - # ) - outs = predict_batch( - lit_segger=model, - batch=batch, - score_cut=.5, - receptive_field={'k_bd': 4, 'dist_bd': 10,'k_tx': 5, 'dist_tx': 3}, - use_cc = False, - knn_method= 'cuda' - ) - print(outs) - -# dm.setup() - -# receptive_field = {'k_bd': 4, 'dist_bd': 10,'k_tx': 5, 'dist_tx': 3} - -# segment( -# model, -# dm, -# save_dir=benchmarks_dir, -# seg_tag='test_segger_segment', -# transcript_file=transcripts_file, -# file_format='anndata', -# receptive_field = receptive_field, -# min_transcripts=10, -# max_transcripts=1000, -# cell_id_col='segger_cell_id', -# use_cc=False, -# knn_method='kd_tree' -# ) \ No newline at end of file +receptive_field = {'k_bd': 4, 'dist_bd': 12,'k_tx': 5, 'dist_tx': 5} + +segment( + model, + dm, + save_dir=benchmarks_dir, + seg_tag='segger_embedding_1001_cc_true', + transcript_file=transcripts_file, + file_format='anndata', + receptive_field = receptive_field, + min_transcripts=5, + # max_transcripts=1500, + cell_id_col='segger_cell_id', + use_cc=True, + knn_method='cuda' +) \ No newline at end of file diff --git a/scripts/train_model.sh b/scripts/train_model.sh deleted file mode 100644 index 0b91b67..0000000 --- a/scripts/train_model.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -# Submit the job with bsub, requesting 4 GPUs and setting GPU memory limit to 20G -bsub -o logs_0910 -R "tensorcore" -gpu "num=4:gmem=20G" -q gpu CUDA_LAUNCH_BLOCKING=1 \ -python ../segger_dev/scripts/train_model.py \ - --train_dir data_tidy/pyg_datasets/clean_parallel/train_tiles \ - --val_dir data_tidy/pyg_datasets/clean_parallel/val_tiles \ - --batch_size_train 4 \ - --batch_size_val 4 \ - --num_tx_tokens 1000 \ - --init_emb 8 \ - --hidden_channels 64 \ - --out_channels 16 \ - --heads 4 \ - --mid_layers 1 \ - --aggr sum \ - --accelerator cpu \ - --strategy auto \ - --precision 16-mixed \ - --devices 4 \ - --epochs 100 \ - --default_root_dir ./models/clean_parallel diff --git a/scripts/train_model_sample.py b/scripts/train_model_sample.py index 2ee3a55..ec3611a 100644 --- a/scripts/train_model_sample.py +++ b/scripts/train_model_sample.py @@ -14,13 +14,13 @@ import os -segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_0919') -models_dir = Path('./models/bc_embedding_0919') +segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_1001') +models_dir = Path('./models/bc_embedding_1001_small') dm = 
SeggerDataModule( data_dir=segger_data_dir, - batch_size=1, - num_workers=1, + batch_size=4, + num_workers=2, ) dm.setup() diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..964fe9b --- /dev/null +++ b/setup.py @@ -0,0 +1,90 @@ +from setuptools import setup, find_packages + +setup( + name="segger", + version="0.1.0", + description="Fast and accurate cell segmentation for single-molecule spatial omics", + author="Elyas Heidari", + author_email="elyas.heidari@dkfz-heidelberg.de", + license="MIT", + long_description=open("README.md").read(), + long_description_content_type="text/markdown", + python_requires=">=3.10", + keywords=["segmentation", "deep learning", "pytorch", "geometric deep learning"], + install_requires=[ + "torch>=2.0.0", + "numpy>=1.21.0", + "pandas>=1.3.0", + "scipy>=1.7.0", + "matplotlib>=3.4.0", + "seaborn>=0.11.0", + "tqdm>=4.61.0", + "torchvision>=0.10.0", + "lightning>=1.9.0", + "torchmetrics>=0.5.0", + "scanpy>=1.9.3", + "squidpy==1.2.0", + "adjustText>=0.8", + "scikit-learn>=0.24.0", + "geopandas>=0.9.0", + "shapely>=1.7.0", + "path>=17.0.0", + "pyarrow>=17.0.0", + "torch-geometric>=2.2.0", + "dask_geopandas>=0.4.0" + ], + extras_require={ + "gpu": [ + "cuml>=21.08", + "cudf>=21.08", + "cugraph>=21.08", + "cuspatial>=21.08", + "faiss-cpu>=1.7.0", + "faiss-gpu>=1.7.0" + ], + "torch-geometric": [ + "torch-scatter>=2.1.2", + "torch-sparse>=0.6.18", + "torch-cluster>=1.6.3", + ], + "multiprocessing": ["multiprocessing"], + "dev": [ + "pytest", + "black", + "flake8", + "pre-commit", + "twine>=4.0.2", + ], + "docs": [ + "docutils>=0.8,!=0.18.*,!=0.19.*", + "sphinx>=4.1", + "sphinx-book-theme>=1.0.0", + "myst-nb", + "myst-parser", + "sphinxcontrib-bibtex>=1.0.0", + "sphinx-autodoc-typehints", + "sphinx_rtd_theme", + "sphinxext-opengraph", + "sphinx-copybutton", + "sphinx-design", + "sphinx-hoverxref", + "ipykernel", + "ipython", + "pandas", + ], + "tests": [ + "pytest", + "coverage", + ], + }, + url="https://github.com/EliHei2/segger_dev", + project_urls={ + "Bug Tracker": "https://github.com/EliHei2/segger_dev/issues", + "Documentation": "https://EliHei2.github.io/segger_dev", + "Source Code": "https://github.com/EliHei2/segger_dev", + "Homepage": "https://EliHei2.github.io/segger_dev", + "Repository": "https://github.com/EliHei2/segger_dev", + }, + packages=find_packages(where="src"), + package_dir={"": "src"}, +) diff --git a/src/segger/data/io.py b/src/segger/data/io.py index fe855ed..a369b9f 100644 --- a/src/segger/data/io.py +++ b/src/segger/data/io.py @@ -59,7 +59,7 @@ def __init__( self.transcripts_radius = transcripts_radius self.boundaries_graph = boundaries_graph self.keys = keys - self.embedding_df = None + self.embedding_df = embedding_df self.current_embedding = 'token' self.verbose = verbose @@ -190,7 +190,7 @@ def load_transcripts( transcripts_df = self.filter_transcripts(transcripts_df) # Handle additional embeddings if provided - if self.embedding_df: + if not self.embedding_df.empty: valid_genes = self.embedding_df.index # Lazily count the number of rows in the DataFrame before filtering initial_count = delayed(lambda df: df.shape[0])(transcripts_df) @@ -895,12 +895,13 @@ def build_pyg_data_from_tile( transcripts_df['token'] = token_encoding # Store the integer tokens in the 'token' column data['tx'].token = torch.as_tensor(token_encoding).int() # Handle additional embeddings lazily as well - if self.embedding_df: + if not self.embedding_df.empty: embeddings = delayed(lambda df: self.embedding_df.loc[ 
df[self.keys.FEATURE_NAME.value].values ].values)(transcripts_df) else: embeddings = token_encoding + embeddings = embeddings.compute() x_features = torch.as_tensor(embeddings).int() data['tx'].x = x_features diff --git a/src/segger/data/utils.py b/src/segger/data/utils.py index 8c42636..3abd5b1 100644 --- a/src/segger/data/utils.py +++ b/src/segger/data/utils.py @@ -95,21 +95,19 @@ def compute_transcript_metrics( df_filtered = df[df['qv'] > qv_threshold] total_transcripts = len(df_filtered) assigned_transcripts = df_filtered[df_filtered[cell_id_col] != -1] - percent_assigned = len(assigned_transcripts) / total_transcripts * 100 + percent_assigned = len(assigned_transcripts) / (total_transcripts+1) * 100 cytoplasmic_transcripts = assigned_transcripts[assigned_transcripts['overlaps_nucleus'] != 1] - percent_cytoplasmic = len(cytoplasmic_transcripts) / len(assigned_transcripts) * 100 + percent_cytoplasmic = len(cytoplasmic_transcripts) / (len(assigned_transcripts) + 1)* 100 percent_nucleus = 100 - percent_cytoplasmic non_assigned_transcripts = df_filtered[df_filtered[cell_id_col] == -1] non_assigned_cytoplasmic = non_assigned_transcripts[non_assigned_transcripts['overlaps_nucleus'] != 1] - percent_non_assigned_cytoplasmic = len(non_assigned_cytoplasmic) / len(non_assigned_transcripts) * 100 - + percent_non_assigned_cytoplasmic = len(non_assigned_cytoplasmic) / (len(non_assigned_transcripts)+1) * 100 gene_group_assigned = assigned_transcripts.groupby('feature_name') gene_group_all = df_filtered.groupby('feature_name') - gene_percent_assigned = (gene_group_assigned.size() / gene_group_all.size() * 100).reset_index(name='percent_assigned') + gene_percent_assigned = (gene_group_assigned.size() / (gene_group_all.size()+1) * 100).reset_index(names='percent_assigned') cytoplasmic_gene_group = cytoplasmic_transcripts.groupby('feature_name') - gene_percent_cytoplasmic = (cytoplasmic_gene_group.size() / len(cytoplasmic_transcripts) * 100).reset_index(name='percent_cytoplasmic') + gene_percent_cytoplasmic = (cytoplasmic_gene_group.size() / (len(cytoplasmic_transcripts)+1) * 100).reset_index(name='percent_cytoplasmic') gene_metrics = pd.merge(gene_percent_assigned, gene_percent_cytoplasmic, on='feature_name', how='outer').fillna(0) - results = { 'percent_assigned': percent_assigned, 'percent_cytoplasmic': percent_cytoplasmic, @@ -144,8 +142,9 @@ def create_anndata( Returns: ad.AnnData: The generated AnnData object containing the transcriptomics data and metadata. 
""" - df_filtered = filter_transcripts(df, min_qv=qv_threshold) - metrics = compute_transcript_metrics(df_filtered, qv_threshold, cell_id_col) + # df_filtered = filter_transcripts(df, min_qv=qv_threshold) + df_filtered = df + # metrics = compute_transcript_metrics(df_filtered, qv_threshold, cell_id_col) df_filtered = df_filtered[df_filtered[cell_id_col].astype(str) != '-1'] pivot_df = df_filtered.rename(columns={ cell_id_col: "cell", @@ -156,28 +155,28 @@ def create_anndata( for cell_id, cell_data in df_filtered.groupby(cell_id_col): if len(cell_data) < min_transcripts: continue - cell_convex_hull = ConvexHull(cell_data[['x_location', 'y_location']]) + cell_convex_hull = ConvexHull(cell_data[['x_location', 'y_location']], qhull_options='QJ') cell_area = cell_convex_hull.area if cell_area < min_cell_area or cell_area > max_cell_area: continue - if 'nucleus_distance' in cell_data: - nucleus_data = cell_data[cell_data['nucleus_distance'] == 0] - else: - nucleus_data = cell_data[cell_data['overlaps_nucleus'] == 1] - if len(nucleus_data) >= 3: - nucleus_convex_hull = ConvexHull(nucleus_data[['x_location', 'y_location']]) - else: - nucleus_convex_hull = None + # if 'nucleus_distance' in cell_data: + # nucleus_data = cell_data[cell_data['nucleus_distance'] == 0] + # else: + # nucleus_data = cell_data[cell_data['overlaps_nucleus'] == 1] + # if len(nucleus_data) >= 3: + # nucleus_convex_hull = ConvexHull(nucleus_data[['x_location', 'y_location']]) + # else: + # nucleus_convex_hull = None cell_summary.append({ "cell": cell_id, "cell_centroid_x": cell_data['x_location'].mean(), "cell_centroid_y": cell_data['y_location'].mean(), "cell_area": cell_area, - "nucleus_centroid_x": nucleus_data['x_location'].mean() if len(nucleus_data) > 0 else cell_data['x_location'].mean(), - "nucleus_centroid_y": nucleus_data['x_location'].mean() if len(nucleus_data) > 0 else cell_data['x_location'].mean(), - "nucleus_area": nucleus_convex_hull.area if nucleus_convex_hull else 0, - "percent_cytoplasmic": len(cell_data[cell_data['overlaps_nucleus'] != 1]) / len(cell_data) * 100, - "has_nucleus": len(nucleus_data) > 0 + # "nucleus_centroid_x": nucleus_data['x_location'].mean() if len(nucleus_data) > 0 else cell_data['x_location'].mean(), + # "nucleus_centroid_y": nucleus_data['x_location'].mean() if len(nucleus_data) > 0 else cell_data['x_location'].mean(), + # "nucleus_area": nucleus_convex_hull.area if nucleus_convex_hull else 0, + # "percent_cytoplasmic": len(cell_data[cell_data['overlaps_nucleus'] != 1]) / len(cell_data) * 100, + # "has_nucleus": len(nucleus_data) > 0 }) cell_summary = pd.DataFrame(cell_summary).set_index("cell") if panel_df is not None: @@ -198,8 +197,8 @@ def create_anndata( var_df['feature_types'] = 'Gene Expression' var_df['genome'] = 'Unknown' var_df = var_df.set_index('gene') - gene_metrics = metrics['gene_metrics'].set_index('feature_name') - var_df = var_df.join(gene_metrics, how='left').fillna(0) + # gene_metrics = metrics['gene_metrics'].set_index('feature_name') + # var_df = var_df.join(gene_metrics, how='left').fillna(0) cells = list(set(pivot_df.index) & set(cell_summary.index)) pivot_df = pivot_df.loc[cells,:] cell_summary = cell_summary.loc[cells,:] @@ -209,12 +208,12 @@ def create_anndata( adata.obs['unique_transcripts'] = (pivot_df > 0).sum(axis=1).values adata.obs_names = pivot_df.index.values.tolist() adata.obs = pd.merge(adata.obs, cell_summary.loc[adata.obs_names,:], left_index=True, right_index=True) - adata.uns['metrics'] = { - 'percent_assigned': metrics['percent_assigned'], - 
'percent_cytoplasmic': metrics['percent_cytoplasmic'], - 'percent_nucleus': metrics['percent_nucleus'], - 'percent_non_assigned_cytoplasmic': metrics['percent_non_assigned_cytoplasmic'] - } + # adata.uns['metrics'] = { + # 'percent_assigned': metrics['percent_assigned'], + # 'percent_cytoplasmic': metrics['percent_cytoplasmic'], + # 'percent_nucleus': metrics['percent_nucleus'], + # 'percent_non_assigned_cytoplasmic': metrics['percent_non_assigned_cytoplasmic'] + # } return adata @@ -366,7 +365,9 @@ def get_edge_index_cuda( coords_1: torch.Tensor, coords_2: torch.Tensor, k: int = 10, - dist: float = 10.0 + dist: float = 10.0, + metric: str = "sqeuclidean", + nn_descent_niter: int = 100 ) -> torch.Tensor: """ Computes edge indices using RAPIDS cuVS with cagra for vector similarity search, @@ -383,16 +384,17 @@ def get_edge_index_cuda( """ def cupy_to_torch(cupy_array): return torch.from_dlpack((cupy_array.toDlpack())) - + # gg def torch_to_cupy(tensor): return cp.fromDlpack(dlpack.to_dlpack(tensor)) # Convert PyTorch tensors (CUDA) to CuPy arrays using DLPack - cp_coords_1 = cp.float32(torch_to_cupy(coords_1)) - cp_coords_2 = cp.float32(torch_to_cupy(coords_2)) + cp_coords_1 = torch_to_cupy(coords_1).astype(cp.float32) + cp_coords_2 = torch_to_cupy(coords_2).astype(cp.float32) # Define the distance threshold in CuPy cp_dist = cp.float32(dist) # IndexParams and SearchParams for cagra - index_params = cagra.IndexParams(nn_descent_niter=100) + # compression_params = cagra.CompressionParams(pq_bits=pq_bits) + index_params = cagra.IndexParams(metric=metric,nn_descent_niter=nn_descent_niter) #, compression=compression_params) search_params = cagra.SearchParams() # Build index using CuPy coords index = cagra.build_index(index_params, cp_coords_1) @@ -567,4 +569,17 @@ def format_time(elapsed: float) -> str: str Formatted time in h:m:s. 
""" - return str(timedelta(seconds=int(elapsed))) \ No newline at end of file + return str(timedelta(seconds=int(elapsed))) + + + + + + + + + + + + + diff --git a/src/segger/models/segger_model.py b/src/segger/models/segger_model.py index 5ad9af8..d2e13ad 100644 --- a/src/segger/models/segger_model.py +++ b/src/segger/models/segger_model.py @@ -56,7 +56,7 @@ def forward(self, x: Tensor, edge_index: Tensor) -> Tensor: """ x = torch.nan_to_num(x, nan = 0) is_one_dim = (x.ndim == 1) * 1 - x = x[:, None] + # x = x[:, None] x = self.tx_embedding(((x.sum(1) * is_one_dim).int())) * is_one_dim + self.lin0(x.float()) * (1 - is_one_dim) # First layer x = x.relu() diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index d8bb001..9e5423e 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -16,7 +16,8 @@ get_edge_index_cuda, get_edge_index, format_time, - create_anndata + create_anndata, + coo_to_dense_adj, ) from segger.training.train import LitSegger from segger.training.segger_data_module import SeggerDataModule @@ -31,6 +32,17 @@ from cupyx.scipy.sparse import coo_matrix from torch.utils.dlpack import to_dlpack, from_dlpack +from dask.distributed import Client, LocalCluster +import cupy as cp +import numpy as np +import pandas as pd +from cupyx.scipy.sparse import coo_matrix +from cupyx.scipy.sparse import find # To find non-zero elements in sparse matrix +from scipy.sparse.csgraph import connected_components as cc +from scipy.sparse import coo_matrix as scipy_coo_matrix +# Setup Dask cluster with 3 workers + + # CONFIG torch._dynamo.config.suppress_errors = True @@ -82,6 +94,7 @@ def sort_order(c): return lit_segger + def get_similarity_scores( model: torch.nn.Module, batch: Batch, @@ -105,49 +118,61 @@ def get_similarity_scores( """ # Step 1: Get embeddings from the model batch = batch.to("cuda") - embeddings = model(batch.x_dict, batch.edge_index_dict) - - # Convert PyTorch embeddings to CuPy using DLPack - mat1 = cp.fromDlpack(to_dlpack(embeddings[from_type])) - mat2_T = cp.fromDlpack(to_dlpack(embeddings[to_type])) - - # Step 2: Get edge indices using the 'sees' relationship from the batch + shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] edge_index = get_edge_index( - batch[to_type].pos[:, :2], # 'tx' positions - batch[from_type].pos[:, :2], # 'bd' positions - k=receptive_field['k_bd'], - dist=receptive_field['dist_bd'], - method='cuda' - ) + batch[to_type].pos[:, :2], # 'tx' positions + batch[from_type].pos[:, :2], # 'bd' positions + k=receptive_field[f'k_{to_type}'], + dist=receptive_field[f'dist_{to_type}'], + method='cuda' + ) + edge_index = coo_to_dense_adj( + edge_index.T, + num_nodes=shape[0], + num_nbrs=receptive_field[f'k_{to_type}'], + ) - print(edge_index) + with torch.no_grad(): + embeddings = model(batch.x_dict, batch.edge_index_dict) - # Step 3: Perform sparse matrix multiplication using CuPy - def sparse_multiply(mat1, mat2_T, edge_index) -> coo_matrix: - rows, cols = edge_index + del batch + + # print(edge_index) + # print(embeddings) + + def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: + m = torch.nn.ZeroPad2d((0, 0, 0, 1)) # pad bottom with zeros + + similarity = torch.bmm( + m(embeddings[to_type])[edge_index], # 'to' x 'from' neighbors x embed + embeddings[from_type].unsqueeze(-1) # 'to' x embed x 1 + ) # -> 'to' x 'from' neighbors x 1 + del embeddings + # Sigmoid to get most similar 'to_type' neighbor + similarity[similarity == 0] = -torch.inf # ensure zero stays zero + 
similarity = F.sigmoid(similarity) + # Neighbor-filtered similarity scores + # shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] + indices = torch.argwhere(edge_index != -1).T + indices[1] = edge_index[edge_index != -1] + rows = cp.fromDlpack(to_dlpack(indices[0,:].to('cuda'))) + columns = cp.fromDlpack(to_dlpack(indices[1,:].to('cuda'))) print(rows) - print(cols) - dense_result = cp.dot(mat1, mat2_T.T) - - # Create sparse result matrix - sparse_result = coo_matrix((dense_result[rows, cols], (rows, cols)), shape=dense_result.shape) - - # Free GPU memory after computation - cp.cuda.Stream.null.synchronize() - cp.get_default_memory_pool().free_all_blocks() - + del indices + values = similarity[edge_index != -1].flatten() + sparse_result = coo_matrix((cp.fromDlpack(to_dlpack(values)), (rows, columns)), shape=shape) return sparse_result + # Free GPU memory after computation + # Call the sparse multiply function - sparse_similarity = sparse_multiply(mat1, mat2_T, edge_index.T) - - - scores = torch.from_dlpack(sparse_similarity.toarray().toDlpack()).to("cuda") - - torch.cuda.empty_cache() + sparse_similarity = sparse_multiply(embeddings, edge_index, shape) + gc.collect() + cp.cuda.Stream.null.synchronize() cp.get_default_memory_pool().free_all_blocks() - - return scores + torch.cuda.empty_cache() + # No need to convert to PyTorch tensor; return the CuPy sparse matrix + return sparse_similarity @@ -183,68 +208,78 @@ def _get_id(): """Generate a random Xenium-style ID.""" return ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), 8)) + '-nx' - with torch.no_grad(): + # Use CuPy with GPU context + with cp.cuda.Device(0): # Move batch to GPU batch = batch.to("cuda") - print(batch) - # Initialize assignments DataFrame - transcript_id = batch['tx'].id.cpu().numpy() + # Extract transcript IDs and initialize assignments DataFrame + transcript_id = cp.asnumpy(batch['tx'].id) assignments = pd.DataFrame({'transcript_id': transcript_id}) - if len(batch['bd'].id[0]) > 0: - # Compute edge index using get_edge_index_cuda for 'tx' sees 'bd' - # batch['tx', 'sees', 'bd'].edge_index = get_edge_index( - # batch['tx'].pos[:, :2], # 'tx' positions - # batch['bd'].pos[:, :2], # 'bd' positions - # k=receptive_field['k_bd'], - # dist=receptive_field['dist_bd'], - # method='cuda' - # ) - + if len(batch['bd'].pos) >= 10: # Compute similarity scores between 'tx' and 'bd' scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd", receptive_field) - + torch.cuda.empty_cache() + # Convert sparse matrix to dense format + dense_scores = scores.toarray() # Convert to dense NumPy array + del scores # Remove from memory + cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory # Get direct assignments from similarity matrix - belongs = scores.max(axis=1) - assignments['score'] = belongs.values.cpu() - mask = assignments['score'] > score_cut - - all_ids = np.concatenate(batch['bd'].id)[np.argmax(scores.cpu().numpy(), axis=1)] - assignments.loc[mask, 'segger_cell_id'] = all_ids[mask] + belongs = cp.max(dense_scores, axis=1) # Max score per transcript + assignments['score'] = cp.asnumpy(belongs) # Move back to CPU - # Add 'bound' column (1 for assigned, 0 for unassigned) + mask = assignments['score'] > score_cut + all_ids = np.concatenate(batch['bd'].id) # Keep IDs as NumPy array + assignments['segger_cell_id'] = None # Initialize as None + max_indices = cp.argmax(dense_scores, axis=1).get() + assignments['segger_cell_id'][mask] = all_ids[max_indices[mask]] # Assign IDs + + del dense_scores # 
Remove from memory + cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory + torch.cuda.empty_cache() +# Move back to CPU assignments['bound'] = 0 - assignments.loc[mask, 'bound'] = 1 - + assignments['bound'][mask] = 1 + + if use_cc: - # Compute edge index for 'tx' sees 'tx' using get_edge_index - batch['tx', 'sees', 'tx'].edge_index = get_edge_index( - batch['tx'].pos[:, :2], # 'tx' positions - batch['tx'].pos[:, :2], # 'tx' positions (self-reference) - k=receptive_field['k_tx'], - dist=receptive_field['dist_tx'], - method='cuda' - ) - # Compute similarity scores between 'tx' and 'tx' - scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx") - scores_tx.setdiag(0) # Ignore self-similarity + scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx", receptive_field) + # Convert to dense NumPy array + data_cpu = scores_tx.data.get() # Transfer data to CPU (NumPy) + row_cpu = scores_tx.row.get() # Transfer row indices to CPU (NumPy) + col_cpu = scores_tx.col.get() # Transfer column indices to CPU (NumPy) + + # dense_scores_tx = scores_tx.toarray().astype(cp.float16) + # Rebuild the matrix on CPU using SciPy + dense_scores_tx = scipy_coo_matrix((data_cpu, (row_cpu, col_cpu)), shape=scores_tx.shape).toarray() + + np.fill_diagonal(dense_scores_tx, 0) # Ignore self-similarity + + del scores_tx # Remove from memory + cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory # Assign unassigned transcripts using connected components - no_id = assignments['segger_cell_id'].isna().values - no_id_scores = scores_tx[no_id][:, no_id] - n, comps = cc(no_id_scores, connection="weak", directed=False) - new_ids = np.array([_get_id() for _ in range(n)]) - assignments.loc[no_id, 'segger_cell_id'] = new_ids[comps] + no_id = assignments['segger_cell_id'].isna() + if np.any(no_id): # Only compute if there are unassigned transcripts + no_id_scores = dense_scores_tx[no_id][:, no_id] + del dense_scores_tx # Remove from memory + no_id_scores[no_id_scores < score_cut] = 0 + n, comps = cc(no_id_scores, connection="weak", directed=False) + new_ids = np.array([_get_id() for _ in range(n)]) + assignments['segger_cell_id'][no_id] = new_ids[comps] # Perform memory cleanup to avoid OOM issues - # rmm.reinitialize(pool_allocator=True) + cp.get_default_memory_pool().free_all_blocks() torch.cuda.empty_cache() - gc.collect() return assignments + + + + def predict( lit_segger: LitSegger, data_loader: DataLoader, @@ -252,65 +287,52 @@ def predict( receptive_field: dict, use_cc: bool = True, knn_method: str = 'cuda' -) -> dd.DataFrame: +) -> pd.DataFrame: # Change return type to Dask DataFrame if applicable """ - Optimized prediction for multiple batches of transcript data using Dask and delayed processing with progress bar. - - Args: - lit_segger (LitSegger): The lightning module wrapping the segmentation model. - data_loader (DataLoader): A data loader providing batches of transcript and cell data. - score_cut (float): The threshold for assigning transcripts to cells based on similarity scores. - receptive_field (dict): Dictionary defining the receptive field for transcript-cell and transcript-transcript relations. - use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. Defaults to True. - knn_method (str, optional): The method to use for nearest neighbors ('cuda' by default). Defaults to 'cuda'. - - Returns: - dd.DataFrame: A Dask DataFrame containing the transcript IDs, similarity scores, and assigned cell IDs. 
+ Optimized prediction for multiple batches of transcript data. """ - if len(data_loader) == 0: - return None - - # Create a meta DataFrame for the Dask DataFrame - meta = pd.DataFrame({ - 'transcript_id': pd.Series(dtype='int64'), - 'score': pd.Series(dtype='float32'), - 'segger_cell_id': pd.Series(dtype='object'), - 'bound': pd.Series(dtype='int64') - }) - - # Convert the entire data loader to delayed predictions - delayed_assignments = [ - delayed(predict_batch)(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) - for batch in data_loader - ] - - # Build the Dask DataFrame from the delayed assignments - assignments_dd = dd.from_delayed(delayed_assignments, meta=meta) + all_assignments = [] + + for batch in data_loader: + assignments = predict_batch(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) + all_assignments.append(dd.from_pandas(assignments, npartitions=1)) + + cp.get_default_memory_pool().free_all_blocks() + torch.cuda.empty_cache() - # Max score selection logic, with fallback to unbound scores if no bound=1 - def select_max_score_partition(df): - max_bound_idx = df[df['bound'] == 1].groupby('transcript_id')['score'].idxmax() - max_unbound_idx = df[df['bound'] == 0].groupby('transcript_id')['score'].idxmax() + # Concatenate all assignments into a single Dask DataFrame + final_assignments = dd.concat(all_assignments, ignore_index=True) - # Combine indices, prioritizing bound=1 scores - final_idx = max_bound_idx.combine_first(max_unbound_idx) - result = df.loc[final_idx].reset_index(drop=True) + # Sort the Dask DataFrame by 'transcript_id' before setting it as an index + final_assignments = final_assignments.sort_values(by='transcript_id') - # Handle cases where there's only one entry per 'segger_cell_id' - single_entry_mask = result.groupby('segger_cell_id').size() == 1 - result.loc[single_entry_mask, 'segger_cell_id'] = 'floating' - - return result + # Set a unique index for Dask DataFrame + final_assignments = final_assignments.set_index('transcript_id', sorted=True) - # Map the logic over each partition using Dask - final_assignments = assignments_dd.map_partitions(select_max_score_partition, meta=meta) + # Max score selection logic + max_bound_idx = final_assignments[final_assignments['bound'] == 1].groupby('transcript_id')['score'].idxmax() + max_unbound_idx = final_assignments[final_assignments['bound'] == 0].groupby('transcript_id')['score'].idxmax() - # Trigger garbage collection and free GPU memory - # rmm.reinitialize(pool_allocator=True) - torch.cuda.empty_cache() - gc.collect() + # Combine indices, prioritizing bound=1 scores + final_idx = max_bound_idx.combine_first(max_unbound_idx).compute() # Ensure it's computed + + # Now use the computed final_idx for indexing + result = final_assignments.loc[final_idx].compute().reset_index(names=['transcript_id']) + + # result = results.reset_index() - return final_assignments + # Handle cases where there's only one entry per 'segger_cell_id' + # single_entry_mask = result.groupby('segger_cell_id').size() == 1 +# Handle cases where there's only one entry per 'segger_cell_id' + # single_entry_counts = result['segger_cell_id'].value_counts() # Count occurrences of each ID + # single_entry_mask = single_entry_counts[single_entry_counts == 1].index # Get IDs with a count of 1 + + # # Update 'segger_cell_id' for single entries + # for segger_id in single_entry_mask: + # result.loc[result['segger_cell_id'] == segger_id, 'segger_cell_id'] = 'floating' + + + return result def segment( @@ -357,9 +379,6 
@@ def segment(
         None
     """
     start_time = time.time()
-    # rmm.reinitialize(pool_allocator=True, initial_pool_size=2**26, maximum_pool_size=2**30)
-    # cp.cuda.set_allocator(rmm_cupy_allocator)
-
     # Ensure the save directory exists
     save_dir = Path(save_dir)
     save_dir.mkdir(parents=True, exist_ok=True)
@@ -374,14 +393,20 @@ def segment(
     test_dataloader = dm.test_dataloader()
     val_dataloader = dm.val_dataloader()

-    # delayed_train = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method)
-    # delayed_val = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method)
-    delayed_test = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method)
+    segmentation_train = predict(model, train_dataloader, score_cut, receptive_field, use_cc, knn_method)
+    torch.cuda.empty_cache()
+    cp.get_default_memory_pool().free_all_blocks()
+    gc.collect()

-    delayed_test = delayed_test.compute()
-    # Compute all predictions at once using Dask
-    # with ProgressBar():
-    #     segmentation_train, segmentation_val, segmentation_test = dask.compute(delayed_train, delayed_val, delayed_test)
+    segmentation_val = predict(model, val_dataloader, score_cut, receptive_field, use_cc, knn_method)
+    torch.cuda.empty_cache()
+    cp.get_default_memory_pool().free_all_blocks()
+    gc.collect()
+
+    segmentation_test = predict(model, test_dataloader, score_cut, receptive_field, use_cc, knn_method)
+    torch.cuda.empty_cache()
+    cp.get_default_memory_pool().free_all_blocks()
+    gc.collect()

     if verbose:
         elapsed_time = format_time(time.time() - step_start_time)
@@ -391,13 +416,13 @@ def segment(
     step_start_time = time.time()

     # Combine the segmentation data
-    seg_combined = dd.concat([segmentation_train, segmentation_val, segmentation_test])
-
-    # No need to handle max score logic here, as it's done inside the `predict` function
-    seg_final = seg_combined.compute()
+    seg_combined = pd.concat([segmentation_train, segmentation_val, segmentation_test], ignore_index=True)
+    # seg_combined = segmentation_test
+    print(seg_combined.columns)
+    # print(transcripts_df.id)

     # Drop any unassigned rows
-    seg_final = seg_final.dropna(subset=['segger_cell_id']).reset_index(drop=True)
+    seg_final = seg_combined.dropna(subset=['segger_cell_id']).reset_index(drop=True)

     if verbose:
         elapsed_time = format_time(time.time() - step_start_time)
@@ -411,9 +436,11 @@ def segment(
     if verbose:
         print("Merging segmentation results with transcripts...")

-    # Merge the segmentation results with the transcript data
+    # Convert the segmentation results to a Dask DataFrame, keeping npartitions consistent
     seg_final_dd = dd.from_pandas(seg_final, npartitions=transcripts_df.npartitions)
-    transcripts_df_filtered = transcripts_df.merge(seg_final_dd, on='transcript_id', how='inner').compute()
+
+    # Merge the segmentation results with the transcript data (still as Dask DataFrame)
+    transcripts_df_filtered = transcripts_df.merge(seg_final_dd, on='transcript_id', how='inner')

     if verbose:
         elapsed_time = format_time(time.time() - step_start_time)
@@ -421,22 +448,23 @@ def segment(

     # Step 4: Save the merged result
     step_start_time = time.time()
-    
+
     if verbose:
         print(f"Saving results in {file_format} format...")

     if file_format == 'csv':
         save_path = save_dir / f'{seg_tag}_segmentation.csv'
-        transcripts_df_filtered.to_csv(save_path, index=False)
+        transcripts_df_filtered.compute().to_csv(save_path, index=False)  # Use 
pandas after computing elif file_format == 'parquet': save_path = save_dir / f'{seg_tag}_segmentation.parquet' - transcripts_df_filtered.to_parquet(save_path, index=False) + transcripts_df_filtered.to_parquet(save_path, index=False) # Dask handles Parquet fine elif file_format == 'anndata': save_path = save_dir / f'{seg_tag}_segmentation.h5ad' - segger_adata = create_anndata(transcripts_df_filtered, **anndata_kwargs) + segger_adata = create_anndata(transcripts_df_filtered.compute(), **anndata_kwargs) # Compute for AnnData segger_adata.write(save_path) else: raise ValueError(f"Unsupported file format: {file_format}") + # raise ValueError(f"Unsupported file format: {file_format}") if verbose: elapsed_time = format_time(time.time() - step_start_time) @@ -448,6 +476,283 @@ def segment( print(f"Total segmentation process completed in {total_time}.") # Step 5: Garbage collection and memory cleanup - # rmm.reinitialize(pool_allocator=True) - # torch.cuda.empty_cache() + torch.cuda.empty_cache() gc.collect() + + + + + +# def predict( +# lit_segger: LitSegger, +# data_loader: DataLoader, +# score_cut: float, +# receptive_field: dict, +# use_cc: bool = True, +# knn_method: str = 'cuda' +# ) -> dd.DataFrame: +# """ +# Optimized prediction for multiple batches of transcript data using Dask and delayed processing with progress bar. + +# Args: +# lit_segger (LitSegger): The lightning module wrapping the segmentation model. +# data_loader (DataLoader): A data loader providing batches of transcript and cell data. +# score_cut (float): The threshold for assigning transcripts to cells based on similarity scores. +# receptive_field (dict): Dictionary defining the receptive field for transcript-cell and transcript-transcript relations. +# use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. Defaults to True. +# knn_method (str, optional): The method to use for nearest neighbors ('cuda' by default). Defaults to 'cuda'. + +# Returns: +# dd.DataFrame: A Dask DataFrame containing the transcript IDs, similarity scores, and assigned cell IDs. 
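For orientation, the new `predict` above trades the delayed/Dask-graph pipeline for a plain per-batch loop that frees GPU memory between batches and only afterwards resolves the per-transcript maximum score. A condensed, hedged sketch of that flow, assuming `predict_batch` behaves as elsewhere in this diff (the wrapper name `predict_sketch` is illustrative, not part of the package):

```python
import cupy as cp
import dask.dataframe as dd
import torch

from segger.prediction.predict import predict_batch  # defined in this module per the diff


def predict_sketch(lit_segger, data_loader, score_cut, receptive_field,
                   use_cc=True, knn_method='cuda'):
    """Condensed paraphrase of the new batch-by-batch prediction loop."""
    parts = []
    for batch in data_loader:
        # Each batch yields a pandas DataFrame of transcript-to-cell assignments.
        assignments = predict_batch(lit_segger, batch, score_cut,
                                    receptive_field, use_cc, knn_method)
        parts.append(dd.from_pandas(assignments, npartitions=1))
        # Release CuPy and PyTorch GPU memory before the next batch.
        cp.get_default_memory_pool().free_all_blocks()
        torch.cuda.empty_cache()

    ddf = dd.concat(parts, ignore_index=True)
    ddf = ddf.sort_values(by='transcript_id').set_index('transcript_id', sorted=True)

    # Prefer scores from bound (nucleus-linked) candidates, fall back to unbound ones.
    max_bound = ddf[ddf['bound'] == 1].groupby('transcript_id')['score'].idxmax()
    max_unbound = ddf[ddf['bound'] == 0].groupby('transcript_id')['score'].idxmax()
    final_idx = max_bound.combine_first(max_unbound).compute()

    return ddf.loc[final_idx].compute().reset_index(names=['transcript_id'])
```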
+# """ + + +# if len(data_loader) == 0: +# return None + +# # Create a meta DataFrame for the Dask DataFrame +# meta = pd.DataFrame({ +# 'transcript_id': pd.Series(dtype='int64'), +# 'score': pd.Series(dtype='float32'), +# 'segger_cell_id': pd.Series(dtype='object'), +# 'bound': pd.Series(dtype='int64') +# }) + +# # Convert the entire data loader to delayed predictions +# delayed_assignments = [ +# delayed(predict_batch)(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) +# for batch in data_loader +# ] + +# # Build the Dask DataFrame from the delayed assignments +# assignments_dd = dd.from_delayed(delayed_assignments, meta=meta) + +# # Max score selection logic, with fallback to unbound scores if no bound=1 +# def select_max_score_partition(df): +# max_bound_idx = df[df['bound'] == 1].groupby('transcript_id')['score'].idxmax() +# max_unbound_idx = df[df['bound'] == 0].groupby('transcript_id')['score'].idxmax() + +# # Combine indices, prioritizing bound=1 scores +# final_idx = max_bound_idx.combine_first(max_unbound_idx) +# result = df.loc[final_idx].reset_index(drop=True) + +# # Handle cases where there's only one entry per 'segger_cell_id' +# single_entry_mask = result.groupby('segger_cell_id').size() == 1 +# result.loc[single_entry_mask, 'segger_cell_id'] = 'floating' + +# return result + +# # Map the logic over each partition using Dask +# final_assignments = assignments_dd.map_partitions(select_max_score_partition, meta=meta) + +# # Trigger garbage collection and free GPU memory +# torch.cuda.empty_cache() +# gc.collect() + +# final_assignments = final_assignments.compute() + + + +# return final_assignments + + + + +# # def predict( +# # lit_segger: LitSegger, +# # data_loader: DataLoader, +# # score_cut: float, +# # receptive_field: dict, +# # use_cc: bool = True, +# # knn_method: str = 'cuda' +# # ) -> dd.DataFrame: +# # """ +# # Optimized prediction for multiple batches of transcript data using Dask and delayed processing with progress bar. + +# # Args: +# # lit_segger (LitSegger): The lightning module wrapping the segmentation model. +# # data_loader (DataLoader): A data loader providing batches of transcript and cell data. +# # score_cut (float): The threshold for assigning transcripts to cells based on similarity scores. +# # receptive_field (dict): Dictionary defining the receptive field for transcript-cell and transcript-transcript relations. +# # use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. Defaults to True. +# # knn_method (str, optional): The method to use for nearest neighbors ('cuda' by default). Defaults to 'cuda'. + +# # Returns: +# # dd.DataFrame: A Dask DataFrame containing the transcript IDs, similarity scores, and assigned cell IDs. 
+# # """ +# # if len(data_loader) == 0: +# # return None + +# # # Create a meta DataFrame for the Dask DataFrame +# # meta = pd.DataFrame({ +# # 'transcript_id': pd.Series(dtype='int64'), +# # 'score': pd.Series(dtype='float32'), +# # 'segger_cell_id': pd.Series(dtype='object'), +# # 'bound': pd.Series(dtype='int64') +# # }) + +# # # Convert the entire data loader to delayed predictions +# # delayed_assignments = [ +# # delayed(predict_batch)(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) +# # for batch in data_loader +# # ] + +# # # Build the Dask DataFrame from the delayed assignments +# # assignments_dd = dd.from_delayed(delayed_assignments, meta=meta) + +# # # Max score selection logic, with fallback to unbound scores if no bound=1 +# # def select_max_score_partition(df): +# # max_bound_idx = df[df['bound'] == 1].groupby('transcript_id')['score'].idxmax() +# # max_unbound_idx = df[df['bound'] == 0].groupby('transcript_id')['score'].idxmax() + +# # # Combine indices, prioritizing bound=1 scores +# # final_idx = max_bound_idx.combine_first(max_unbound_idx) +# # result = df.loc[final_idx].reset_index(drop=True) + +# # # Handle cases where there's only one entry per 'segger_cell_id' +# # single_entry_mask = result.groupby('segger_cell_id').size() == 1 +# # result.loc[single_entry_mask, 'segger_cell_id'] = 'floating' + +# # return result + +# # # Map the logic over each partition using Dask +# # final_assignments = assignments_dd.map_partitions(select_max_score_partition, meta=meta) + +# # # Trigger garbage collection and free GPU memory +# # # rmm.reinitialize(pool_allocator=True) +# # torch.cuda.empty_cache() +# # gc.collect() + +# # return final_assignments + + +# def segment( +# model: LitSegger, +# dm: SeggerDataModule, +# save_dir: Union[str, Path], +# seg_tag: str, +# transcript_file: Union[str, Path], +# score_cut: float = .25, +# use_cc: bool = True, +# file_format: str = 'anndata', +# receptive_field: dict = {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}, +# knn_method: str = 'kd_tree', +# verbose: bool = False, +# **anndata_kwargs +# ) -> None: +# """ +# Perform segmentation using the model, merge segmentation results with transcripts_df, +# and save in the specified format. Memory is managed efficiently using Dask and GPU +# memory optimizations. + +# Args: +# model (LitSegger): The trained segmentation model. +# dm (SeggerDataModule): The SeggerDataModule instance for data loading. +# save_dir (Union[str, Path]): Directory to save the final segmentation results. +# seg_tag (str): Tag to include in the saved filename. +# transcript_file (Union[str, Path]): Path to the transcripts parquet file. +# score_cut (float, optional): The threshold for assigning transcripts to cells based on similarity scores. Defaults to 0.25. +# use_cc (bool, optional): If True, re-group transcripts that have not been assigned to any nucleus. Defaults to True. +# file_format (str, optional): File format to save the results ('csv', 'parquet', or 'anndata'). Defaults to 'anndata'. +# receptive_field (dict, optional): Defines the receptive field for transcript-cell and transcript-transcript relations. +# knn_method (str, optional): The method to use for nearest neighbors ('kd_tree' by default). +# **anndata_kwargs: Additional keyword arguments passed to the create_anndata function. 
+ +# Returns: +# None +# """ +# start_time = time.time() +# # rmm.reinitialize(pool_allocator=True, initial_pool_size=2**26, maximum_pool_size=2**30) +# # cp.cuda.set_allocator(rmm_cupy_allocator) + +# # Ensure the save directory exists +# save_dir = Path(save_dir) +# save_dir.mkdir(parents=True, exist_ok=True) + +# if verbose: +# print(f"Starting segmentation for {seg_tag}...") + +# # Step 1: Prediction +# step_start_time = time.time() + +# train_dataloader = dm.train_dataloader() +# test_dataloader = dm.test_dataloader() +# val_dataloader = dm.val_dataloader() + +# # delayed_train = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) +# # delayed_val = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) +# delayed_test = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) + +# delayed_test = delayed_test.compute() +# # Compute all predictions at once using Dask +# # with ProgressBar(): +# # segmentation_train, segmentation_val, segmentation_test = dask.compute(delayed_train, delayed_val, delayed_test) + +# if verbose: +# elapsed_time = format_time(time.time() - step_start_time) +# print(f"Predictions completed in {elapsed_time}.") + +# # Step 2: Combine and group by transcript_id +# step_start_time = time.time() + +# # Combine the segmentation data +# seg_combined = dd.concat([segmentation_train, segmentation_val, segmentation_test]) + +# # No need to handle max score logic here, as it's done inside the `predict` function +# seg_final = seg_combined.compute() + +# # Drop any unassigned rows +# seg_final = seg_final.dropna(subset=['segger_cell_id']).reset_index(drop=True) + +# if verbose: +# elapsed_time = format_time(time.time() - step_start_time) +# print(f"Segmentation results processed in {elapsed_time}.") + +# # Step 3: Load transcripts and merge +# step_start_time = time.time() + +# transcripts_df = dd.read_parquet(transcript_file) + +# if verbose: +# print("Merging segmentation results with transcripts...") + +# # Merge the segmentation results with the transcript data +# seg_final_dd = dd.from_pandas(seg_final, npartitions=transcripts_df.npartitions) +# transcripts_df_filtered = transcripts_df.merge(seg_final_dd, on='transcript_id', how='inner').compute() + +# if verbose: +# elapsed_time = format_time(time.time() - step_start_time) +# print(f"Transcripts merged in {elapsed_time}.") + +# # Step 4: Save the merged result +# step_start_time = time.time() + +# if verbose: +# print(f"Saving results in {file_format} format...") + +# if file_format == 'csv': +# save_path = save_dir / f'{seg_tag}_segmentation.csv' +# transcripts_df_filtered.to_csv(save_path, index=False) +# elif file_format == 'parquet': +# save_path = save_dir / f'{seg_tag}_segmentation.parquet' +# transcripts_df_filtered.to_parquet(save_path, index=False) +# elif file_format == 'anndata': +# save_path = save_dir / f'{seg_tag}_segmentation.h5ad' +# segger_adata = create_anndata(transcripts_df_filtered, **anndata_kwargs) +# segger_adata.write(save_path) +# else: +# raise ValueError(f"Unsupported file format: {file_format}") + +# if verbose: +# elapsed_time = format_time(time.time() - step_start_time) +# print(f"Results saved in {elapsed_time} at {save_path}.") + +# # Total time +# if verbose: +# total_time = format_time(time.time() - start_time) +# print(f"Total segmentation process completed in 
{total_time}.") + +# # Step 5: Garbage collection and memory cleanup +# # rmm.reinitialize(pool_allocator=True) +# # torch.cuda.empty_cache() +# gc.collect() diff --git a/src/segger/validation/utils.py b/src/segger/validation/utils.py index 0e1551b..b283b00 100644 --- a/src/segger/validation/utils.py +++ b/src/segger/validation/utils.py @@ -526,9 +526,10 @@ def plot_metric_comparison( metric: str, label: str, method1: str, - method2: str + method2: str, + output_path: Path ) -> None: - """Plot a comparison of a specific metric between two methods. + """Plot a comparison of a specific metric between two methods and save the comparison data. Args: - ax: plt.Axes @@ -543,14 +544,21 @@ def plot_metric_comparison( The first method to compare. - method2: str The second method to compare. + - output_path: Path + Path to save the merged DataFrame as a CSV. """ subset1 = data[data['method'] == method1] subset2 = data[data['method'] == method2] merged_data = pd.merge(subset1, subset2, on='celltype_major', suffixes=(f'_{method1}', f'_{method2}')) + + # Save the merged data used in the plot to CSV + merged_data.to_csv(output_path / f'metric_comparison_{metric}_{method1}_vs_{method2}.csv', index=False) + for cell_type in merged_data['celltype_major'].unique(): cell_data = merged_data[merged_data['celltype_major'] == cell_type] ax.scatter(cell_data[f'{metric}_{method1}'], cell_data[f'{metric}_{method2}'], label=cell_type) + max_value = max(merged_data[f'{metric}_{method1}'].max(), merged_data[f'{metric}_{method2}'].max()) ax.plot([0, max_value], [0, max_value], 'k--', alpha=0.5) ax.set_xlabel(f'{label} ({method1})') @@ -559,6 +567,7 @@ def plot_metric_comparison( + def load_segmentations(segmentation_paths: Dict[str, Path]) -> Dict[str, sc.AnnData]: """Load segmentation data from provided paths and handle special cases like separating 'segger' into 'segger_n0' and 'segger_n1'. @@ -582,8 +591,8 @@ def load_segmentations(segmentation_paths: Dict[str, Path]) -> Dict[str, sc.AnnD -def plot_cell_counts(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str] ) -> None: - """Plot the number of cells per segmentation method. +def plot_cell_counts(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str]) -> None: + """Plot the number of cells per segmentation method and save the cell count data as a CSV. Args: segmentations_dict (Dict[str, sc.AnnData]): Dictionary mapping segmentation method names to loaded AnnData objects. @@ -595,6 +604,9 @@ def plot_cell_counts(segmentations_dict: Dict[str, sc.AnnData], output_path: Pat # Create a DataFrame for the bar plot df = pd.DataFrame(cell_counts, index=['Number of Cells']).T + # Save the DataFrame to CSV + df.to_csv(output_path / 'cell_counts_data.csv', index=True) + # Generate the bar plot ax = df.plot(kind='bar', stacked=False, color=[palette.get(key, '#333333') for key in df.index], figsize=(3, 6), width=0.9) @@ -613,6 +625,7 @@ def plot_cell_counts(segmentations_dict: Dict[str, sc.AnnData], output_path: Pat plt.savefig(output_path / 'cell_counts_bar_plot.pdf', bbox_inches='tight') plt.show() + def plot_percent_assigned(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str]) -> None: """Plot the percentage of assigned transcripts (normalized) for each segmentation method. 
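The recurring change across these validation helpers is that each plotting function now writes the exact data behind its figure to a CSV under `output_path` before drawing. A minimal sketch of that pattern, assuming a tidy DataFrame with the columns shown (the function, file, and column names here are illustrative, not part of the segger API):

```python
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


def plot_metric_with_export(violin_data: pd.DataFrame, output_path: Path) -> None:
    # Persist the plotted values so the figure can be regenerated or re-analyzed later.
    output_path.mkdir(parents=True, exist_ok=True)
    violin_data.to_csv(output_path / 'example_metric_data.csv', index=True)

    # Draw and save the figure from the same DataFrame that was just exported.
    plt.figure(figsize=(4, 6))
    sns.violinplot(x='Segmentation Method', y='Example Metric', data=violin_data)
    plt.savefig(output_path / 'example_metric_plot.pdf', bbox_inches='tight')
    plt.close()
```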
@@ -637,6 +650,8 @@ def plot_percent_assigned(segmentations_dict: Dict[str, sc.AnnData], output_path 'Segmentation Method': [], 'Percent Assigned (Normalized)': [] }) + + # Add normalized percent_assigned data for each method for method in segmentations_dict.keys(): @@ -646,6 +661,8 @@ def plot_percent_assigned(segmentations_dict: Dict[str, sc.AnnData], output_path 'Percent Assigned (Normalized)': method_data.values }) violin_data = pd.concat([violin_data, method_df], axis=0) + + violin_data.to_csv(output_path / 'percent_assigned_normalized.csv', index=True) # Plot the violin plots plt.figure(figsize=(12, 8)) @@ -698,6 +715,8 @@ def plot_gene_counts(segmentations_dict: Dict[str, sc.AnnData], output_path: Pat 'Normalized Counts': method_counts.values }) boxplot_data = pd.concat([boxplot_data, method_df], axis=0) + + boxplot_data.to_csv(output_path / 'gene_counts_normalized_data.csv', index=True) # Plot the box plots plt.figure(figsize=(3, 6)) @@ -737,6 +756,8 @@ def plot_counts_per_cell(segmentations_dict: Dict[str, sc.AnnData], output_path: 'Counts per Cell (log2)': method_counts.values }) violin_data = pd.concat([violin_data, method_df], axis=0) + + violin_data.to_csv(output_path / 'counts_per_cell_data.csv', index=True) # Plot the violin plots plt.figure(figsize=(4, 6)) ax = sns.violinplot(x='Segmentation Method', y='Counts per Cell (log2)', data=violin_data, palette=palette) @@ -774,6 +795,7 @@ def plot_cell_area(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, 'Cell Area (log2)': method_area.values }) violin_data = pd.concat([violin_data, method_df], axis=0) + violin_data.to_csv(output_path / 'cell_area_log2_data.csv', index=True) # Plot the violin plots plt.figure(figsize=(4, 6)) ax = sns.violinplot(x='Segmentation Method', y='Cell Area (log2)', data=violin_data, palette=palette) @@ -813,6 +835,8 @@ def plot_transcript_density(segmentations_dict: Dict[str, sc.AnnData], output_pa 'Transcript Density (log2)': method_density_log2.values }) violin_data = pd.concat([violin_data, method_df], axis=0) + + violin_data.to_csv(output_path / 'transcript_density_log2_data.csv', index=True) # Plot the violin plots plt.figure(figsize=(4, 6)) @@ -882,6 +906,7 @@ def plot_mecr_results(mecr_results: Dict[str, Dict[Tuple[str, str], float]], out 'MECR': mecr_value }) df = pd.DataFrame(plot_data) + df.to_csv(output_path / 'mcer_box.csv', index=True) plt.figure(figsize=(3, 6)) sns.boxplot(x='Segmentation Method', y='MECR', data=df, palette=palette) plt.title('Mutually Exclusive Co-expression Rate (MECR)') @@ -898,10 +923,11 @@ def plot_quantized_mecr_counts(quantized_mecr_counts: Dict[str, pd.DataFrame], o """Plot the quantized MECR values against transcript counts for each segmentation method, with point size proportional to the variance of MECR. Args: - quantized_mecr_counts (Dict[str, pd.DataFrame]): Dictionary of quantized MECR count data for each segmentation method. - output_path (Path): Path to the directory where the plot will be saved. - palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. + quantized_mecr_counts (Dict[str, pd.DataFrame]): Dictionary of quantized MECR count data for each segmentation method. + output_path (Path): Path to the directory where the plot will be saved. + palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. 
""" + quantized_mecr_counts.to_csv(output_path / 'quantized_mecr_counts.csv', index=True) plt.figure(figsize=(9, 6)) for method, df in quantized_mecr_counts.items(): plt.plot( @@ -940,6 +966,7 @@ def plot_quantized_mecr_area(quantized_mecr_area: Dict[str, pd.DataFrame], outpu output_path (Path): Path to the directory where the plot will be saved. palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. """ + quantized_mecr_area.to_csv(output_path / 'quantized_mecr_area.csv', index=True) plt.figure(figsize=(6, 4)) for method, df in quantized_mecr_area.items(): plt.plot( @@ -976,10 +1003,11 @@ def plot_contamination_results(contamination_results: Dict[str, pd.DataFrame], o """Plot contamination results for each segmentation method. Args: - contamination_results (Dict[str, pd.DataFrame]): Dictionary of contamination data for each segmentation method. - output_path (Path): Path to the directory where the plot will be saved. - palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. + contamination_results (Dict[str, pd.DataFrame]): Dictionary of contamination data for each segmentation method. + output_path (Path): Path to the directory where the plot will be saved. + palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. """ + contamination_results.to_csv(output_path / 'contamination_results.csv', index=True) for method, df in contamination_results.items(): plt.figure(figsize=(10, 6)) sns.heatmap(df, annot=True, cmap='coolwarm', linewidths=0.5) @@ -999,6 +1027,7 @@ def plot_contamination_boxplots(boxplot_data: pd.DataFrame, output_path: Path, p output_path (Path): Path to the directory where the plot will be saved. palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. """ + boxplot_data.to_csv(output_path / 'contamination_box_results.csv', index=True) plt.figure(figsize=(14, 8)) sns.boxplot( x='Source Cell Type', @@ -1082,10 +1111,11 @@ def plot_entropy_boxplots(entropy_boxplot_data: pd.DataFrame, output_path: Path, def plot_sensitivity_boxplots(sensitivity_boxplot_data: pd.DataFrame, output_path: Path, palette: Dict[str, str]) -> None: """Plot boxplots for sensitivity across different segmentation methods by cell type. Args: - sensitivity_boxplot_data (pd.DataFrame): DataFrame containing sensitivity data for all segmentation methods. - output_path (Path): Path to the directory where the plot will be saved. - palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. + sensitivity_boxplot_data (pd.DataFrame): DataFrame containing sensitivity data for all segmentation methods. + output_path (Path): Path to the directory where the plot will be saved. + palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. 
""" + sensitivity_boxplot_data.to_csv(output_path / 'sensitivity_results.csv', index=True) plt.figure(figsize=(14, 8)) sns.boxplot( x='Cell Type', From 1339480b457c480303a32ea3f21bb44650d6e1d6 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 7 Oct 2024 20:16:52 +0200 Subject: [PATCH 066/156] fixes #17, some changes that I hope didnt mess up --- requirements.txt | 22 ------------ setup.py | 90 ------------------------------------------------ 2 files changed, 112 deletions(-) delete mode 100644 requirements.txt delete mode 100644 setup.py diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 13b2436..0000000 --- a/requirements.txt +++ /dev/null @@ -1,22 +0,0 @@ -numpy>=1.21.0 -pandas>=1.3.0 -scipy>=1.7.0 -matplotlib>=3.4.0 -seaborn>=0.11.0 -tqdm>=4.61.0 -torch>=2.0.0 -torchvision>=0.10.0 -pytorch-lightning>=1.3.0 -torchmetrics>=0.5.0 -# scanpy>=1.8.0 -squidpy==1.2.0 -adjustText>=0.8 -scikit-learn>=0.24.0 -geopandas>=0.9.0 -shapely>=1.7.0 -scanpy>=1.9.3 -torch-geometric>=2.2.0 -# pyg_lib>=0.0.0 -torch_scatter>=2.1.2 -torch_sparse>=0.6.18 -torch_cluster>=1.6.3 \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 964fe9b..0000000 --- a/setup.py +++ /dev/null @@ -1,90 +0,0 @@ -from setuptools import setup, find_packages - -setup( - name="segger", - version="0.1.0", - description="Fast and accurate cell segmentation for single-molecule spatial omics", - author="Elyas Heidari", - author_email="elyas.heidari@dkfz-heidelberg.de", - license="MIT", - long_description=open("README.md").read(), - long_description_content_type="text/markdown", - python_requires=">=3.10", - keywords=["segmentation", "deep learning", "pytorch", "geometric deep learning"], - install_requires=[ - "torch>=2.0.0", - "numpy>=1.21.0", - "pandas>=1.3.0", - "scipy>=1.7.0", - "matplotlib>=3.4.0", - "seaborn>=0.11.0", - "tqdm>=4.61.0", - "torchvision>=0.10.0", - "lightning>=1.9.0", - "torchmetrics>=0.5.0", - "scanpy>=1.9.3", - "squidpy==1.2.0", - "adjustText>=0.8", - "scikit-learn>=0.24.0", - "geopandas>=0.9.0", - "shapely>=1.7.0", - "path>=17.0.0", - "pyarrow>=17.0.0", - "torch-geometric>=2.2.0", - "dask_geopandas>=0.4.0" - ], - extras_require={ - "gpu": [ - "cuml>=21.08", - "cudf>=21.08", - "cugraph>=21.08", - "cuspatial>=21.08", - "faiss-cpu>=1.7.0", - "faiss-gpu>=1.7.0" - ], - "torch-geometric": [ - "torch-scatter>=2.1.2", - "torch-sparse>=0.6.18", - "torch-cluster>=1.6.3", - ], - "multiprocessing": ["multiprocessing"], - "dev": [ - "pytest", - "black", - "flake8", - "pre-commit", - "twine>=4.0.2", - ], - "docs": [ - "docutils>=0.8,!=0.18.*,!=0.19.*", - "sphinx>=4.1", - "sphinx-book-theme>=1.0.0", - "myst-nb", - "myst-parser", - "sphinxcontrib-bibtex>=1.0.0", - "sphinx-autodoc-typehints", - "sphinx_rtd_theme", - "sphinxext-opengraph", - "sphinx-copybutton", - "sphinx-design", - "sphinx-hoverxref", - "ipykernel", - "ipython", - "pandas", - ], - "tests": [ - "pytest", - "coverage", - ], - }, - url="https://github.com/EliHei2/segger_dev", - project_urls={ - "Bug Tracker": "https://github.com/EliHei2/segger_dev/issues", - "Documentation": "https://EliHei2.github.io/segger_dev", - "Source Code": "https://github.com/EliHei2/segger_dev", - "Homepage": "https://EliHei2.github.io/segger_dev", - "Repository": "https://github.com/EliHei2/segger_dev", - }, - packages=find_packages(where="src"), - package_dir={"": "src"}, -) From e1a7b0516005cac4a240a300ed6674348b960645 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 7 Oct 2024 23:21:53 +0200 Subject: [PATCH 067/156] updated 
CLI and fixed #19 --- scripts/predict_model_sample.py | 4 +- src/segger/cli/predict.py | 113 +++++++++++++++++-------------- src/segger/cli/train_model.py | 103 ++++++++++++++++------------ src/segger/prediction/predict.py | 2 +- 4 files changed, 125 insertions(+), 97 deletions(-) diff --git a/scripts/predict_model_sample.py b/scripts/predict_model_sample.py index f875c82..b0a45aa 100644 --- a/scripts/predict_model_sample.py +++ b/scripts/predict_model_sample.py @@ -40,13 +40,13 @@ model, dm, save_dir=benchmarks_dir, - seg_tag='segger_embedding_1001_cc_true', + seg_tag='segger_embedding_1001_0.5', transcript_file=transcripts_file, file_format='anndata', receptive_field = receptive_field, min_transcripts=5, # max_transcripts=1500, cell_id_col='segger_cell_id', - use_cc=True, + use_cc=False, knn_method='cuda' ) \ No newline at end of file diff --git a/src/segger/cli/predict.py b/src/segger/cli/predict.py index 70790d8..eca5a4b 100644 --- a/src/segger/cli/predict.py +++ b/src/segger/cli/predict.py @@ -1,60 +1,73 @@ import click -import typing -import os -from segger.cli.utils import add_options, CustomFormatter +from segger.training.segger_data_module import SeggerDataModule +from segger.prediction.predict import segment, load_model from pathlib import Path import logging +import os +os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' -predict_yml = Path(__file__).parent / 'configs' / 'predict' / 'default.yaml' - - -@click.command(name="predict", help="Predict using the Segger model") -#@click.option('--foo', default="bar") # add more options above, not below -@add_options(config_path=predict_yml) -def predict(args): - - # Setup - ch = logging.StreamHandler() - ch.setLevel(logging.INFO) - ch.setFormatter(CustomFormatter()) - logging.basicConfig(level=logging.INFO, handlers=[ch]) - - # Import packages - logging.info("Importing packages...") - from segger.data.utils import XeniumDataset - from torch_geometric.loader import DataLoader - from segger.prediction.predict import load_model, predict - logging.info("Done.") +@click.command(name="run_segmentation", help="Run the Segger segmentation model.") +@click.option('--segger_data_dir', type=Path, required=True, help='Directory containing the processed Segger dataset.') +@click.option('--models_dir', type=Path, required=True, help='Directory containing the trained models.') +@click.option('--benchmarks_dir', type=Path, required=True, help='Directory to save the segmentation results.') +@click.option('--transcripts_file', type=str, required=True, help='Path to the transcripts file.') +@click.option('--batch_size', type=int, default=1, help='Batch size for processing.') +@click.option('--num_workers', type=int, default=1, help='Number of workers for data loading.') +@click.option('--model_version', type=int, default=0, help='Model version to load.') +@click.option('--save_tag', type=str, default='segger_embedding_1001_0.5', help='Tag for saving segmentation results.') +@click.option('--min_transcripts', type=int, default=5, help='Minimum number of transcripts for segmentation.') +@click.option('--cell_id_col', type=str, default='segger_cell_id', help='Column name for cell IDs.') +@click.option('--use_cc', is_flag=True, default=False, help='Use connected components if specified.') +@click.option('--knn_method', type=str, default='cuda', help='Method for KNN computation.') +@click.option('--file_format', type=str, default='anndata', help='File format for output data.') +@click.option('--k_bd', type=int, default=4, help='K value for 
boundary computation.') +@click.option('--dist_bd', type=int, default=12, help='Distance for boundary computation.') +@click.option('--k_tx', type=int, default=5, help='K value for transcript computation.') +@click.option('--dist_tx', type=int, default=5, help='Distance for transcript computation.') +def run_segmentation(segger_data_dir: Path, models_dir: Path, benchmarks_dir: Path, + transcripts_file: str, batch_size: int = 1, num_workers: int = 1, + model_version: int = 0, save_tag: str = 'segger_embedding_1001_0.5', + min_transcripts: int = 5, cell_id_col: str = 'segger_cell_id', + use_cc: bool = False, knn_method: str = 'cuda', + file_format: str = 'anndata', k_bd: int = 4, dist_bd: int = 12, + k_tx: int = 5, dist_tx: int = 5): + + # Setup logging + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger(__name__) - # Load datasets and model - logging.info("Loading Xenium datasets and Segger model...") - dataset = XeniumDataset(args.dataset_path) - data_loader = DataLoader( - dataset, - batch_size=args.batch_size, - num_workers=args.workers, - pin_memory=True, - shuffle=False, + logger.info("Initializing Segger data module...") + # Initialize the Lightning data module + dm = SeggerDataModule( + data_dir=segger_data_dir, + batch_size=batch_size, + num_workers=num_workers, ) - if len(data_loader) == 0: - msg = f"Nothing to predict: No data found at '{args.dataset_path}'." - logging.warning(msg) - return - lit_segger = load_model(args.checkpoint_path) - logging.info("Done.") + + dm.setup() + + logger.info("Loading the model...") + # Load in the latest checkpoint + model_path = models_dir / 'lightning_logs' / f'version_{model_version}' + model = load_model(model_path / 'checkpoints') - # Make prediction on dataset - logging.info("Making predictions on data") - predictions = predict( - lit_segger=lit_segger, - data_loader=data_loader, - score_cut=args.score_cut, - use_cc=args.use_cc, + logger.info("Running segmentation...") + segment( + model, + dm, + save_dir=benchmarks_dir, + seg_tag=save_tag, + transcript_file=transcripts_file, + file_format=file_format, + receptive_field={'k_bd': k_bd, 'dist_bd': dist_bd, 'k_tx': k_tx, 'dist_tx': dist_tx}, + min_transcripts=min_transcripts, + cell_id_col=cell_id_col, + use_cc=use_cc, + knn_method=knn_method, ) - logging.info("Done.") + + logger.info("Segmentation completed.") - # Write predictions to file - logging.info("Saving predictions to file") - predictions.to_csv(args.output_path, index=False) - logging.info("Done.") +if __name__ == '__main__': + run_segmentation() diff --git a/src/segger/cli/train_model.py b/src/segger/cli/train_model.py index 80bf06e..d2edde5 100644 --- a/src/segger/cli/train_model.py +++ b/src/segger/cli/train_model.py @@ -5,10 +5,34 @@ from pathlib import Path import logging +help_msg = "Train the Segger segmentation model." 
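Functionally, the new `run_segmentation` command above is a thin wrapper around `load_model` and `segment`. A hedged sketch of the equivalent programmatic call, using the CLI defaults from this diff (all paths and the checkpoint version number are placeholders):

```python
from pathlib import Path

from segger.prediction.predict import load_model, segment
from segger.training.segger_data_module import SeggerDataModule

segger_data_dir = Path('path/to/segger_data')    # processed Segger dataset
models_dir = Path('path/to/models')              # Lightning logs live here
benchmarks_dir = Path('path/to/benchmarks')      # where segmentation results go
transcripts_file = 'path/to/transcripts.parquet'

dm = SeggerDataModule(data_dir=segger_data_dir, batch_size=1, num_workers=1)
dm.setup()

# Load the latest checkpoint for the chosen model version.
model = load_model(models_dir / 'lightning_logs' / 'version_0' / 'checkpoints')

segment(
    model,
    dm,
    save_dir=benchmarks_dir,
    seg_tag='segger_embedding_1001_0.5',
    transcript_file=transcripts_file,
    file_format='anndata',
    receptive_field={'k_bd': 4, 'dist_bd': 12, 'k_tx': 5, 'dist_tx': 5},
    min_transcripts=5,
    cell_id_col='segger_cell_id',
    use_cc=False,
    knn_method='cuda',
)
```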
-def train_model(args): - - # Setup +@click.command(name="train_model", help=help_msg) +@add_options(config_path=train_yml) +@click.option('--dataset_dir', type=Path, required=True, help='Directory containing the processed Segger dataset.') +@click.option('--models_dir', type=Path, required=True, help='Directory to save the trained model and the training logs.') +@click.option('--sample_tag', type=str, required=True, help='Sample tag for the dataset.') +@click.option('--init_emb', type=int, default=8, help='Size of the embedding layer.') +@click.option('--hidden_channels', type=int, default=32, help='Size of hidden channels in the model.') +@click.option('--num_tx_tokens', type=int, default=500, help='Number of transcript tokens.') +@click.option('--out_channels', type=int, default=8, help='Number of output channels.') +@click.option('--heads', type=int, default=2, help='Number of attention heads.') +@click.option('--num_mid_layers', type=int, default=2, help='Number of mid layers in the model.') +@click.option('--batch_size', type=int, default=4, help='Batch size for training.') +@click.option('--num_workers', type=int, default=2, help='Number of workers for data loading.') +@click.option('--accelerator', type=str, default='cuda', help='Device type to use for training (e.g., "cuda", "cpu").') # Ask for accelerator +@click.option('--max_epochs', type=int, default=200, help='Number of epochs for training.') +@click.option('--devices', type=int, default=4, help='Number of devices (GPUs) to use.') +@click.option('--strategy', type=str, default='auto', help='Training strategy for the trainer.') +@click.option('--precision', type=str, default='16-mixed', help='Precision for training.') +def train_model(dataset_dir: Path, models_dir: Path, sample_tag: str, + init_emb: int = 8, hidden_channels: int = 32, num_tx_tokens: int = 500, + out_channels: int = 8, heads: int = 2, num_mid_layers: int = 2, + batch_size: int = 4, num_workers: int = 2, + accelerator: str = 'cuda', max_epochs: int = 200, + devices: int = 4, strategy: str = 'auto', precision: str = '16-mixed'): + + # Setup logging ch = logging.StreamHandler() ch.setLevel(logging.INFO) ch.setFormatter(CustomFormatter()) @@ -16,77 +40,68 @@ def train_model(args): # Import packages logging.info("Importing packages...") - from segger.data.utils import XeniumDataset - from torch_geometric.loader import DataLoader - from torch_geometric.nn import to_hetero + from segger.data.io import XeniumSample from segger.training.train import LitSegger - from lightning import Trainer + from segger.training.segger_data_module import SeggerDataModule + from lightning.pytorch.loggers import CSVLogger + from pytorch_lightning import Trainer logging.info("Done.") # Load datasets logging.info("Loading Xenium datasets...") - trn_ds = XeniumDataset(root=Path(args.data_dir) / 'train_tiles') - val_ds = XeniumDataset(root=Path(args.data_dir) / 'val_tiles') - kwargs = dict( - num_workers=0, - pin_memory=True, - ) - trn_loader = DataLoader( - trn_ds, batch_size=args.batch_size_train, shuffle=True, **kwargs - ) - val_loader = DataLoader( - val_ds, batch_size=args.batch_size_val, shuffle=False, **kwargs + dm = SeggerDataModule( + data_dir=dataset_dir, + batch_size=batch_size, # Hard-coded batch size + num_workers=num_workers, # Hard-coded number of workers ) + + dm.setup() logging.info("Done.") # Initialize model logging.info("Initializing Segger model and trainer...") - metadata = ( - ["tx", "nc"], [("tx", "belongs", "nc"), ("tx", "neighbors", "tx")] - ) - lit_segger = 
LitSegger( - init_emb=args.init_emb, - hidden_channels=args.hidden_channels, - out_channels=args.out_channels, - heads=args.heads, - aggr=args.aggr, + metadata = (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]) + ls = LitSegger( + num_tx_tokens=num_tx_tokens, + init_emb=init_emb, + hidden_channels=hidden_channels, + out_channels=out_channels, # Hard-coded value + heads=heads, # Hard-coded value + num_mid_layers=num_mid_layers, # Hard-coded value + aggr='sum', # Hard-coded value metadata=metadata, ) - # Initialize lightning trainer + # Initialize the Lightning trainer trainer = Trainer( - accelerator=args.accelerator, - strategy=args.strategy, - precision=args.precision, - devices=args.devices, - max_epochs=args.epochs, - default_root_dir=args.model_dir, + accelerator=accelerator, # Directly use the specified accelerator + strategy=strategy, # Hard-coded value + precision=precision, # Hard-coded value + devices=devices, # Hard-coded value + max_epochs=max_epochs, # Hard-coded value + default_root_dir=models_dir, + logger=CSVLogger(models_dir), ) + logging.info("Done.") # Train model logging.info("Training model...") trainer.fit( - model=lit_segger, - train_dataloaders=trn_loader, - val_dataloaders=val_loader, + model=ls, + datamodule=dm ) - logging.info("Done...") - + logging.info("Done.") train_yml = Path(__file__).parent / 'configs' / 'train' / 'default.yaml' - @click.command(name="slurm", help="Train on Slurm cluster") -#@click.option('--foo', default="bar") # add more options above, not below @add_options(config_path=train_yml) def train_slurm(args): train_model(args) - @click.group(help="Train the Segger model") def train(): pass - -train.add_command(train_slurm) \ No newline at end of file +train.add_command(train_slurm) diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index 9e5423e..8165b88 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -341,7 +341,7 @@ def segment( save_dir: Union[str, Path], seg_tag: str, transcript_file: Union[str, Path], - score_cut: float = .25, + score_cut: float = .5, use_cc: bool = True, file_format: str = 'anndata', receptive_field: dict = {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}, From b2adc96adb68c82416b798f3bc4dd7991d6b2389 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 8 Oct 2024 09:51:08 +0200 Subject: [PATCH 068/156] fixed discrepancy between old and new data versions --- scripts/predict_model_sample.py | 4 ++-- src/segger/data/parquet/pyg_dataset.py | 3 ++- src/segger/prediction/predict.py | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/predict_model_sample.py b/scripts/predict_model_sample.py index b0a45aa..11c5e89 100644 --- a/scripts/predict_model_sample.py +++ b/scripts/predict_model_sample.py @@ -40,13 +40,13 @@ model, dm, save_dir=benchmarks_dir, - seg_tag='segger_embedding_1001_0.5', + seg_tag='segger_embedding_1001_0.5_cc', transcript_file=transcripts_file, file_format='anndata', receptive_field = receptive_field, min_transcripts=5, # max_transcripts=1500, cell_id_col='segger_cell_id', - use_cc=False, + use_cc=True, knn_method='cuda' ) \ No newline at end of file diff --git a/src/segger/data/parquet/pyg_dataset.py b/src/segger/data/parquet/pyg_dataset.py index 3d3e117..8642ae3 100644 --- a/src/segger/data/parquet/pyg_dataset.py +++ b/src/segger/data/parquet/pyg_dataset.py @@ -37,7 +37,8 @@ def processed_file_names(self) -> List[str]: Returns: List[str]: List of processed file names. 
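Likewise, the reworked training command reduces to building a `SeggerDataModule`, a `LitSegger`, and a Lightning `Trainer` with the defaults listed in this diff; a condensed, hedged equivalent in plain Python (directory paths are placeholders):

```python
from pathlib import Path

from lightning.pytorch.loggers import CSVLogger
from pytorch_lightning import Trainer

from segger.training.segger_data_module import SeggerDataModule
from segger.training.train import LitSegger

dataset_dir = Path('path/to/segger_data')
models_dir = Path('path/to/models')

dm = SeggerDataModule(data_dir=dataset_dir, batch_size=4, num_workers=2)
dm.setup()

# Heterogeneous graph metadata: transcript ("tx") and boundary ("bd") nodes.
metadata = (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")])
ls = LitSegger(
    num_tx_tokens=500,
    init_emb=8,
    hidden_channels=32,
    out_channels=8,
    heads=2,
    num_mid_layers=2,
    aggr='sum',
    metadata=metadata,
)

trainer = Trainer(
    accelerator='cuda',
    strategy='auto',
    precision='16-mixed',
    devices=4,
    max_epochs=200,
    default_root_dir=models_dir,
    logger=CSVLogger(models_dir),
)
trainer.fit(model=ls, datamodule=dm)
```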
""" - paths = glob.glob(f'{self.processed_dir}/tiles_x=*_y=*_w=*_h=*.pt') + paths = glob.glob(f'{self.processed_dir}/tiles_x*_y*_*_*.pt') + # paths = paths.append(paths = glob.glob(f'{self.processed_dir}/tiles_x*_y*_*_*.pt')) file_names = list(map(os.path.basename, paths)) return file_names diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index 8165b88..cf73116 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -157,7 +157,7 @@ def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: indices[1] = edge_index[edge_index != -1] rows = cp.fromDlpack(to_dlpack(indices[0,:].to('cuda'))) columns = cp.fromDlpack(to_dlpack(indices[1,:].to('cuda'))) - print(rows) + # print(rows) del indices values = similarity[edge_index != -1].flatten() sparse_result = coo_matrix((cp.fromDlpack(to_dlpack(values)), (rows, columns)), shape=shape) @@ -419,7 +419,7 @@ def segment( seg_combined = pd.concat([segmentation_train, segmentation_val, segmentation_test], ignore_index=True) # seg_combined = segmentation_test - print(seg_combined.columns) + # print(seg_combined.columns) # print(transcripts_df.id) # Drop any unassigned rows seg_final = seg_combined.dropna(subset=['segger_cell_id']).reset_index(drop=True) From 752d811f668b4befc3e7681ec8720dce2fe32634 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 8 Oct 2024 09:53:14 +0200 Subject: [PATCH 069/156] updated pyprojct for proper installation --- pyproject.toml | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3c538b3..afa95a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,20 +48,12 @@ faiss = [ rapids11 = [ "cupy-cuda11x", - "cudf-cu11==24.8.*", - "cugraph-cu11==24.8.*", - "cuvs-cu11==24.8.*", - "nvidia-nccl-cu11==2.18.*", - "cutensor-cu11" + "cuvs-cu11==24.4.*", ] rapids12 = [ "cupy-cuda12x", - "cudf-cu12==24.8.*", - "cugraph-cu12==24.8.*", - "cuvs-cu12==24.8.*", - "nvidia-nccl-cu12==2.18.*", - "cutensor-cu12" + "cuvs-cu12==24.4.*", ] multiprocessing = ["multiprocessing"] From 7a9c53c16f98e44ae90254b13a763978e813269b Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 8 Oct 2024 09:54:32 +0200 Subject: [PATCH 070/156] updated pyprojct for proper installation --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index afa95a7..cc504fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,12 +48,12 @@ faiss = [ rapids11 = [ "cupy-cuda11x", - "cuvs-cu11==24.4.*", + "cuvs-cu11>=24.4.*", ] rapids12 = [ "cupy-cuda12x", - "cuvs-cu12==24.4.*", + "cuvs-cu12>=24.4.*", ] multiprocessing = ["multiprocessing"] From e2cce4422323395edb7ee474e79ea7ba6b9bb0cd Mon Sep 17 00:00:00 2001 From: "daniel.unyi.42" Date: Tue, 8 Oct 2024 12:42:46 +0000 Subject: [PATCH 071/156] Don't divide by tile_size if it's None in STSampleParquet pipeline --- src/segger/data/parquet/sample.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index 9407ced..11cace5 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -389,10 +389,11 @@ def save( if frac > 1: msg = f"Arg 'frac' should be <= 1.0, but got {frac}." raise ValueError(msg) - n_tiles = self.n_transcripts / tile_size / self.n_workers * frac - if int(n_tiles) == 0: - msg = f"Sampling parameters would yield 0 total tiles." 
- raise ValueError(msg) + if tile_size is not None: + n_tiles = self.n_transcripts / tile_size / self.n_workers * frac + if int(n_tiles) == 0: + msg = f"Sampling parameters would yield 0 total tiles." + raise ValueError(msg) # Propagate errors to logging except Exception as e: self.logger.error(str(e), exc_info=True) From b1dc03fe226b46b2b1a34ab4a714270c1979d82e Mon Sep 17 00:00:00 2001 From: "daniel.unyi.42" Date: Tue, 8 Oct 2024 15:34:31 +0000 Subject: [PATCH 072/156] Don't save a tile if there are no transcript-nucleus edges or transcript-transcript edges in it. --- src/segger/data/parquet/sample.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index 11cace5..1223bef 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -423,8 +423,9 @@ def func(region): dist_tx=dist_tx, neg_sampling_ratio=neg_sampling_ratio, ) - filepath = data_dir / data_type / 'processed' / f'{xt.uid}.pt' - torch.save(pyg_data, filepath) + if pyg_data is not None: + filepath = data_dir / data_type / 'processed' / f'{xt.uid}.pt' + torch.save(pyg_data, filepath) # TODO: Add Dask backend regions = self._get_balanced_regions() @@ -1182,6 +1183,9 @@ def to_pyg_dataset( k=k_bd, max_distance=dist, ) + if nbrs_edge_idx.size(1) == 0: + logging.warning('No boundary-transcript neighbors found in tile "%s."', self.uid) + return None pyg_data["tx", "neighbors", "bd"].edge_index = nbrs_edge_idx # Set up Transcript-Transcript neighbor edges @@ -1191,6 +1195,9 @@ def to_pyg_dataset( k=k_tx, max_distance=dist_tx, ) + if nbrs_edge_idx.size(1) == 0: + logging.warning('No transcript-transcript neighbors found in tile "%s."', self.uid) + return None pyg_data["tx", "neighbors", "tx"].edge_index = nbrs_edge_idx # Find nuclear transcripts From 247de832bbd69e215ae2891bfed5568f63b51f1e Mon Sep 17 00:00:00 2001 From: "daniel.unyi.42" Date: Wed, 9 Oct 2024 06:47:55 +0000 Subject: [PATCH 073/156] If there are no tx-bd edges in a tile, it's only used for prediction --- src/segger/data/parquet/sample.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index 1223bef..bd0a6a2 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -423,9 +423,10 @@ def func(region): dist_tx=dist_tx, neg_sampling_ratio=neg_sampling_ratio, ) - if pyg_data is not None: - filepath = data_dir / data_type / 'processed' / f'{xt.uid}.pt' - torch.save(pyg_data, filepath) + if pyg_data.test_flag: + data_type = 'test_tiles' + filepath = data_dir / data_type / 'processed' / f'{xt.uid}.pt' + torch.save(pyg_data, filepath) # TODO: Add Dask backend regions = self._get_balanced_regions() @@ -1183,9 +1184,19 @@ def to_pyg_dataset( k=k_bd, max_distance=dist, ) - if nbrs_edge_idx.size(1) == 0: - logging.warning('No boundary-transcript neighbors found in tile "%s."', self.uid) - return None + + num_edges = nbrs_edge_idx.shape[1] + # If there are no tx-bd edges, we set a flag to indicate that this data can only be used for prediction + pyg_data.test_flag = num_edges == 0 + if pyg_data.test_flag: + return pyg_data + + # num_possible_edges = pyg_data['tx'].id.shape[0] * pyg_data['bd'].id.shape[0] + # if num_possible_edges <= num_edges * neg_sampling_ratio: + # logging.warning( + # 'Not enough negative edges to sample in tile "%s".', self.uid + # ) + pyg_data["tx", "neighbors", "bd"].edge_index = 
nbrs_edge_idx # Set up Transcript-Transcript neighbor edges @@ -1195,9 +1206,7 @@ def to_pyg_dataset( k=k_tx, max_distance=dist_tx, ) - if nbrs_edge_idx.size(1) == 0: - logging.warning('No transcript-transcript neighbors found in tile "%s."', self.uid) - return None + pyg_data["tx", "neighbors", "tx"].edge_index = nbrs_edge_idx # Find nuclear transcripts From c841be5284819672587d7e2aa563386601b0d79c Mon Sep 17 00:00:00 2001 From: "daniel.unyi.42" Date: Wed, 9 Oct 2024 08:55:10 +0000 Subject: [PATCH 074/156] Further fixes in saving pyg data --- src/segger/data/parquet/sample.py | 85 +++++++++++++++++-------------- 1 file changed, 46 insertions(+), 39 deletions(-) diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index bd0a6a2..5e21366 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -423,10 +423,12 @@ def func(region): dist_tx=dist_tx, neg_sampling_ratio=neg_sampling_ratio, ) - if pyg_data.test_flag: - data_type = 'test_tiles' - filepath = data_dir / data_type / 'processed' / f'{xt.uid}.pt' - torch.save(pyg_data, filepath) + if pyg_data is not None: + if pyg_data["tx", "belongs", "bd"].edge_index.numel() == 0: + # this tile is only for testing + data_type = 'test_tiles' + filepath = data_dir / data_type / 'processed' / f'{xt.uid}.pt' + torch.save(pyg_data, filepath) # TODO: Add Dask backend regions = self._get_balanced_regions() @@ -1151,6 +1153,32 @@ def to_pyg_dataset( # Initialize an empty HeteroData object pyg_data = HeteroData() + # Set up Transcript nodes + pyg_data['tx'].id = torch.tensor( + self.transcripts[self.settings.transcripts.id].values.astype(int), + dtype=torch.int, + ) + pyg_data['tx'].pos = torch.tensor( + self.transcripts[self.settings.transcripts.xyz].values, + dtype=torch.float32, + ) + pyg_data['tx'].x = self.get_transcript_props() + + # Set up Transcript-Transcript neighbor edges + nbrs_edge_idx = self.get_kdtree_edge_index( + self.transcripts[self.settings.transcripts.xy], + self.transcripts[self.settings.transcripts.xy], + k=k_tx, + max_distance=dist_tx, + ) + + # If there are no tx-neighbors-tx edges, skip saving tile + if nbrs_edge_idx.shape[1] == 0: + logging.warning(f"No tx-neighbors-tx edges found in tile {self.uid}.") + return None + + pyg_data["tx", "neighbors", "tx"].edge_index = nbrs_edge_idx + # Set up Boundary nodes polygons = utils.get_polygons_from_xy( self.boundaries, @@ -1165,17 +1193,6 @@ def to_pyg_dataset( area, convexity, elongation, circularity ) - # Set up Transcript nodes - pyg_data['tx'].id = torch.tensor( - self.transcripts[self.settings.transcripts.id].values.astype(int), - dtype=torch.int, - ) - pyg_data['tx'].pos = torch.tensor( - self.transcripts[self.settings.transcripts.xyz].values, - dtype=torch.float32, - ) - pyg_data['tx'].x = self.get_transcript_props() - # Set up Boundary-Transcript neighbor edges dist = np.sqrt(polygons.area.max()) * 10 # heuristic distance nbrs_edge_idx = self.get_kdtree_edge_index( @@ -1184,30 +1201,16 @@ def to_pyg_dataset( k=k_bd, max_distance=dist, ) + pyg_data["tx", "neighbors", "bd"].edge_index = nbrs_edge_idx - num_edges = nbrs_edge_idx.shape[1] - # If there are no tx-bd edges, we set a flag to indicate that this data can only be used for prediction - pyg_data.test_flag = num_edges == 0 - if pyg_data.test_flag: + # If there are no tx-neighbors-bd edges, we put the tile automatically in test set + if nbrs_edge_idx.numel() == 0: + logging.warning(f"No tx-neighbors-bd edges found in tile {self.uid}.") + pyg_data["tx", "belongs", 
"bd"].edge_index = torch.tensor([], dtype=torch.long) return pyg_data - # num_possible_edges = pyg_data['tx'].id.shape[0] * pyg_data['bd'].id.shape[0] - # if num_possible_edges <= num_edges * neg_sampling_ratio: - # logging.warning( - # 'Not enough negative edges to sample in tile "%s".', self.uid - # ) - - pyg_data["tx", "neighbors", "bd"].edge_index = nbrs_edge_idx - - # Set up Transcript-Transcript neighbor edges - nbrs_edge_idx = self.get_kdtree_edge_index( - self.transcripts[self.settings.transcripts.xy], - self.transcripts[self.settings.transcripts.xy], - k=k_tx, - max_distance=dist_tx, - ) - - pyg_data["tx", "neighbors", "tx"].edge_index = nbrs_edge_idx + # Now we identify and split the tx-belongs-bd edges + edge_type = ('tx', 'belongs', 'bd') # Find nuclear transcripts tx_cell_ids = self.transcripts[self.settings.boundaries.id] @@ -1221,11 +1224,15 @@ def to_pyg_dataset( row_idx = np.where(is_nuclear)[0] col_idx = tx_cell_ids.iloc[row_idx].map(cell_ids_map) blng_edge_idx = torch.tensor(np.stack([row_idx, col_idx])).long() - pyg_data["tx", "belongs", "bd"].edge_index = blng_edge_idx + pyg_data[edge_type].edge_index = blng_edge_idx - # Add negative edges for training + # If there are no tx-belongs-bd edges, flag tile as test only (cannot be used for training) + if blng_edge_idx.numel() == 0: + logging.warning(f"No tx-belongs-bd edges found in tile {self.uid}.") + return pyg_data + + # If there are tx-bd edges, add negative edges for training # Need more time-efficient solution than this - edge_type = ('tx', 'belongs', 'bd') transform = RandomLinkSplit( num_val=0, num_test=0, From 6b4725386c7f36806420d99d53375433a698fa74 Mon Sep 17 00:00:00 2001 From: "daniel.unyi.42" Date: Wed, 9 Oct 2024 11:28:23 +0000 Subject: [PATCH 075/156] CLI for fast dataset creation --- .../configs/create_dataset/default_fast.yaml | 60 ++++++++++++++++ src/segger/cli/create_dataset_fast.py | 71 +++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 src/segger/cli/configs/create_dataset/default_fast.yaml create mode 100644 src/segger/cli/create_dataset_fast.py diff --git a/src/segger/cli/configs/create_dataset/default_fast.yaml b/src/segger/cli/configs/create_dataset/default_fast.yaml new file mode 100644 index 0000000..72f6294 --- /dev/null +++ b/src/segger/cli/configs/create_dataset/default_fast.yaml @@ -0,0 +1,60 @@ +base_dir: + type: Path + required: true + help: Directory containing the raw dataset. +data_dir: + type: Path + required: true + help: Directory to save the processed Segger dataset. +sample_type: + type: str + default: null + help: The sample type of the raw data, e.g., "xenium" or "merscope". +k_bd: + type: int + default: 3 + help: Number of nearest neighbors for boundary nodes. +dist_bd: + type: float + default: 15. + help: Maximum distance for boundary neighbors. +k_tx: + type: int + default: 3 + help: Number of nearest neighbors for transcript nodes. +dist_tx: + type: float + default: 5. + help: Maximum distance for transcript neighbors. +tile_size: + type: int + default: null + help: If provided, specifies the size of the tile. Overrides `tile_width` and `tile_height`. +tile_width: + type: int + default: null + help: Width of the tiles in pixels. Ignored if `tile_size` is provided. +tile_height: + type: int + default: null + help: Height of the tiles in pixels. Ignored if `tile_size` is provided. +neg_sampling_ratio: + type: float + default: 5. + help: Ratio of negative samples. +frac: + type: float + default: 1. + help: Fraction of the dataset to process. 
+val_prob: + type: float + default: 0.1 + help: Proportion of data for use for validation split. +test_prob: + type: float + default: 0.2 + help: Proportion of data for use for test split. +n_workers: + type: int + default: 1 + help: Number of workers for parallel processing. diff --git a/src/segger/cli/create_dataset_fast.py b/src/segger/cli/create_dataset_fast.py new file mode 100644 index 0000000..8e6e9ee --- /dev/null +++ b/src/segger/cli/create_dataset_fast.py @@ -0,0 +1,71 @@ +import click +import os +from segger.cli.utils import add_options, CustomFormatter +from pathlib import Path +import logging +from argparse import Namespace +from segger.data.parquet.sample import STSampleParquet +from typing import Optional +import time + +# Path to default YAML configuration file +data_yml = Path(__file__).parent / 'configs' / 'create_dataset' / 'default_fast.yaml' + +# CLI command to create a Segger dataset +help_msg = "Create Segger dataset from spatial transcriptomics data (Xenium or MERSCOPE)" +@click.command(name="create_dataset", help=help_msg) +@add_options(config_path=data_yml) +@click.option('--base_dir', type=Path, required=True, help='Directory containing the raw dataset.') +@click.option('--data_dir', type=Path, required=True, help='Directory to save the processed Segger dataset.') +@click.option('--sample_type', type=str, default=None, help='The sample type of the raw data, e.g., "xenium" or "merscope".') +@click.option('--k_bd', type=int, default=3, help='Number of nearest neighbors for boundary nodes.') +@click.option('--dist_bd', type=float, default=15., help='Maximum distance for boundary neighbors.') +@click.option('--k_tx', type=int, default=3, help='Number of nearest neighbors for transcript nodes.') +@click.option('--dist_tx', type=float, default=5., help='Maximum distance for transcript neighbors.') +@click.option('--tile_size', type=int, default=None, help='If provided, specifies the size of the tile. Overrides `tile_width` and `tile_height`.') +@click.option('--tile_width', type=int, default=None, help='Width of the tiles in pixels. Ignored if `tile_size` is provided.') +@click.option('--tile_height', type=int, default=None, help='Height of the tiles in pixels. 
Ignored if `tile_size` is provided.') +@click.option('--neg_sampling_ratio', type=float, default=5., help='Ratio of negative samples.') +@click.option('--frac', type=float, default=1., help='Fraction of the dataset to process.') +@click.option('--val_prob', type=float, default=0.1, help='Proportion of data for use for validation split.') +@click.option('--test_prob', type=float, default=0.2, help='Proportion of data for use for test split.') +@click.option('--n_workers', type=int, default=1, help='Number of workers for parallel processing.') +def create_dataset(args: Namespace): + + # Setup logging + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + ch.setFormatter(CustomFormatter()) + logging.basicConfig(level=logging.INFO, handlers=[ch]) + + # Initialize the sample class + logging.info("Initializing sample...") + sample = STSampleParquet( + base_dir=args.base_dir, + n_workers=args.n_workers, + sample_type=args.sample_type, + ) + + # Save Segger dataset + logging.info("Saving dataset for Segger...") + start_time = time.time() + sample.save( + data_dir=args.data_dir, + k_bd=args.k_bd, + dist_bd=args.dist_bd, + k_tx=args.k_tx, + dist_tx=args.dist_tx, + tile_size=args.tile_size, + tile_width=args.tile_width, + tile_height=args.tile_height, + neg_sampling_ratio=args.neg_sampling_ratio, + frac=args.frac, + val_prob=args.val_prob, + test_prob=args.test_prob, + ) + end_time = time.time() + logging.info(f"Time to save dataset: {end_time - start_time} seconds") + logging.info("Dataset saved successfully.") + +if __name__ == '__main__': + create_dataset() \ No newline at end of file From db561c4fb0294900ce0f75594e24ddc91433cd01 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:38:45 +0200 Subject: [PATCH 076/156] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cc504fd..afa95a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,12 +48,12 @@ faiss = [ rapids11 = [ "cupy-cuda11x", - "cuvs-cu11>=24.4.*", + "cuvs-cu11==24.4.*", ] rapids12 = [ "cupy-cuda12x", - "cuvs-cu12>=24.4.*", + "cuvs-cu12==24.4.*", ] multiprocessing = ["multiprocessing"] From a09a39bb6c71270ea6ea4588f3a9906113a9270e Mon Sep 17 00:00:00 2001 From: "daniel.unyi.42" Date: Thu, 10 Oct 2024 11:45:22 +0000 Subject: [PATCH 077/156] Adjustments in model training --- src/segger/cli/configs/train/default.yaml | 64 +++++++++++++++++++++++ src/segger/cli/train_model.py | 48 ++++++++--------- src/segger/data/parquet/pyg_dataset.py | 6 ++- 3 files changed, 93 insertions(+), 25 deletions(-) create mode 100644 src/segger/cli/configs/train/default.yaml diff --git a/src/segger/cli/configs/train/default.yaml b/src/segger/cli/configs/train/default.yaml new file mode 100644 index 0000000..b685eac --- /dev/null +++ b/src/segger/cli/configs/train/default.yaml @@ -0,0 +1,64 @@ +dataset_dir: + type: Path + required: true + help: Directory containing the processed Segger dataset. +models_dir: + type: Path + required: true + help: Directory to save the trained model and the training logs. +sample_tag: + type: str + required: true + help: Sample tag for the dataset. +init_emb: + type: int + default: 8 + help: Size of the embedding layer. +hidden_channels: + type: int + default: 32 + help: Size of hidden channels in the model. +num_tx_tokens: + type: int + default: 500 + help: Number of transcript tokens. +out_channels: + type: int + default: 8 + help: Number of output channels. 
+heads: + type: int + default: 2 + help: Number of attention heads. +num_mid_layers: + type: int + default: 2 + help: Number of mid layers in the model. +batch_size: + type: int + default: 4 + help: Batch size for training. +num_workers: + type: int + default: 2 + help: Number of workers for data loading. +accelerator: + type: str + default: 'cuda' + help: Device type to use for training (e.g., "cuda", "cpu"). +max_epochs: + type: int + default: 200 + help: Number of epochs for training. +devices: + type: int + default: 4 + help: Number of devices (GPUs) to use. +strategy: + type: str + default: 'auto' + help: Training strategy for the trainer. +precision: + type: str + default: '16-mixed' + help: Precision for training. diff --git a/src/segger/cli/train_model.py b/src/segger/cli/train_model.py index d2edde5..78bd5d7 100644 --- a/src/segger/cli/train_model.py +++ b/src/segger/cli/train_model.py @@ -4,9 +4,12 @@ from segger.cli.utils import add_options, CustomFormatter from pathlib import Path import logging +from argparse import Namespace -help_msg = "Train the Segger segmentation model." +# Path to default YAML configuration file +train_yml = Path(__file__).parent / 'configs' / 'train' / 'default.yaml' +help_msg = "Train the Segger segmentation model." @click.command(name="train_model", help=help_msg) @add_options(config_path=train_yml) @click.option('--dataset_dir', type=Path, required=True, help='Directory containing the processed Segger dataset.') @@ -25,12 +28,7 @@ @click.option('--devices', type=int, default=4, help='Number of devices (GPUs) to use.') @click.option('--strategy', type=str, default='auto', help='Training strategy for the trainer.') @click.option('--precision', type=str, default='16-mixed', help='Precision for training.') -def train_model(dataset_dir: Path, models_dir: Path, sample_tag: str, - init_emb: int = 8, hidden_channels: int = 32, num_tx_tokens: int = 500, - out_channels: int = 8, heads: int = 2, num_mid_layers: int = 2, - batch_size: int = 4, num_workers: int = 2, - accelerator: str = 'cuda', max_epochs: int = 200, - devices: int = 4, strategy: str = 'auto', precision: str = '16-mixed'): +def train_model(args: Namespace): # Setup logging ch = logging.StreamHandler() @@ -50,9 +48,9 @@ def train_model(dataset_dir: Path, models_dir: Path, sample_tag: str, # Load datasets logging.info("Loading Xenium datasets...") dm = SeggerDataModule( - data_dir=dataset_dir, - batch_size=batch_size, # Hard-coded batch size - num_workers=num_workers, # Hard-coded number of workers + data_dir=args.dataset_dir, + batch_size=args.batch_size, # Hard-coded batch size + num_workers=args.num_workers, # Hard-coded number of workers ) dm.setup() @@ -62,25 +60,25 @@ def train_model(dataset_dir: Path, models_dir: Path, sample_tag: str, logging.info("Initializing Segger model and trainer...") metadata = (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]) ls = LitSegger( - num_tx_tokens=num_tx_tokens, - init_emb=init_emb, - hidden_channels=hidden_channels, - out_channels=out_channels, # Hard-coded value - heads=heads, # Hard-coded value - num_mid_layers=num_mid_layers, # Hard-coded value + num_tx_tokens=args.num_tx_tokens, + init_emb=args.init_emb, + hidden_channels=args.hidden_channels, + out_channels=args.out_channels, # Hard-coded value + heads=args.heads, # Hard-coded value + num_mid_layers=args.num_mid_layers, # Hard-coded value aggr='sum', # Hard-coded value metadata=metadata, ) # Initialize the Lightning trainer trainer = Trainer( - accelerator=accelerator, # Directly 
use the specified accelerator - strategy=strategy, # Hard-coded value - precision=precision, # Hard-coded value - devices=devices, # Hard-coded value - max_epochs=max_epochs, # Hard-coded value - default_root_dir=models_dir, - logger=CSVLogger(models_dir), + accelerator=args.accelerator, # Directly use the specified accelerator + strategy=args.strategy, # Hard-coded value + precision=args.precision, # Hard-coded value + devices=args.devices, # Hard-coded value + max_epochs=args.max_epochs, # Hard-coded value + default_root_dir=args.models_dir, + logger=CSVLogger(args.models_dir), ) logging.info("Done.") @@ -93,7 +91,6 @@ def train_model(dataset_dir: Path, models_dir: Path, sample_tag: str, ) logging.info("Done.") -train_yml = Path(__file__).parent / 'configs' / 'train' / 'default.yaml' @click.command(name="slurm", help="Train on Slurm cluster") @add_options(config_path=train_yml) @@ -105,3 +102,6 @@ def train(): pass train.add_command(train_slurm) + +if __name__ == '__main__': + train_model() \ No newline at end of file diff --git a/src/segger/data/parquet/pyg_dataset.py b/src/segger/data/parquet/pyg_dataset.py index 8642ae3..c4c4602 100644 --- a/src/segger/data/parquet/pyg_dataset.py +++ b/src/segger/data/parquet/pyg_dataset.py @@ -63,5 +63,9 @@ def get(self, idx: int) -> Data: """ filepath = Path(self.processed_dir) / self.processed_file_names[idx] data = torch.load(filepath) - data['tx'].x = data['tx'].x.to_dense() + data['tx'].x = data['tx'].x.to_dense().unsqueeze(1) + # this is an issue in PyG's RandomLinkSplit, dimensions are not consistent if there is only one edge in the graph + if data['tx', 'belongs', 'bd'].edge_label_index.shape.__len__() < 2: + data['tx', 'belongs', 'bd'].edge_label_index = data['tx', 'belongs', 'bd'].edge_label_index.unsqueeze(1) + data['tx', 'belongs', 'bd'].edge_label = data['tx', 'belongs', 'bd'].edge_label.unsqueeze(0) return data \ No newline at end of file From f2f245d18be09bc9483130e7c9b4ed57f2f6a46e Mon Sep 17 00:00:00 2001 From: "daniel.unyi.42" Date: Thu, 10 Oct 2024 13:43:35 +0000 Subject: [PATCH 078/156] PyG dataset dimension check --- src/segger/data/parquet/pyg_dataset.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/segger/data/parquet/pyg_dataset.py b/src/segger/data/parquet/pyg_dataset.py index c4c4602..5599cb3 100644 --- a/src/segger/data/parquet/pyg_dataset.py +++ b/src/segger/data/parquet/pyg_dataset.py @@ -63,9 +63,13 @@ def get(self, idx: int) -> Data: """ filepath = Path(self.processed_dir) / self.processed_file_names[idx] data = torch.load(filepath) - data['tx'].x = data['tx'].x.to_dense().unsqueeze(1) + data['tx'].x = data['tx'].x.to_dense() + if data['tx'].x.dim() == 1: + data['tx'].x = data['tx'].x.unsqueeze(1) + assert data['tx'].x.dim() == 2 # this is an issue in PyG's RandomLinkSplit, dimensions are not consistent if there is only one edge in the graph - if data['tx', 'belongs', 'bd'].edge_label_index.shape.__len__() < 2: + if data['tx', 'belongs', 'bd'].edge_label_index.dim() == 1: data['tx', 'belongs', 'bd'].edge_label_index = data['tx', 'belongs', 'bd'].edge_label_index.unsqueeze(1) data['tx', 'belongs', 'bd'].edge_label = data['tx', 'belongs', 'bd'].edge_label.unsqueeze(0) - return data \ No newline at end of file + assert data['tx', 'belongs', 'bd'].edge_label_index.dim() == 2 + return data From b8f4cca4a16c40b0837aff4372110f3b376d81e3 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 13 Oct 2024 18:02:04 +0200 Subject: [PATCH 079/156] added pre-commit: black, blacken-docs, prettier 
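This wires black, blacken-docs, and prettier into pre-commit so that Python sources, Python snippets embedded in the docs, and YAML/Markdown files are formatted consistently. A minimal usage sketch (standard pre-commit CLI commands, illustrative only and not part of this patch): contributors enable the hooks once per clone and can format the whole tree on demand.

    # illustrative usage, not part of this patch
    pip install pre-commit
    pre-commit install          # install the git hook; hooks then run on every commit
    pre-commit run --all-files  # one-off formatting pass over the entire repository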
--- .pre-commit-config.yaml | 22 ++++++++++++++++++++++ .prettierignore | 3 +++ pyproject.toml | 20 ++++++++++++++++++++ 3 files changed, 45 insertions(+) create mode 100644 .pre-commit-config.yaml create mode 100644 .prettierignore diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..781c996 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,22 @@ +fail_fast: false +default_language_version: + python: python3 +default_stages: + - commit + - push +minimum_pre_commit_version: 2.16.0 +ci: + skip: [] +repos: + - repo: https://github.com/psf/black + rev: 24.8.0 + hooks: + - id: black + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v4.0.0-alpha.8 + hooks: + - id: prettier + - repo: https://github.com/asottile/blacken-docs + rev: 1.18.0 + hooks: + - id: blacken-docs diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..f37d787 --- /dev/null +++ b/.prettierignore @@ -0,0 +1,3 @@ +# Ignore artifacts: +docs/* +mkdocs.yml diff --git a/pyproject.toml b/pyproject.toml index afa95a7..c073c3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -99,3 +99,23 @@ repository = "https://github.com/EliHei2/segger_dev" [tool.setuptools] packages = ["segger"] package-dir = {"" = "src"} + +[tool.black] +line-length = 120 +target-version = ['py39'] +include = '\.pyi?$' +exclude = ''' +( + /( + \.eggs + | \.git + | \.hg + | \.tox + | \.venv + | _build + | buck-out + | build + | dist + )/ +) +''' From abf9fd8d79ffc2bcc71a305a961c1dc1a1d2bdec Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 13 Oct 2024 18:05:36 +0200 Subject: [PATCH 080/156] test run pre-commit From 77cb46d25839b1c0d241b4b10e7322bd65c25d24 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 13 Oct 2024 18:16:22 +0200 Subject: [PATCH 081/156] added badge --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a079a15..a077fa5 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # 🍳 Welcome to segger! +[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/EliHei2/segger_dev/main.svg)](https://results.pre-commit.ci/latest/github/EliHei2/segger_dev/main) + **segger** is a cutting-edge tool for **cell segmentation** in **single-molecule spatial omics** datasets. By leveraging **graph neural networks (GNNs)** and heterogeneous graphs, segger offers unmatched accuracy and scalability. # How segger Works @@ -47,7 +49,6 @@ segger tackles these with a **graph-based approach**, achieving superior segment ## Installation Options - ### Important: PyTorch Geometric Dependencies Segger **highly depends** on PyTorch Geometric. One **must** install its dependencies (such as `torch-sparse` and `torch-scatter`) based on their system’s specifications, especially CUDA and PyTorch versions. @@ -70,10 +71,8 @@ pip install torch-scatter -f https://data.pyg.org/whl/torch-2.0.0+cu120.html pip install torch-sparse -f https://data.pyg.org/whl/torch-2.0.0+cu120.html ``` - Afterwards choose the installation method that best suits your needs. 
- ### Micromamba Installation To set up Segger with `micromamba` and install the required dependencies, use the following commands: From 2a1494d115040136d2ff24dda3430531899c037a Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 13 Oct 2024 18:48:06 +0200 Subject: [PATCH 082/156] black on the whole repo; wip preliminary spatialdata support --- .../\342\236\225-performance-improvement.md" | 5 +- .../\360\237\220\233-bug-report.md" | 6 +- .../\360\237\232\200-feature-request.md" | 7 +- .github/workflows/python-publish.yml | 33 +- .github/workflows/static.yml | 2 +- .gitignore | 3 + .pre-commit-config.yaml | 32 +- .scripts/create_dataset.py | 46 +- .scripts/predict.py | 24 +- .scripts/train_model.py | 36 +- docs/notebooks/benchmark_bc.py | 114 ++- docs/source/conf.py | 15 +- docs/user_guide/data_creation.md | 41 +- docs/user_guide/training.md | 12 +- scripts/create_data_sample.py | 23 +- scripts/predict_model_sample.py | 33 +- scripts/train_model.py | 62 +- scripts/train_model_sample.py | 26 +- src/segger/__init__.py | 9 +- src/segger/cli/cli.py | 4 +- src/segger/cli/configs/train/default.yaml | 6 +- src/segger/cli/create_dataset.py | 75 +- src/segger/cli/create_dataset_fast.py | 52 +- src/segger/cli/predict.py | 86 ++- src/segger/cli/train_model.py | 56 +- src/segger/cli/utils.py | 34 +- src/segger/data/README.md | 89 +-- src/segger/data/__init__.py | 37 +- src/segger/data/constants.py | 7 +- src/segger/data/io.py | 578 ++++++++------- src/segger/data/parquet/__init__.py | 0 src/segger/data/parquet/_experimental.py | 36 +- src/segger/data/parquet/_ndtree.py | 37 +- src/segger/data/parquet/_settings/xenium.yaml | 10 +- src/segger/data/parquet/_utils.py | 87 ++- src/segger/data/parquet/pyg_dataset.py | 24 +- src/segger/data/parquet/sample.py | 250 +++---- .../data/parquet/transcript_embedding.py | 24 +- src/segger/data/utils.py | 259 ++++--- src/segger/models/README.md | 20 +- src/segger/models/__init__.py | 4 +- src/segger/models/segger_model.py | 29 +- src/segger/prediction/__init__.py | 5 +- src/segger/prediction/predict.py | 228 +++--- src/segger/training/README.md | 6 + src/segger/training/segger_data_module.py | 6 +- src/segger/training/train.py | 40 +- src/segger/validation/__init__.py | 2 +- src/segger/validation/utils.py | 678 +++++++++--------- src/segger/validation/xenium_explorer.py | 282 +++++--- tests/test_data.py | 64 +- tests/test_model.py | 25 +- tests/test_prediction.py | 14 +- tests/test_training.py | 25 +- 54 files changed, 1885 insertions(+), 1823 deletions(-) create mode 100644 src/segger/data/parquet/__init__.py diff --git "a/.github/ISSUE_TEMPLATE/\342\236\225-performance-improvement.md" "b/.github/ISSUE_TEMPLATE/\342\236\225-performance-improvement.md" index b281b2f..9b5cb1f 100644 --- "a/.github/ISSUE_TEMPLATE/\342\236\225-performance-improvement.md" +++ "b/.github/ISSUE_TEMPLATE/\342\236\225-performance-improvement.md" @@ -1,10 +1,9 @@ --- name: "➕ Performance Improvement" about: Suggest an improvement in the performance -title: '' -labels: '' +title: "" +labels: "" assignees: andrewmoorman, EliHei2 - --- **Describe the issue with the current implementation** diff --git "a/.github/ISSUE_TEMPLATE/\360\237\220\233-bug-report.md" "b/.github/ISSUE_TEMPLATE/\360\237\220\233-bug-report.md" index b899e5a..5809219 100644 --- "a/.github/ISSUE_TEMPLATE/\360\237\220\233-bug-report.md" +++ "b/.github/ISSUE_TEMPLATE/\360\237\220\233-bug-report.md" @@ -2,12 +2,12 @@ name: "\U0001F41B Bug Report" about: Create a report to help us improve title: "[BUG]" -labels: '' +labels: "" 
assignees: andrewmoorman, EliHei2 - --- --- + name: Bug Report about: Report a bug or unexpected behavior title: "[BUG] " @@ -21,6 +21,7 @@ A clear and concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior: + 1. Go to '...' 2. Click on '....' 3. Scroll down to '....' @@ -33,6 +34,7 @@ A clear and concise description of what you expected to happen. If applicable, add screenshots or logs to help explain your problem. **Environment (please complete the following information):** + - OS: [e.g. macOS, Windows, Linux] - Python version: [e.g. 3.9] - Package version: [e.g. 1.2.3] diff --git "a/.github/ISSUE_TEMPLATE/\360\237\232\200-feature-request.md" "b/.github/ISSUE_TEMPLATE/\360\237\232\200-feature-request.md" index 08679f6..67644f2 100644 --- "a/.github/ISSUE_TEMPLATE/\360\237\232\200-feature-request.md" +++ "b/.github/ISSUE_TEMPLATE/\360\237\232\200-feature-request.md" @@ -1,10 +1,9 @@ --- name: "\U0001F680 Feature Request" about: Suggest an idea for this project -title: '' -labels: '' -assignees: '' - +title: "" +labels: "" +assignees: "" --- **Is your feature request related to a problem? Please describe.** diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index b7a704b..c16ebea 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -17,23 +17,22 @@ permissions: jobs: deploy: - runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v3 - with: - python-version: '3.x' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build - - name: Build package - run: python -m build - - name: Publish package - uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 - with: - user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: "3.x" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index b6a7e3a..146ad51 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -25,7 +25,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: "3.10" - name: Install package and documentation dependencies run: | diff --git a/.gitignore b/.gitignore index d5292d6..84adb42 100644 --- a/.gitignore +++ b/.gitignore @@ -163,9 +163,12 @@ cython_debug/ data_* +data/* model_* *.egg_info figure* dev* +.DS_Store +.idea/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 781c996..a1e1760 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,22 +1,22 @@ fail_fast: false default_language_version: - python: python3 + python: python3 default_stages: - - commit - - push + - commit + - push minimum_pre_commit_version: 2.16.0 ci: - skip: [] + skip: [] repos: - - repo: https://github.com/psf/black - rev: 24.8.0 - hooks: - - id: black - - repo: https://github.com/pre-commit/mirrors-prettier - rev: v4.0.0-alpha.8 - hooks: - - id: prettier - - repo: https://github.com/asottile/blacken-docs - rev: 1.18.0 - hooks: - - id: blacken-docs + - repo: 
https://github.com/psf/black + rev: 24.8.0 + hooks: + - id: black + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v4.0.0-alpha.8 + hooks: + - id: prettier + - repo: https://github.com/asottile/blacken-docs + rev: 1.18.0 + hooks: + - id: blacken-docs diff --git a/.scripts/create_dataset.py b/.scripts/create_dataset.py index 27de8af..0692eb4 100644 --- a/.scripts/create_dataset.py +++ b/.scripts/create_dataset.py @@ -2,7 +2,7 @@ import os from pathlib import Path from urllib import request -from segger.data.utils import XeniumSample +from segger.data.io import XeniumSample def download_file(url, dest): @@ -30,9 +30,7 @@ def main(args): download_file(transcripts_url, transcripts_path) download_file(nuclei_url, nuclei_path) - xs = XeniumSample().load_transcripts( - path=transcripts_path, min_qv=args.min_qv - ) + xs = XeniumSample().load_transcripts(path=transcripts_path, min_qv=args.min_qv) xs.load_nuclei(path=nuclei_path) if args.parallel: @@ -83,9 +81,7 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Create dataset from Xenium Human Pancreatic data." - ) + parser = argparse.ArgumentParser(description="Create dataset from Xenium Human Pancreatic data.") parser.add_argument( "--raw_data_dir", type=str, @@ -104,9 +100,7 @@ def main(args): required=True, help="URL for transcripts data.", ) - parser.add_argument( - "--nuclei_url", type=str, required=True, help="URL for nuclei data." - ) + parser.add_argument("--nuclei_url", type=str, required=True, help="URL for nuclei data.") parser.add_argument( "--min_qv", type=int, @@ -125,21 +119,11 @@ def main(args): default=180, help="Step size in y direction for tiles.", ) - parser.add_argument( - "--x_size", type=int, default=200, help="Width of each tile." - ) - parser.add_argument( - "--y_size", type=int, default=200, help="Height of each tile." - ) - parser.add_argument( - "--margin_x", type=int, default=None, help="Margin in x direction." - ) - parser.add_argument( - "--margin_y", type=int, default=None, help="Margin in y direction." - ) - parser.add_argument( - "--r_tx", type=int, default=3, help="Radius for building the graph." - ) + parser.add_argument("--x_size", type=int, default=200, help="Width of each tile.") + parser.add_argument("--y_size", type=int, default=200, help="Height of each tile.") + parser.add_argument("--margin_x", type=int, default=None, help="Margin in x direction.") + parser.add_argument("--margin_y", type=int, default=None, help="Margin in y direction.") + parser.add_argument("--r_tx", type=int, default=3, help="Radius for building the graph.") parser.add_argument( "--val_prob", type=float, @@ -158,9 +142,7 @@ def main(args): default=3, help="Number of nearest neighbors for nuclei.", ) - parser.add_argument( - "--dist_nc", type=int, default=10, help="Distance threshold for nuclei." - ) + parser.add_argument("--dist_nc", type=int, default=10, help="Distance threshold for nuclei.") parser.add_argument( "--k_tx", type=int, @@ -179,12 +161,8 @@ def main(args): default=True, help="Whether to compute edge labels.", ) - parser.add_argument( - "--sampling_rate", type=float, default=1, help="Rate of sampling tiles." - ) - parser.add_argument( - "--parallel", action="store_true", help="Use parallel processing." 
- ) + parser.add_argument("--sampling_rate", type=float, default=1, help="Rate of sampling tiles.") + parser.add_argument("--parallel", action="store_true", help="Use parallel processing.") parser.add_argument( "--num_workers", type=int, diff --git a/.scripts/predict.py b/.scripts/predict.py index f812822..9a095f4 100644 --- a/.scripts/predict.py +++ b/.scripts/predict.py @@ -30,9 +30,7 @@ def main(args: argparse.Namespace) -> None: if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Predict using the Segger model" - ) + parser = argparse.ArgumentParser(description="Predict using the Segger model") parser.add_argument( "--dataset_path", type=str, @@ -51,24 +49,16 @@ def main(args: argparse.Namespace) -> None: required=True, help="Path to the model checkpoint", ) - parser.add_argument( - "--init_emb", type=int, default=8, help="Initial embedding size" - ) + parser.add_argument("--init_emb", type=int, default=8, help="Initial embedding size") parser.add_argument( "--hidden_channels", type=int, default=64, help="Number of hidden channels", ) - parser.add_argument( - "--out_channels", type=int, default=16, help="Number of output channels" - ) - parser.add_argument( - "--heads", type=int, default=4, help="Number of attention heads" - ) - parser.add_argument( - "--aggr", type=str, default="sum", help="Aggregation method" - ) + parser.add_argument("--out_channels", type=int, default=16, help="Number of output channels") + parser.add_argument("--heads", type=int, default=4, help="Number of attention heads") + parser.add_argument("--aggr", type=str, default="sum", help="Aggregation method") parser.add_argument( "--score_cut", type=float, @@ -81,9 +71,7 @@ def main(args: argparse.Namespace) -> None: default=4, help="Number of nearest neighbors for nuclei", ) - parser.add_argument( - "--dist_nc", type=int, default=20, help="Distance threshold for nuclei" - ) + parser.add_argument("--dist_nc", type=int, default=20, help="Distance threshold for nuclei") parser.add_argument( "--k_tx", type=int, diff --git a/.scripts/train_model.py b/.scripts/train_model.py index 8a6ee85..2515a71 100644 --- a/.scripts/train_model.py +++ b/.scripts/train_model.py @@ -95,39 +95,21 @@ def main(args): default=4, help="Batch size for validation", ) - parser.add_argument( - "--init_emb", type=int, default=8, help="Initial embedding size" - ) + parser.add_argument("--init_emb", type=int, default=8, help="Initial embedding size") parser.add_argument( "--hidden_channels", type=int, default=64, help="Number of hidden channels", ) - parser.add_argument( - "--out_channels", type=int, default=16, help="Number of output channels" - ) - parser.add_argument( - "--heads", type=int, default=4, help="Number of attention heads" - ) - parser.add_argument( - "--aggr", type=str, default="sum", help="Aggregation method" - ) - parser.add_argument( - "--accelerator", type=str, default="cuda", help="Type of accelerator" - ) - parser.add_argument( - "--strategy", type=str, default="auto", help="Training strategy" - ) - parser.add_argument( - "--precision", type=str, default="16-mixed", help="Precision mode" - ) - parser.add_argument( - "--devices", type=int, default=4, help="Number of devices" - ) - parser.add_argument( - "--epochs", type=int, default=100, help="Number of epochs" - ) + parser.add_argument("--out_channels", type=int, default=16, help="Number of output channels") + parser.add_argument("--heads", type=int, default=4, help="Number of attention heads") + parser.add_argument("--aggr", type=str, default="sum", 
help="Aggregation method") + parser.add_argument("--accelerator", type=str, default="cuda", help="Type of accelerator") + parser.add_argument("--strategy", type=str, default="auto", help="Training strategy") + parser.add_argument("--precision", type=str, default="16-mixed", help="Precision mode") + parser.add_argument("--devices", type=int, default=4, help="Number of devices") + parser.add_argument("--epochs", type=int, default=100, help="Number of epochs") parser.add_argument( "--default_root_dir", type=str, diff --git a/docs/notebooks/benchmark_bc.py b/docs/notebooks/benchmark_bc.py index 31ac0bd..8b9a3fc 100644 --- a/docs/notebooks/benchmark_bc.py +++ b/docs/notebooks/benchmark_bc.py @@ -8,54 +8,54 @@ from segger.validation.utils import * # Define paths and output directories -benchmarks_path = Path('/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc') -output_path = benchmarks_path / 'results+' -figures_path = output_path / 'figures' +benchmarks_path = Path("/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc") +output_path = benchmarks_path / "results+" +figures_path = output_path / "figures" figures_path.mkdir(parents=True, exist_ok=True) # Ensure the figures directory exists # Define colors for segmentation methods method_colors = { - 'segger': '#D55E00', - 'segger_n0': '#E69F00', - 'segger_n1': '#F0E442', - 'Baysor': '#0072B2', - '10X': '#009E73', - '10X-nucleus': '#CC79A7', - 'BIDCell': '#8B008B' + "segger": "#D55E00", + "segger_n0": "#E69F00", + "segger_n1": "#F0E442", + "Baysor": "#0072B2", + "10X": "#009E73", + "10X-nucleus": "#CC79A7", + "BIDCell": "#8B008B", } # Define colors for cell types major_colors = { - 'B-cells': '#d8f55e', - 'CAFs': '#532C8A', - 'Cancer Epithelial': '#C72228', - 'Endothelial': '#9e6762', - 'Myeloid': '#ffe012', - 'T-cells': '#3cb44b', - 'Normal Epithelial': '#0F4A9C', - 'PVL': '#c09d9a', - 'Plasmablasts': '#000075' + "B-cells": "#d8f55e", + "CAFs": "#532C8A", + "Cancer Epithelial": "#C72228", + "Endothelial": "#9e6762", + "Myeloid": "#ffe012", + "T-cells": "#3cb44b", + "Normal Epithelial": "#0F4A9C", + "PVL": "#c09d9a", + "Plasmablasts": "#000075", } # Define segmentation file paths segmentation_paths = { - 'segger': benchmarks_path / 'adata_segger.h5ad', - 'Baysor': benchmarks_path / 'adata_baysor.h5ad', - '10X': benchmarks_path / 'adata_10X.h5ad', - '10X-nucleus': benchmarks_path / 'adata_10X_nuc.h5ad', - 'BIDCell': benchmarks_path / 'adata_BIDCell.h5ad' + "segger": benchmarks_path / "adata_segger.h5ad", + "Baysor": benchmarks_path / "adata_baysor.h5ad", + "10X": benchmarks_path / "adata_10X.h5ad", + "10X-nucleus": benchmarks_path / "adata_10X_nuc.h5ad", + "BIDCell": benchmarks_path / "adata_BIDCell.h5ad", } # Load the segmentations and the scRNAseq data segmentations_dict = load_segmentations(segmentation_paths) segmentations_dict = {k: segmentations_dict[k] for k in method_colors.keys() if k in segmentations_dict} -scRNAseq_adata = sc.read(benchmarks_path / 'scRNAseq.h5ad') +scRNAseq_adata = sc.read(benchmarks_path / "scRNAseq.h5ad") # Generate general statistics plots plot_general_statistics_plots(segmentations_dict, figures_path, method_colors) # Find markers for scRNAseq data -markers = find_markers(scRNAseq_adata, cell_type_column='celltype_major', pos_percentile=30, neg_percentile=5) +markers = find_markers(scRNAseq_adata, cell_type_column="celltype_major", pos_percentile=30, neg_percentile=5) # Annotate spatial segmentations with scRNAseq reference data for method in 
segmentation_paths.keys(): @@ -68,9 +68,7 @@ # Find mutually exclusive genes based on scRNAseq data exclusive_gene_pairs = find_mutually_exclusive_genes( - adata=scRNAseq_adata, - markers=markers, - cell_type_column='celltype_major' + adata=scRNAseq_adata, markers=markers, cell_type_column="celltype_major" ) # Compute MECR for each segmentation method @@ -83,14 +81,12 @@ quantized_mecr_counts = {} for method in segmentations_dict.keys(): - if 'cell_area' in segmentations_dict[method].obs.columns: + if "cell_area" in segmentations_dict[method].obs.columns: quantized_mecr_area[method] = compute_quantized_mecr_area( - adata=segmentations_dict[method], - gene_pairs=exclusive_gene_pairs + adata=segmentations_dict[method], gene_pairs=exclusive_gene_pairs ) quantized_mecr_counts[method] = compute_quantized_mecr_counts( - adata=segmentations_dict[method], - gene_pairs=exclusive_gene_pairs + adata=segmentations_dict[method], gene_pairs=exclusive_gene_pairs ) # Plot MECR results @@ -99,26 +95,30 @@ plot_quantized_mecr_counts(quantized_mecr_counts, output_path=figures_path, palette=method_colors) # Filter segmentation methods for contamination analysis -new_segmentations_dict = {k: v for k, v in segmentations_dict.items() if k in ['segger', 'Baysor', '10X', '10X-nucleus', 'BIDCell']} +new_segmentations_dict = { + k: v for k, v in segmentations_dict.items() if k in ["segger", "Baysor", "10X", "10X-nucleus", "BIDCell"] +} # Compute contamination results contamination_results = {} for method, adata in new_segmentations_dict.items(): - if 'cell_centroid_x' in adata.obs.columns and 'cell_centroid_y' in adata.obs.columns: + if "cell_centroid_x" in adata.obs.columns and "cell_centroid_y" in adata.obs.columns: contamination_results[method] = calculate_contamination( adata=adata, markers=markers, # Assuming you have a dictionary of markers for cell types radius=15, n_neighs=20, - celltype_column='celltype_major', - num_cells=10000 + celltype_column="celltype_major", + num_cells=10000, ) # Prepare contamination data for boxplots boxplot_data = [] for method, df in contamination_results.items(): - melted_df = df.reset_index().melt(id_vars=['Source Cell Type'], var_name='Target Cell Type', value_name='Contamination') - melted_df['Segmentation Method'] = method + melted_df = df.reset_index().melt( + id_vars=["Source Cell Type"], var_name="Target Cell Type", value_name="Contamination" + ) + melted_df["Segmentation Method"] = method boxplot_data.append(melted_df) # Concatenate all contamination dataframes into one @@ -129,13 +129,13 @@ plot_contamination_boxplots(boxplot_data, output_path=figures_path, palette=method_colors) # Separate Segger into nucleus-positive and nucleus-negative cells -segmentations_dict['segger_n1'] = segmentations_dict['segger'][segmentations_dict['segger'].obs.has_nucleus] -segmentations_dict['segger_n0'] = segmentations_dict['segger'][~segmentations_dict['segger'].obs.has_nucleus] +segmentations_dict["segger_n1"] = segmentations_dict["segger"][segmentations_dict["segger"].obs.has_nucleus] +segmentations_dict["segger_n0"] = segmentations_dict["segger"][~segmentations_dict["segger"].obs.has_nucleus] # Compute clustering scores for all segmentation methods clustering_scores = {} for method, adata in segmentations_dict.items(): - ch_score, sh_score = compute_clustering_scores(adata, cell_type_column='celltype_major') + ch_score, sh_score = compute_clustering_scores(adata, cell_type_column="celltype_major") clustering_scores[method] = (ch_score, sh_score) # Plot UMAPs with clustering scores in 
the title @@ -143,20 +143,22 @@ # Compute neighborhood metrics for methods with spatial data for method, adata in segmentations_dict.items(): - if 'spatial' in list(adata.obsm.keys()): - compute_neighborhood_metrics(adata, radius=15, celltype_column='celltype_major') + if "spatial" in list(adata.obsm.keys()): + compute_neighborhood_metrics(adata, radius=15, celltype_column="celltype_major") # Prepare neighborhood entropy data for boxplots entropy_boxplot_data = [] for method, adata in segmentations_dict.items(): - if 'neighborhood_entropy' in adata.obs.columns: - entropy_df = pd.DataFrame({ - 'Cell Type': adata.obs['celltype_major'], - 'Neighborhood Entropy': adata.obs['neighborhood_entropy'], - 'Segmentation Method': method - }) + if "neighborhood_entropy" in adata.obs.columns: + entropy_df = pd.DataFrame( + { + "Cell Type": adata.obs["celltype_major"], + "Neighborhood Entropy": adata.obs["neighborhood_entropy"], + "Segmentation Method": method, + } + ) # Filter out NaN values, keeping only the subsetted cells - entropy_df = entropy_df.dropna(subset=['Neighborhood Entropy']) + entropy_df = entropy_df.dropna(subset=["Neighborhood Entropy"]) entropy_boxplot_data.append(entropy_df) # Concatenate all entropy dataframes into one @@ -166,7 +168,7 @@ plot_entropy_boxplots(entropy_boxplot_data, figures_path, palette=method_colors) # Find markers for sensitivity calculation -purified_markers = find_markers(scRNAseq_adata, 'celltype_major', pos_percentile=20, percentage=75) +purified_markers = find_markers(scRNAseq_adata, "celltype_major", pos_percentile=20, percentage=75) # Calculate sensitivity for each segmentation method sensitivity_results_per_method = {} @@ -178,11 +180,7 @@ sensitivity_boxplot_data = [] for method, sensitivity_results in sensitivity_results_per_method.items(): for cell_type, sensitivities in sensitivity_results.items(): - method_df = pd.DataFrame({ - 'Cell Type': cell_type, - 'Sensitivity': sensitivities, - 'Segmentation Method': method - }) + method_df = pd.DataFrame({"Cell Type": cell_type, "Sensitivity": sensitivities, "Segmentation Method": method}) sensitivity_boxplot_data.append(method_df) # Concatenate all sensitivity dataframes into one diff --git a/docs/source/conf.py b/docs/source/conf.py index 0cc5f81..17e7f88 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -6,23 +6,22 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -project = 'segger' -copyright = '2024, Elyas Heidari' -author = 'Elyas Heidari' -release = '0.01' +project = "segger" +copyright = "2024, Elyas Heidari" +author = "Elyas Heidari" +release = "0.01" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = [] -templates_path = ['_templates'] +templates_path = ["_templates"] exclude_patterns = [] - # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = 'alabaster' -html_static_path = ['_static'] +html_theme = "alabaster" +html_static_path = ["_static"] diff --git a/docs/user_guide/data_creation.md b/docs/user_guide/data_creation.md index 571f9ef..8d27140 100644 --- a/docs/user_guide/data_creation.md +++ b/docs/user_guide/data_creation.md @@ -140,19 +140,22 @@ from pathlib import Path import scanpy as sc # Set up the file paths 
-raw_data_dir = Path('/path/to/xenium_output') -processed_data_dir = Path('path/to/processed_files') +raw_data_dir = Path("/path/to/xenium_output") +processed_data_dir = Path("path/to/processed_files") sample_tag = "sample/tag" # Load scRNA-seq data using Scanpy and subsample for efficiency -scRNAseq_path = 'path/to/scRNAseq.h5ad' +scRNAseq_path = "path/to/scRNAseq.h5ad" scRNAseq = sc.read(scRNAseq_path) sc.pp.subsample(scRNAseq, fraction=0.1) # Calculate gene cell type abundance embedding from scRNA-seq data from segger.utils import calculate_gene_celltype_abundance_embedding -celltype_column = 'celltype_column' -gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(scRNAseq, celltype_column) + +celltype_column = "celltype_column" +gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding( + scRNAseq, celltype_column +) # Create a XeniumSample instance for spatial transcriptomics processing xenium_sample = XeniumSample() @@ -161,9 +164,9 @@ xenium_sample = XeniumSample() xenium_sample.load_transcripts( base_path=raw_data_dir, sample=sample_tag, - transcripts_filename='transcripts.parquet', + transcripts_filename="transcripts.parquet", file_format="parquet", - additional_embeddings={"cell_type_abundance": gene_celltype_abundance_embedding} + additional_embeddings={"cell_type_abundance": gene_celltype_abundance_embedding}, ) # Set the embedding to "cell_type_abundance" to use it in further processing @@ -171,7 +174,7 @@ xenium_sample.set_embedding("cell_type_abundance") # Load nuclei data to define boundaries nuclei_path = raw_data_dir / sample_tag / "nucleus_boundaries.parquet" -xenium_sample.load_boundaries(path=nuclei_path, file_format='parquet') +xenium_sample.load_boundaries(path=nuclei_path, file_format="parquet") # Build PyTorch Geometric (PyG) data from a tile of the dataset tile_pyg_data = xenium_sample.build_pyg_data_from_tile( @@ -180,7 +183,7 @@ tile_pyg_data = xenium_sample.build_pyg_data_from_tile( r_tx=20, k_tx=20, use_precomputed=False, - workers=1 + workers=1, ) # Save dataset in processed format for segmentation @@ -199,7 +202,7 @@ xenium_sample.save_dataset_for_segger( test_prob=0.2, neg_sampling_ratio_approx=5, sampling_rate=1, - num_workers=1 + num_workers=1, ) ``` @@ -210,8 +213,8 @@ from segger.data import MerscopeSample from pathlib import Path # Set up the file paths -raw_data_dir = Path('path/to/merscope_outputs') -processed_data_dir = Path('path/to/processed_files') +raw_data_dir = Path("path/to/merscope_outputs") +processed_data_dir = Path("path/to/processed_files") sample_tag = "sample_tag" # Create a MerscopeSample instance for spatial transcriptomics processing @@ -221,16 +224,18 @@ merscope_sample = MerscopeSample() merscope_sample.load_transcripts( base_path=raw_data_dir, sample=sample_tag, - transcripts_filename='transcripts.csv', - file_format='csv' + transcripts_filename="transcripts.csv", + file_format="csv", ) # Optionally load cell boundaries cell_boundaries_path = raw_data_dir / sample_tag / "cell_boundaries.parquet" -merscope_sample.load_boundaries(path=cell_boundaries_path, file_format='parquet') +merscope_sample.load_boundaries(path=cell_boundaries_path, file_format="parquet") # Filter transcripts based on specific criteria -filtered_transcripts = merscope_sample.filter_transcripts(merscope_sample.transcripts_df) +filtered_transcripts = merscope_sample.filter_transcripts( + merscope_sample.transcripts_df +) # Build PyTorch Geometric (PyG) data from a tile of the dataset tile_pyg_data = 
merscope_sample.build_pyg_data_from_tile( @@ -239,7 +244,7 @@ tile_pyg_data = merscope_sample.build_pyg_data_from_tile( r_tx=15, k_tx=15, use_precomputed=True, - workers=2 + workers=2, ) # Save dataset in processed format for segmentation @@ -258,6 +263,6 @@ merscope_sample.save_dataset_for_segger( test_prob=0.2, neg_sampling_ratio_approx=3, sampling_rate=1, - num_workers=2 + num_workers=2, ) ``` diff --git a/docs/user_guide/training.md b/docs/user_guide/training.md index 151fc66..8b78f0c 100644 --- a/docs/user_guide/training.md +++ b/docs/user_guide/training.md @@ -69,12 +69,12 @@ To instantiate and run the `segger` model: ```python model = segger( - num_tx_tokens=5000, # Number of unique 'tx' tokens - init_emb=32, # Initial embedding dimension - hidden_channels=64, # Number of hidden channels - num_mid_layers=2, # Number of middle layers - out_channels=128, # Number of output channels - heads=4 # Number of attention heads + num_tx_tokens=5000, # Number of unique 'tx' tokens + init_emb=32, # Initial embedding dimension + hidden_channels=64, # Number of hidden channels + num_mid_layers=2, # Number of middle layers + out_channels=128, # Number of output channels + heads=4, # Number of attention heads ) output = model(x, edge_index) diff --git a/scripts/create_data_sample.py b/scripts/create_data_sample.py index 8cdb137..3e37d23 100644 --- a/scripts/create_data_sample.py +++ b/scripts/create_data_sample.py @@ -8,16 +8,20 @@ from lightning.pytorch.plugins.environments import LightningEnvironment from matplotlib import pyplot as plt import seaborn as sns + # import pandas as pd from segger.data.utils import calculate_gene_celltype_abundance_embedding import scanpy as sc import os + # import Dask.DataFrame as dd -os.environ['DASK_DAEMON'] = 'False' +os.environ["DASK_DAEMON"] = "False" -xenium_data_dir = Path('/omics/odcf/analysis/OE0606_projects/oncolgy_data_exchange/20230831-pan-cns-TMA-Xenium/output-XETG00078__0010722__TMA_AKSI__20230831__151713/') -segger_data_dir = Path('./data_tidy/pyg_datasets/pan_cns_AKSI') +xenium_data_dir = Path( + "/omics/odcf/analysis/OE0606_projects/oncolgy_data_exchange/20230831-pan-cns-TMA-Xenium/output-XETG00078__0010722__TMA_AKSI__20230831__151713/" +) +segger_data_dir = Path("./data_tidy/pyg_datasets/pan_cns_AKSI") # models_dir = Path('./models/bc_embedding_1001') # scRNAseq_path = '/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad' @@ -31,13 +35,11 @@ # gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(scRNAseq, celltype_column) - - # Setup Xenium sample to create dataset -xs = XeniumSample(verbose=False) # , embedding_df=gene_celltype_abundance_embedding) +xs = XeniumSample(verbose=False) # , embedding_df=gene_celltype_abundance_embedding) xs.set_file_paths( - transcripts_path=xenium_data_dir / 'transcripts.parquet', - boundaries_path=xenium_data_dir / 'nucleus_boundaries.parquet', + transcripts_path=xenium_data_dir / "transcripts.parquet", + boundaries_path=xenium_data_dir / "nucleus_boundaries.parquet", ) # dd.read_parquet(transcripts_path[0]) @@ -59,8 +61,7 @@ k_tx=5, val_prob=0.3, test_prob=0.1, - num_workers=6 + num_workers=6, ) except AssertionError as err: - print(f'Dataset already exists at {segger_data_dir}') - + print(f"Dataset already exists at {segger_data_dir}") diff --git a/scripts/predict_model_sample.py b/scripts/predict_model_sample.py index 11c5e89..ef013ae 100644 --- a/scripts/predict_model_sample.py +++ b/scripts/predict_model_sample.py @@ -8,21 +8,22 @@ import dask.dataframe as dd 
import pandas as pd from pathlib import Path -os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' + +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" import cupy as cp from dask.distributed import Client, LocalCluster from dask_cuda import LocalCUDACluster import dask.dataframe as dd -segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_1001') -models_dir = Path('./models/bc_embedding_1001_small') -benchmarks_dir = Path('/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc') -transcripts_file = 'data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1/transcripts.parquet' +segger_data_dir = Path("./data_tidy/pyg_datasets/bc_embedding_1001") +models_dir = Path("./models/bc_embedding_1001_small") +benchmarks_dir = Path("/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc") +transcripts_file = "data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1/transcripts.parquet" # Initialize the Lightning data module dm = SeggerDataModule( data_dir=segger_data_dir, - batch_size=1, - num_workers=1, + batch_size=1, + num_workers=1, ) dm.setup() @@ -31,22 +32,22 @@ model_version = 0 # Load in latest checkpoint -model_path = models_dir / 'lightning_logs' / f'version_{model_version}' -model = load_model(model_path / 'checkpoints') +model_path = models_dir / "lightning_logs" / f"version_{model_version}" +model = load_model(model_path / "checkpoints") -receptive_field = {'k_bd': 4, 'dist_bd': 12,'k_tx': 5, 'dist_tx': 5} +receptive_field = {"k_bd": 4, "dist_bd": 12, "k_tx": 5, "dist_tx": 5} segment( model, dm, save_dir=benchmarks_dir, - seg_tag='segger_embedding_1001_0.5_cc', + seg_tag="segger_embedding_1001_0.5_cc", transcript_file=transcripts_file, - file_format='anndata', - receptive_field = receptive_field, + file_format="anndata", + receptive_field=receptive_field, min_transcripts=5, # max_transcripts=1500, - cell_id_col='segger_cell_id', + cell_id_col="segger_cell_id", use_cc=True, - knn_method='cuda' -) \ No newline at end of file + knn_method="cuda", +) diff --git a/scripts/train_model.py b/scripts/train_model.py index 7c25bc3..d94eda3 100644 --- a/scripts/train_model.py +++ b/scripts/train_model.py @@ -15,18 +15,20 @@ os.environ["PYTORCH_USE_CUDA_DSA"] = "1" os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + def check_and_create_raw_folder(directory): - raw_dir = directory / 'raw' + raw_dir = directory / "raw" if not raw_dir.exists(): raw_dir.mkdir(parents=True, exist_ok=True) - warnings.warn(f"'{raw_dir}' does not exist. Creating this dummy folder because SpatialTranscriptomicsDataset requires it.") + warnings.warn( + f"'{raw_dir}' does not exist. Creating this dummy folder because SpatialTranscriptomicsDataset requires it." 
+ ) + def main(args): # CONFIG - - - - sys.path.insert(0, os.path.abspath('../..')) + + sys.path.insert(0, os.path.abspath("../..")) # Paths TRAIN_DIR = Path(args.train_dir) @@ -47,9 +49,9 @@ def main(args): hidden_channels=args.hidden_channels, out_channels=args.out_channels, heads=args.heads, - num_mid_layers=args.mid_layers # mid_layers is now included + num_mid_layers=args.mid_layers, # mid_layers is now included ) - model = to_hetero(model, (['tx', 'bd'], [('tx', 'belongs', 'bd'), ('tx', 'neighbors', 'tx')]), aggr=args.aggr) + model = to_hetero(model, (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]), aggr=args.aggr) batch = train_ds[0] model.forward(batch.x_dict, batch.edge_index_dict) @@ -73,25 +75,35 @@ def main(args): # Train the model trainer.fit(litsegger, train_loader, val_loader) + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Train the Segger model") - parser.add_argument('--train_dir', type=str, required=True, help='Path to the training data directory') - parser.add_argument('--val_dir', type=str, required=True, help='Path to the validation data directory') - parser.add_argument('--batch_size_train', type=int, default=4, help='Batch size for training') - parser.add_argument('--batch_size_val', type=int, default=4, help='Batch size for validation') - parser.add_argument('--num_tx_tokens', type=int, default=500, help='Number of unique tx tokens for embedding') # num_tx_tokens default 500 - parser.add_argument('--init_emb', type=int, default=8, help='Initial embedding size') - parser.add_argument('--hidden_channels', type=int, default=64, help='Number of hidden channels') - parser.add_argument('--out_channels', type=int, default=16, help='Number of output channels') - parser.add_argument('--heads', type=int, default=4, help='Number of attention heads') - parser.add_argument('--mid_layers', type=int, default=1, help='Number of middle layers in the model') # mid_layers default 1 - parser.add_argument('--aggr', type=str, default='sum', help='Aggregation method') - parser.add_argument('--accelerator', type=str, default='cuda', help='Type of accelerator') - parser.add_argument('--strategy', type=str, default='auto', help='Training strategy') - parser.add_argument('--precision', type=str, default='16-mixed', help='Precision mode') - parser.add_argument('--devices', type=int, default=4, help='Number of devices') - parser.add_argument('--epochs', type=int, default=100, help='Number of epochs') - parser.add_argument('--default_root_dir', type=str, default='./models/pancreas', help='Default root directory for logs and checkpoints') + parser.add_argument("--train_dir", type=str, required=True, help="Path to the training data directory") + parser.add_argument("--val_dir", type=str, required=True, help="Path to the validation data directory") + parser.add_argument("--batch_size_train", type=int, default=4, help="Batch size for training") + parser.add_argument("--batch_size_val", type=int, default=4, help="Batch size for validation") + parser.add_argument( + "--num_tx_tokens", type=int, default=500, help="Number of unique tx tokens for embedding" + ) # num_tx_tokens default 500 + parser.add_argument("--init_emb", type=int, default=8, help="Initial embedding size") + parser.add_argument("--hidden_channels", type=int, default=64, help="Number of hidden channels") + parser.add_argument("--out_channels", type=int, default=16, help="Number of output channels") + parser.add_argument("--heads", type=int, default=4, help="Number of attention heads") + 
parser.add_argument( + "--mid_layers", type=int, default=1, help="Number of middle layers in the model" + ) # mid_layers default 1 + parser.add_argument("--aggr", type=str, default="sum", help="Aggregation method") + parser.add_argument("--accelerator", type=str, default="cuda", help="Type of accelerator") + parser.add_argument("--strategy", type=str, default="auto", help="Training strategy") + parser.add_argument("--precision", type=str, default="16-mixed", help="Precision mode") + parser.add_argument("--devices", type=int, default=4, help="Number of devices") + parser.add_argument("--epochs", type=int, default=100, help="Number of epochs") + parser.add_argument( + "--default_root_dir", + type=str, + default="./models/pancreas", + help="Default root directory for logs and checkpoints", + ) args = parser.parse_args() main(args) diff --git a/scripts/train_model_sample.py b/scripts/train_model_sample.py index ec3611a..8b834cc 100644 --- a/scripts/train_model_sample.py +++ b/scripts/train_model_sample.py @@ -8,19 +8,20 @@ from lightning.pytorch.plugins.environments import LightningEnvironment from matplotlib import pyplot as plt import seaborn as sns + # import pandas as pd from segger.data.utils import calculate_gene_celltype_abundance_embedding import scanpy as sc import os -segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_1001') -models_dir = Path('./models/bc_embedding_1001_small') +segger_data_dir = Path("./data_tidy/pyg_datasets/bc_embedding_1001") +models_dir = Path("./models/bc_embedding_1001_small") dm = SeggerDataModule( data_dir=segger_data_dir, - batch_size=4, - num_workers=2, + batch_size=4, + num_workers=2, ) dm.setup() @@ -33,17 +34,17 @@ out_channels=8, heads=2, num_mid_layers=2, - aggr='sum', + aggr="sum", metadata=metadata, ) # Initialize the Lightning trainer trainer = Trainer( - accelerator='cuda', - strategy='auto', - precision='16-mixed', - devices=4, - max_epochs=200, + accelerator="cuda", + strategy="auto", + precision="16-mixed", + devices=4, + max_epochs=200, default_root_dir=models_dir, logger=CSVLogger(models_dir), ) @@ -52,7 +53,4 @@ ls.forward(batch) -trainer.fit( - model=ls, - datamodule=dm -) \ No newline at end of file +trainer.fit(model=ls, datamodule=dm) diff --git a/src/segger/__init__.py b/src/segger/__init__.py index b186ac0..59dde2b 100644 --- a/src/segger/__init__.py +++ b/src/segger/__init__.py @@ -2,7 +2,8 @@ from .data import * -from .models import * -from .prediction import * -from .training import * -from .validation import * \ No newline at end of file + +# from .models import * +# from .prediction import * +# from .training import * +# from .validation import * diff --git a/src/segger/cli/cli.py b/src/segger/cli/cli.py index 9332d08..18715ee 100644 --- a/src/segger/cli/cli.py +++ b/src/segger/cli/cli.py @@ -3,12 +3,14 @@ from segger.cli.predict import predict import click + # Setup main CLI command @click.group(help="Command line interface for the Segger segmentation package") def segger(): pass + # Add sub-commands to main CLI commands segger.add_command(create_dataset) segger.add_command(train) -segger.add_command(predict) \ No newline at end of file +segger.add_command(predict) diff --git a/src/segger/cli/configs/train/default.yaml b/src/segger/cli/configs/train/default.yaml index b685eac..cf27fc1 100644 --- a/src/segger/cli/configs/train/default.yaml +++ b/src/segger/cli/configs/train/default.yaml @@ -44,7 +44,7 @@ num_workers: help: Number of workers for data loading. 
accelerator: type: str - default: 'cuda' + default: "cuda" help: Device type to use for training (e.g., "cuda", "cpu"). max_epochs: type: int @@ -56,9 +56,9 @@ devices: help: Number of devices (GPUs) to use. strategy: type: str - default: 'auto' + default: "auto" help: Training strategy for the trainer. precision: type: str - default: '16-mixed' + default: "16-mixed" help: Precision for training. diff --git a/src/segger/cli/create_dataset.py b/src/segger/cli/create_dataset.py index b22e1d7..f82e85b 100644 --- a/src/segger/cli/create_dataset.py +++ b/src/segger/cli/create_dataset.py @@ -8,37 +8,56 @@ import time # Path to default YAML configuration file -data_yml = Path(__file__).parent / 'configs' / 'create_dataset' / 'default.yaml' +data_yml = Path(__file__).parent / "configs" / "create_dataset" / "default.yaml" # CLI command to create a Segger dataset help_msg = "Create Segger dataset from spatial transcriptomics data (Xenium or MERSCOPE)" + + @click.command(name="create_dataset", help=help_msg) @add_options(config_path=data_yml) -@click.option('--dataset_dir', type=Path, required=True, help='Directory containing the raw dataset.') -@click.option('--data_dir', type=Path, required=True, help='Directory to save the processed Segger dataset.') -@click.option('--sample_tag', type=str, required=True, help='Sample tag for the dataset.') -@click.option('--transcripts_file', type=str, required=True, help='Name of the transcripts file.') -@click.option('--boundaries_file', type=str, required=True, help='Name of the boundaries file.') -@click.option('--x_size', type=int, default=300, help='Size of each tile in x-direction.') -@click.option('--y_size', type=int, default=300, help='Size of each tile in y-direction.') -@click.option('--d_x', type=int, default=280, help='Tile overlap in x-direction.') -@click.option('--d_y', type=int, default=280, help='Tile overlap in y-direction.') -@click.option('--margin_x', type=int, default=10, help='Margin in x-direction.') -@click.option('--margin_y', type=int, default=10, help='Margin in y-direction.') -@click.option('--r_tx', type=int, default=5, help='Radius for computing neighborhood graph.') -@click.option('--k_tx', type=int, default=5, help='Number of nearest neighbors for the neighborhood graph.') -@click.option('--val_prob', type=float, default=0.1, help='Validation data split proportion.') -@click.option('--test_prob', type=float, default=0.2, help='Test data split proportion.') -@click.option('--neg_sampling_ratio', type=float, default=5, help='Ratio for negative sampling.') -@click.option('--sampling_rate', type=float, default=1, help='Sampling rate for the dataset.') -@click.option('--workers', type=int, default=1, help='Number of workers for parallel processing.') -@click.option('--gpu', is_flag=True, default=False, help='Use GPU if available.') -def create_dataset(args: Namespace, dataset_dir: Path, data_dir: Path, sample_tag: str, - transcripts_file: str, boundaries_file: str, x_size: int, y_size: int, - d_x: int, d_y: int, margin_x: int, margin_y: int, r_tx: int, k_tx: int, - val_prob: float, test_prob: float, neg_sampling_ratio: float, - sampling_rate: float, workers: int, gpu: bool): - +@click.option("--dataset_dir", type=Path, required=True, help="Directory containing the raw dataset.") +@click.option("--data_dir", type=Path, required=True, help="Directory to save the processed Segger dataset.") +@click.option("--sample_tag", type=str, required=True, help="Sample tag for the dataset.") +@click.option("--transcripts_file", type=str, 
required=True, help="Name of the transcripts file.") +@click.option("--boundaries_file", type=str, required=True, help="Name of the boundaries file.") +@click.option("--x_size", type=int, default=300, help="Size of each tile in x-direction.") +@click.option("--y_size", type=int, default=300, help="Size of each tile in y-direction.") +@click.option("--d_x", type=int, default=280, help="Tile overlap in x-direction.") +@click.option("--d_y", type=int, default=280, help="Tile overlap in y-direction.") +@click.option("--margin_x", type=int, default=10, help="Margin in x-direction.") +@click.option("--margin_y", type=int, default=10, help="Margin in y-direction.") +@click.option("--r_tx", type=int, default=5, help="Radius for computing neighborhood graph.") +@click.option("--k_tx", type=int, default=5, help="Number of nearest neighbors for the neighborhood graph.") +@click.option("--val_prob", type=float, default=0.1, help="Validation data split proportion.") +@click.option("--test_prob", type=float, default=0.2, help="Test data split proportion.") +@click.option("--neg_sampling_ratio", type=float, default=5, help="Ratio for negative sampling.") +@click.option("--sampling_rate", type=float, default=1, help="Sampling rate for the dataset.") +@click.option("--workers", type=int, default=1, help="Number of workers for parallel processing.") +@click.option("--gpu", is_flag=True, default=False, help="Use GPU if available.") +def create_dataset( + args: Namespace, + dataset_dir: Path, + data_dir: Path, + sample_tag: str, + transcripts_file: str, + boundaries_file: str, + x_size: int, + y_size: int, + d_x: int, + d_y: int, + margin_x: int, + margin_y: int, + r_tx: int, + k_tx: int, + val_prob: float, + test_prob: float, + neg_sampling_ratio: float, + sampling_rate: float, + workers: int, + gpu: bool, +): + # Setup logging ch = logging.StreamHandler() ch.setLevel(logging.INFO) @@ -47,9 +66,9 @@ def create_dataset(args: Namespace, dataset_dir: Path, data_dir: Path, sample_ta # Initialize the appropriate sample class based on dataset type logging.info("Initializing sample...") - if args.dataset_type == 'xenium': + if args.dataset_type == "xenium": sample = XeniumSample() - elif args.dataset_type == 'merscope': + elif args.dataset_type == "merscope": sample = MerscopeSample() else: raise ValueError("Unsupported dataset type. 
Please choose 'xenium' or 'merscope'.") diff --git a/src/segger/cli/create_dataset_fast.py b/src/segger/cli/create_dataset_fast.py index 8e6e9ee..33a3a63 100644 --- a/src/segger/cli/create_dataset_fast.py +++ b/src/segger/cli/create_dataset_fast.py @@ -9,29 +9,42 @@ import time # Path to default YAML configuration file -data_yml = Path(__file__).parent / 'configs' / 'create_dataset' / 'default_fast.yaml' +data_yml = Path(__file__).parent / "configs" / "create_dataset" / "default_fast.yaml" # CLI command to create a Segger dataset help_msg = "Create Segger dataset from spatial transcriptomics data (Xenium or MERSCOPE)" + + @click.command(name="create_dataset", help=help_msg) @add_options(config_path=data_yml) -@click.option('--base_dir', type=Path, required=True, help='Directory containing the raw dataset.') -@click.option('--data_dir', type=Path, required=True, help='Directory to save the processed Segger dataset.') -@click.option('--sample_type', type=str, default=None, help='The sample type of the raw data, e.g., "xenium" or "merscope".') -@click.option('--k_bd', type=int, default=3, help='Number of nearest neighbors for boundary nodes.') -@click.option('--dist_bd', type=float, default=15., help='Maximum distance for boundary neighbors.') -@click.option('--k_tx', type=int, default=3, help='Number of nearest neighbors for transcript nodes.') -@click.option('--dist_tx', type=float, default=5., help='Maximum distance for transcript neighbors.') -@click.option('--tile_size', type=int, default=None, help='If provided, specifies the size of the tile. Overrides `tile_width` and `tile_height`.') -@click.option('--tile_width', type=int, default=None, help='Width of the tiles in pixels. Ignored if `tile_size` is provided.') -@click.option('--tile_height', type=int, default=None, help='Height of the tiles in pixels. Ignored if `tile_size` is provided.') -@click.option('--neg_sampling_ratio', type=float, default=5., help='Ratio of negative samples.') -@click.option('--frac', type=float, default=1., help='Fraction of the dataset to process.') -@click.option('--val_prob', type=float, default=0.1, help='Proportion of data for use for validation split.') -@click.option('--test_prob', type=float, default=0.2, help='Proportion of data for use for test split.') -@click.option('--n_workers', type=int, default=1, help='Number of workers for parallel processing.') +@click.option("--base_dir", type=Path, required=True, help="Directory containing the raw dataset.") +@click.option("--data_dir", type=Path, required=True, help="Directory to save the processed Segger dataset.") +@click.option( + "--sample_type", type=str, default=None, help='The sample type of the raw data, e.g., "xenium" or "merscope".' +) +@click.option("--k_bd", type=int, default=3, help="Number of nearest neighbors for boundary nodes.") +@click.option("--dist_bd", type=float, default=15.0, help="Maximum distance for boundary neighbors.") +@click.option("--k_tx", type=int, default=3, help="Number of nearest neighbors for transcript nodes.") +@click.option("--dist_tx", type=float, default=5.0, help="Maximum distance for transcript neighbors.") +@click.option( + "--tile_size", + type=int, + default=None, + help="If provided, specifies the size of the tile. Overrides `tile_width` and `tile_height`.", +) +@click.option( + "--tile_width", type=int, default=None, help="Width of the tiles in pixels. Ignored if `tile_size` is provided." +) +@click.option( + "--tile_height", type=int, default=None, help="Height of the tiles in pixels. 
Ignored if `tile_size` is provided." +) +@click.option("--neg_sampling_ratio", type=float, default=5.0, help="Ratio of negative samples.") +@click.option("--frac", type=float, default=1.0, help="Fraction of the dataset to process.") +@click.option("--val_prob", type=float, default=0.1, help="Proportion of data for use for validation split.") +@click.option("--test_prob", type=float, default=0.2, help="Proportion of data for use for test split.") +@click.option("--n_workers", type=int, default=1, help="Number of workers for parallel processing.") def create_dataset(args: Namespace): - + # Setup logging ch = logging.StreamHandler() ch.setLevel(logging.INFO) @@ -67,5 +80,6 @@ def create_dataset(args: Namespace): logging.info(f"Time to save dataset: {end_time - start_time} seconds") logging.info("Dataset saved successfully.") -if __name__ == '__main__': - create_dataset() \ No newline at end of file + +if __name__ == "__main__": + create_dataset() diff --git a/src/segger/cli/predict.py b/src/segger/cli/predict.py index eca5a4b..2cfe83e 100644 --- a/src/segger/cli/predict.py +++ b/src/segger/cli/predict.py @@ -5,34 +5,47 @@ import logging import os -os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" + @click.command(name="run_segmentation", help="Run the Segger segmentation model.") -@click.option('--segger_data_dir', type=Path, required=True, help='Directory containing the processed Segger dataset.') -@click.option('--models_dir', type=Path, required=True, help='Directory containing the trained models.') -@click.option('--benchmarks_dir', type=Path, required=True, help='Directory to save the segmentation results.') -@click.option('--transcripts_file', type=str, required=True, help='Path to the transcripts file.') -@click.option('--batch_size', type=int, default=1, help='Batch size for processing.') -@click.option('--num_workers', type=int, default=1, help='Number of workers for data loading.') -@click.option('--model_version', type=int, default=0, help='Model version to load.') -@click.option('--save_tag', type=str, default='segger_embedding_1001_0.5', help='Tag for saving segmentation results.') -@click.option('--min_transcripts', type=int, default=5, help='Minimum number of transcripts for segmentation.') -@click.option('--cell_id_col', type=str, default='segger_cell_id', help='Column name for cell IDs.') -@click.option('--use_cc', is_flag=True, default=False, help='Use connected components if specified.') -@click.option('--knn_method', type=str, default='cuda', help='Method for KNN computation.') -@click.option('--file_format', type=str, default='anndata', help='File format for output data.') -@click.option('--k_bd', type=int, default=4, help='K value for boundary computation.') -@click.option('--dist_bd', type=int, default=12, help='Distance for boundary computation.') -@click.option('--k_tx', type=int, default=5, help='K value for transcript computation.') -@click.option('--dist_tx', type=int, default=5, help='Distance for transcript computation.') -def run_segmentation(segger_data_dir: Path, models_dir: Path, benchmarks_dir: Path, - transcripts_file: str, batch_size: int = 1, num_workers: int = 1, - model_version: int = 0, save_tag: str = 'segger_embedding_1001_0.5', - min_transcripts: int = 5, cell_id_col: str = 'segger_cell_id', - use_cc: bool = False, knn_method: str = 'cuda', - file_format: str = 'anndata', k_bd: int = 4, dist_bd: int = 12, - k_tx: int = 5, dist_tx: int = 5): - 
+@click.option("--segger_data_dir", type=Path, required=True, help="Directory containing the processed Segger dataset.") +@click.option("--models_dir", type=Path, required=True, help="Directory containing the trained models.") +@click.option("--benchmarks_dir", type=Path, required=True, help="Directory to save the segmentation results.") +@click.option("--transcripts_file", type=str, required=True, help="Path to the transcripts file.") +@click.option("--batch_size", type=int, default=1, help="Batch size for processing.") +@click.option("--num_workers", type=int, default=1, help="Number of workers for data loading.") +@click.option("--model_version", type=int, default=0, help="Model version to load.") +@click.option("--save_tag", type=str, default="segger_embedding_1001_0.5", help="Tag for saving segmentation results.") +@click.option("--min_transcripts", type=int, default=5, help="Minimum number of transcripts for segmentation.") +@click.option("--cell_id_col", type=str, default="segger_cell_id", help="Column name for cell IDs.") +@click.option("--use_cc", is_flag=True, default=False, help="Use connected components if specified.") +@click.option("--knn_method", type=str, default="cuda", help="Method for KNN computation.") +@click.option("--file_format", type=str, default="anndata", help="File format for output data.") +@click.option("--k_bd", type=int, default=4, help="K value for boundary computation.") +@click.option("--dist_bd", type=int, default=12, help="Distance for boundary computation.") +@click.option("--k_tx", type=int, default=5, help="K value for transcript computation.") +@click.option("--dist_tx", type=int, default=5, help="Distance for transcript computation.") +def run_segmentation( + segger_data_dir: Path, + models_dir: Path, + benchmarks_dir: Path, + transcripts_file: str, + batch_size: int = 1, + num_workers: int = 1, + model_version: int = 0, + save_tag: str = "segger_embedding_1001_0.5", + min_transcripts: int = 5, + cell_id_col: str = "segger_cell_id", + use_cc: bool = False, + knn_method: str = "cuda", + file_format: str = "anndata", + k_bd: int = 4, + dist_bd: int = 12, + k_tx: int = 5, + dist_tx: int = 5, +): + # Setup logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -41,16 +54,16 @@ def run_segmentation(segger_data_dir: Path, models_dir: Path, benchmarks_dir: Pa # Initialize the Lightning data module dm = SeggerDataModule( data_dir=segger_data_dir, - batch_size=batch_size, - num_workers=num_workers, + batch_size=batch_size, + num_workers=num_workers, ) - + dm.setup() - + logger.info("Loading the model...") # Load in the latest checkpoint - model_path = models_dir / 'lightning_logs' / f'version_{model_version}' - model = load_model(model_path / 'checkpoints') + model_path = models_dir / "lightning_logs" / f"version_{model_version}" + model = load_model(model_path / "checkpoints") logger.info("Running segmentation...") segment( @@ -59,15 +72,16 @@ def run_segmentation(segger_data_dir: Path, models_dir: Path, benchmarks_dir: Pa save_dir=benchmarks_dir, seg_tag=save_tag, transcript_file=transcripts_file, - file_format=file_format, - receptive_field={'k_bd': k_bd, 'dist_bd': dist_bd, 'k_tx': k_tx, 'dist_tx': dist_tx}, + file_format=file_format, + receptive_field={"k_bd": k_bd, "dist_bd": dist_bd, "k_tx": k_tx, "dist_tx": dist_tx}, min_transcripts=min_transcripts, cell_id_col=cell_id_col, use_cc=use_cc, knn_method=knn_method, ) - + logger.info("Segmentation completed.") -if __name__ == '__main__': + +if __name__ == "__main__": 
run_segmentation() diff --git a/src/segger/cli/train_model.py b/src/segger/cli/train_model.py index 78bd5d7..a3a23a7 100644 --- a/src/segger/cli/train_model.py +++ b/src/segger/cli/train_model.py @@ -7,27 +7,33 @@ from argparse import Namespace # Path to default YAML configuration file -train_yml = Path(__file__).parent / 'configs' / 'train' / 'default.yaml' +train_yml = Path(__file__).parent / "configs" / "train" / "default.yaml" help_msg = "Train the Segger segmentation model." + + @click.command(name="train_model", help=help_msg) @add_options(config_path=train_yml) -@click.option('--dataset_dir', type=Path, required=True, help='Directory containing the processed Segger dataset.') -@click.option('--models_dir', type=Path, required=True, help='Directory to save the trained model and the training logs.') -@click.option('--sample_tag', type=str, required=True, help='Sample tag for the dataset.') -@click.option('--init_emb', type=int, default=8, help='Size of the embedding layer.') -@click.option('--hidden_channels', type=int, default=32, help='Size of hidden channels in the model.') -@click.option('--num_tx_tokens', type=int, default=500, help='Number of transcript tokens.') -@click.option('--out_channels', type=int, default=8, help='Number of output channels.') -@click.option('--heads', type=int, default=2, help='Number of attention heads.') -@click.option('--num_mid_layers', type=int, default=2, help='Number of mid layers in the model.') -@click.option('--batch_size', type=int, default=4, help='Batch size for training.') -@click.option('--num_workers', type=int, default=2, help='Number of workers for data loading.') -@click.option('--accelerator', type=str, default='cuda', help='Device type to use for training (e.g., "cuda", "cpu").') # Ask for accelerator -@click.option('--max_epochs', type=int, default=200, help='Number of epochs for training.') -@click.option('--devices', type=int, default=4, help='Number of devices (GPUs) to use.') -@click.option('--strategy', type=str, default='auto', help='Training strategy for the trainer.') -@click.option('--precision', type=str, default='16-mixed', help='Precision for training.') +@click.option("--dataset_dir", type=Path, required=True, help="Directory containing the processed Segger dataset.") +@click.option( + "--models_dir", type=Path, required=True, help="Directory to save the trained model and the training logs." +) +@click.option("--sample_tag", type=str, required=True, help="Sample tag for the dataset.") +@click.option("--init_emb", type=int, default=8, help="Size of the embedding layer.") +@click.option("--hidden_channels", type=int, default=32, help="Size of hidden channels in the model.") +@click.option("--num_tx_tokens", type=int, default=500, help="Number of transcript tokens.") +@click.option("--out_channels", type=int, default=8, help="Number of output channels.") +@click.option("--heads", type=int, default=2, help="Number of attention heads.") +@click.option("--num_mid_layers", type=int, default=2, help="Number of mid layers in the model.") +@click.option("--batch_size", type=int, default=4, help="Batch size for training.") +@click.option("--num_workers", type=int, default=2, help="Number of workers for data loading.") +@click.option( + "--accelerator", type=str, default="cuda", help='Device type to use for training (e.g., "cuda", "cpu").' 
+) # Ask for accelerator +@click.option("--max_epochs", type=int, default=200, help="Number of epochs for training.") +@click.option("--devices", type=int, default=4, help="Number of devices (GPUs) to use.") +@click.option("--strategy", type=str, default="auto", help="Training strategy for the trainer.") +@click.option("--precision", type=str, default="16-mixed", help="Precision for training.") def train_model(args: Namespace): # Setup logging @@ -43,6 +49,7 @@ def train_model(args: Namespace): from segger.training.segger_data_module import SeggerDataModule from lightning.pytorch.loggers import CSVLogger from pytorch_lightning import Trainer + logging.info("Done.") # Load datasets @@ -66,7 +73,7 @@ def train_model(args: Namespace): out_channels=args.out_channels, # Hard-coded value heads=args.heads, # Hard-coded value num_mid_layers=args.num_mid_layers, # Hard-coded value - aggr='sum', # Hard-coded value + aggr="sum", # Hard-coded value metadata=metadata, ) @@ -80,15 +87,12 @@ def train_model(args: Namespace): default_root_dir=args.models_dir, logger=CSVLogger(args.models_dir), ) - + logging.info("Done.") # Train model logging.info("Training model...") - trainer.fit( - model=ls, - datamodule=dm - ) + trainer.fit(model=ls, datamodule=dm) logging.info("Done.") @@ -97,11 +101,13 @@ def train_model(args: Namespace): def train_slurm(args): train_model(args) + @click.group(help="Train the Segger model") def train(): pass + train.add_command(train_slurm) -if __name__ == '__main__': - train_model() \ No newline at end of file +if __name__ == "__main__": + train_model() diff --git a/src/segger/cli/utils.py b/src/segger/cli/utils.py index 2a38610..df6e816 100644 --- a/src/segger/cli/utils.py +++ b/src/segger/cli/utils.py @@ -12,11 +12,11 @@ def add_options( show_default: bool = True, ): """ - A decorator to add command-line options to a Click command from a YAML + A decorator to add command-line options to a Click command from a YAML configuration file. Parameters: - config_path (os.PathLike): The path to the YAML configuration file + config_path (os.PathLike): The path to the YAML configuration file containing the options. show_default (bool): Whether to show default values in help. @@ -26,7 +26,7 @@ def add_options( The YAML configuration file should have the following format: ``` option_name: - type: "type_name" # Optional, the type of the option + type: "type_name" # Optional, the type of the option (e.g., "str", "int") help: "description" # Optional, the help text for the option default: value # Optional, the default value for the option @@ -52,24 +52,23 @@ def greet(args): click.echo(f"Hello, {args.name}! 
You are {args.age} years old.") ``` """ - def decorator( - function: typing.Callable - ): + + def decorator(function: typing.Callable): # Wrap the original function to convert kwargs to a Namespace object def wrapper(**kwargs): args_namespace = Namespace(**kwargs) return function(args_namespace) - + # Load the YAML configuration file - with open(config_path, 'r') as file: + with open(config_path, "r") as file: config = yaml.safe_load(file.read()) # Decorate function with all options for name, kwargs in reversed(config.items()): - kwargs['show_default'] = show_default - if 'type' in kwargs: - kwargs['type'] = locate(kwargs['type']) - wrapper = click.option(f'--{name}', **kwargs)(wrapper) + kwargs["show_default"] = show_default + if "type" in kwargs: + kwargs["type"] = locate(kwargs["type"]) + wrapper = click.option(f"--{name}", **kwargs)(wrapper) return wrapper @@ -87,31 +86,32 @@ class CustomFormatter(logging.Formatter): bold_red (str): ANSI escape code for bold red color. reset (str): ANSI escape code to reset color. format (str): The format string for log messages. - FORMATS (dict): A dictionary mapping log levels to their respective + FORMATS (dict): A dictionary mapping log levels to their respective color-coded format strings. Methods: format(record): - Format the specified record as text, applying color codes based on the + Format the specified record as text, applying color codes based on the log level. """ + grey = "\x1b[38;20m" green = "\x1b[32;20m" yellow = "\x1b[33;20m" red = "\x1b[31;20m" bold_red = "\x1b[31;1m" reset = "\x1b[0m" - format='%(asctime)s %(levelname)s: %(message)s' + format = "%(asctime)s %(levelname)s: %(message)s" FORMATS = { logging.DEBUG: grey + format + reset, logging.INFO: green + format + reset, logging.WARNING: yellow + format + reset, logging.ERROR: red + format + reset, - logging.CRITICAL: bold_red + format + reset + logging.CRITICAL: bold_red + format + reset, } def format(self, record): log_fmt = self.FORMATS.get(record.levelno) formatter = logging.Formatter(log_fmt) - return formatter.format(record) \ No newline at end of file + return formatter.format(record) diff --git a/src/segger/data/README.md b/src/segger/data/README.md index df6e979..28d7df0 100644 --- a/src/segger/data/README.md +++ b/src/segger/data/README.md @@ -1,6 +1,6 @@ # segger - Data Preparation for Cell Segmentation -The `segger` package provides a comprehensive data preparation module for handling and processing spatial transcriptomics data, specifically designed to support **Xenium** and **Merscope** datasets. This module facilitates the creation of datasets for cell segmentation and subsequent graph-based deep learning tasks by leveraging scalable and efficient processing tools. +The `segger` package provides a comprehensive data preparation module for handling and processing spatial transcriptomics data, specifically designed to support **Xenium** and **Merscope** datasets. This module facilitates the creation of datasets for cell segmentation and subsequent graph-based deep learning tasks by leveraging scalable and efficient processing tools. ## Module Overview @@ -48,7 +48,6 @@ These classes inherit from `SpatialTranscriptomicsSample` and implement dataset- - **`XeniumSample`**: Tailored for **Xenium** datasets, it includes specific filtering rules to exclude unwanted transcripts based on naming patterns (e.g., `NegControlProbe_`, `BLANK_`). - **`MerscopeSample`**: Designed for **Merscope** datasets, allowing for custom filtering and processing logic as needed. 
- ## Workflow The dataset creation and processing workflow involves several key steps, each ensuring that the spatial transcriptomics data is appropriately prepared for downstream machine learning tasks. @@ -61,39 +60,42 @@ The dataset creation and processing workflow involves several key steps, each en ### Step 2: Tiling - **Spatial Segmentation**: The dataset is divided into smaller, manageable tiles of size $$x_{\text{size}} \times y_{\text{size}}$$, defined by their top-left corner coordinates $$(x_i, y_j)$$. - + $$ n_x = \left\lfloor \frac{x_{\text{max}} - x_{\text{min}}}{d_x} \right\rfloor, \quad n_y = \left\lfloor \frac{y_{\text{max}} - y_{\text{min}}}{d_y} \right\rfloor $$ - - Where: - - $$x_{\text{min}}, y_{\text{min}}$$: Minimum spatial coordinates. - - $$x_{\text{max}}, y_{\text{max}}$$: Maximum spatial coordinates. - - $$d_x, d_y$$: Step sizes along the $$x$$- and $$y$$-axes, respectively. + +Where: + +- $$x_{\text{min}}, y_{\text{min}}$$: Minimum spatial coordinates. +- $$x_{\text{max}}, y_{\text{max}}$$: Maximum spatial coordinates. +- $$d_x, d_y$$: Step sizes along the $$x$$- and $$y$$-axes, respectively. - **Transcript and Boundary Inclusion**: For each tile, transcripts and boundaries within the spatial bounds (with optional margins) are included: - -$$ -x_i - \text{margin}_x \leq x_t < x_i + x_{\text{size}} + \text{margin}_x, \quad y_j - \text{margin}_y \leq y_t < y_j + y_{\text{size}} + \text{margin}_y + +$$ +x_i - \text{margin}_x \leq x_t < x_i + x_{\text{size}} + \text{margin}_x, \quad y_j - \text{margin}_y \leq y_t < y_j + y_{\text{size}} + \text{margin}_y $$ - - Where: - - $$x_t, y_t$$: Transcript coordinates. - - $$\text{margin}_x, \text{margin}_y$$: Optional margins to include contextual data. + +Where: + +- $$x_t, y_t$$: Transcript coordinates. +- $$\text{margin}_x, \text{margin}_y$$: Optional margins to include contextual data. ### Step 3: Graph Construction For each tile, a graph $$G$$ is constructed with: - **Nodes ($$V$$)**: + - **Transcripts**: Represented by their spatial coordinates $$(x_t, y_t)$$ and feature vectors $$\mathbf{f}_t$$. - **Boundaries**: Represented by centroid coordinates $$(x_b, y_b)$$ and associated properties (e.g., area). - **Edges ($$E$$)**: - Created based on spatial proximity using methods like KD-Tree or FAISS. - Defined by a distance threshold $$d$$ and the number of nearest neighbors $$k$$: - -$$ + +$$ E = \{ (v_i, v_j) \mid \text{dist}(v_i, v_j) < d, \, v_i \in V, \, v_j \in V \} $$ @@ -102,7 +104,7 @@ $$ If enabled, edges can be labeled based on relationships, such as whether a transcript belongs to a boundary: $$ -\text{label}(t, b) = +\text{label}(t, b) = \begin{cases} 1 & \text{if } t \text{ belongs to } b \\ 0 & \text{otherwise} @@ -123,7 +125,6 @@ Each tile is randomly assigned to one of these sets according to the specified p The final output consists of a set of tiles, each containing a graph representation of the spatial transcriptomics data. These tiles are stored in designated directories (`train_tiles`, `val_tiles`, `test_tiles`) and are ready for integration into machine learning pipelines. - ## Example Usage Below are examples demonstrating how to utilize the `segger` data preparation module for both Xenium and Merscope datasets. 
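For reference, here is a minimal sketch of the tiling arithmetic described in the workflow section above. It is illustrative only: the extent, tile size, step sizes, margins, and column names below are invented and are not taken from the patch.

```python
# Illustrative tiling sketch; bounds, sizes, steps, margins, and columns are hypothetical.
import pandas as pd

x_min, x_max, y_min, y_max = 0.0, 1000.0, 0.0, 800.0  # dataset extent (made up)
x_size, y_size = 300, 300                              # tile dimensions
d_x, d_y = 280, 280                                    # step sizes along x and y
margin_x, margin_y = 10, 10                            # optional context margins

# Number of tile origins along each axis: n = floor((max - min) / step)
n_x = int((x_max - x_min) // d_x)
n_y = int((y_max - y_min) // d_y)
tile_origins = [(x_min + i * d_x, y_min + j * d_y) for i in range(n_x) for j in range(n_y)]

# A transcript belongs to a tile if it falls inside the tile bounds plus margins
transcripts = pd.DataFrame(
    {"x_location": [5.0, 150.0, 450.0], "y_location": [5.0, 290.0, 100.0]}
)
x_i, y_j = tile_origins[0]
in_tile = (
    (transcripts["x_location"] >= x_i - margin_x)
    & (transcripts["x_location"] < x_i + x_size + margin_x)
    & (transcripts["y_location"] >= y_j - margin_y)
    & (transcripts["y_location"] < y_j + y_size + margin_y)
)
tile_transcripts = transcripts[in_tile]
```

Because the step sizes are smaller than the tile dimensions, neighboring tiles overlap, so a transcript near a tile edge can appear in more than one tile.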
@@ -137,25 +138,29 @@ from segger.data.utils import calculate_gene_celltype_abundance_embedding import scanpy as sc import os -xenium_data_dir = Path('./data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1') -segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_0919') -models_dir = Path('./models/bc_embedding_0919') +xenium_data_dir = Path("./data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1") +segger_data_dir = Path("./data_tidy/pyg_datasets/bc_embedding_0919") +models_dir = Path("./models/bc_embedding_0919") -scRNAseq_path = '/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad' +scRNAseq_path = ( + "/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad" +) scRNAseq = sc.read(scRNAseq_path) sc.pp.subsample(scRNAseq, 0.1) # Step 1: Calculate the gene cell type abundance embedding -celltype_column = 'celltype_minor' -gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(scRNAseq, celltype_column) +celltype_column = "celltype_minor" +gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding( + scRNAseq, celltype_column +) # Setup Xenium sample to create dataset -xs = XeniumSample(verbose=False , embedding_df=gene_celltype_abundance_embedding) +xs = XeniumSample(verbose=False, embedding_df=gene_celltype_abundance_embedding) xs.set_file_paths( - transcripts_path=xenium_data_dir / 'transcripts.parquet', - boundaries_path=xenium_data_dir / 'nucleus_boundaries.parquet', + transcripts_path=xenium_data_dir / "transcripts.parquet", + boundaries_path=xenium_data_dir / "nucleus_boundaries.parquet", ) xs.set_metadata() @@ -164,7 +169,7 @@ xenium_sample.set_embedding("cell_type_abundance") # Load nuclei data to define boundaries nuclei_path = raw_data_dir / sample_tag / "nucleus_boundaries.parquet" -xenium_sample.load_boundaries(path=nuclei_path, file_format='parquet') +xenium_sample.load_boundaries(path=nuclei_path, file_format="parquet") # Build PyTorch Geometric (PyG) data from a tile of the dataset tile_pyg_data = xenium_sample.build_pyg_data_from_tile( @@ -173,7 +178,7 @@ tile_pyg_data = xenium_sample.build_pyg_data_from_tile( r_tx=20, k_tx=20, use_precomputed=False, - workers=1 + workers=1, ) @@ -191,10 +196,10 @@ try: k_tx=10, val_prob=0.4, test_prob=0.1, - num_workers=6 + num_workers=6, ) except AssertionError as err: - print(f'Dataset already exists at {segger_data_dir}') + print(f"Dataset already exists at {segger_data_dir}") ``` ### Merscope Data @@ -204,8 +209,8 @@ from segger.data import MerscopeSample from pathlib import Path # Set up the file paths -raw_data_dir = Path('data_raw/merscope/') -processed_data_dir = Path('data_tidy/pyg_datasets') +raw_data_dir = Path("data_raw/merscope/") +processed_data_dir = Path("data_tidy/pyg_datasets") sample_tag = "Merscope_Sample_1" # Create a MerscopeSample instance for spatial transcriptomics processing @@ -215,16 +220,18 @@ merscope_sample = MerscopeSample() merscope_sample.load_transcripts( base_path=raw_data_dir, sample=sample_tag, - transcripts_filename='transcripts.csv', - file_format='csv' + transcripts_filename="transcripts.csv", + file_format="csv", ) # Optionally load cell boundaries cell_boundaries_path = raw_data_dir / sample_tag / "cell_boundaries.parquet" -merscope_sample.load_boundaries(path=cell_boundaries_path, file_format='parquet') +merscope_sample.load_boundaries(path=cell_boundaries_path, file_format="parquet") # Filter transcripts based on specific criteria -filtered_transcripts = 
merscope_sample.filter_transcripts(merscope_sample.transcripts_df) +filtered_transcripts = merscope_sample.filter_transcripts( + merscope_sample.transcripts_df +) # Build PyTorch Geometric (PyG) data from a tile of the dataset tile_pyg_data = merscope_sample.build_pyg_data_from_tile( @@ -233,12 +240,12 @@ tile_pyg_data = merscope_sample.build_pyg_data_from_tile( r_tx=15, k_tx=15, use_precomputed=True, - workers=2 + workers=2, ) # Save dataset in processed format for segmentation merscope_sample.save_dataset_for_segger( - processed_dir=processed_data_dir / 'embedding', + processed_dir=processed_data_dir / "embedding", x_size=360, y_size=360, d_x=180, @@ -252,6 +259,6 @@ merscope_sample.save_dataset_for_segger( test_prob=0.2, neg_sampling_ratio_approx=3, sampling_rate=1, - num_workers=2 + num_workers=2, ) ``` diff --git a/src/segger/data/__init__.py b/src/segger/data/__init__.py index 1d60059..fee4f79 100644 --- a/src/segger/data/__init__.py +++ b/src/segger/data/__init__.py @@ -5,35 +5,30 @@ """ __all__ = [ - "XeniumSample", - "MerscopeSample", - "SpatialTranscriptomicsDataset", - "filter_transcripts", - "create_anndata", - "compute_transcript_metrics", + "XeniumSample", + "MerscopeSample", + "SpatialTranscriptomicsDataset", + "filter_transcripts", + "create_anndata", + "compute_transcript_metrics", "SpatialTranscriptomicsSample", "calculate_gene_celltype_abundance_embedding", "get_edge_index", ] -from .utils import ( - filter_transcripts, - create_anndata, - compute_transcript_metrics, - get_edge_index, +from segger.data.utils import ( + filter_transcripts, + create_anndata, + compute_transcript_metrics, + get_edge_index, calculate_gene_celltype_abundance_embedding, - SpatialTranscriptomicsDataset + SpatialTranscriptomicsDataset, ) -from .io import ( - XeniumSample, - MerscopeSample, +from segger.data.io import ( + XeniumSample, + MerscopeSample, SpatialTranscriptomicsSample, ) -from .constants import ( - SpatialTranscriptomicsKeys, - XeniumKeys, - MerscopeKeys -) - +from segger.data.constants import SpatialTranscriptomicsKeys, XeniumKeys, MerscopeKeys diff --git a/src/segger/data/constants.py b/src/segger/data/constants.py index 7cd1fb6..b48350f 100644 --- a/src/segger/data/constants.py +++ b/src/segger/data/constants.py @@ -1,5 +1,6 @@ from enum import Enum, auto + class SpatialTranscriptomicsKeys(Enum): """Unified keys for spatial transcriptomics data, supporting multiple platforms.""" @@ -7,11 +8,11 @@ class SpatialTranscriptomicsKeys(Enum): TRANSCRIPTS_FILE = auto() BOUNDARIES_FILE = auto() CELL_METADATA_FILE = auto() - + # Cell identifiers CELL_ID = auto() TRANSCRIPTS_ID = auto() - + # Coordinates and locations TRANSCRIPTS_X = auto() TRANSCRIPTS_Y = auto() @@ -19,7 +20,7 @@ class SpatialTranscriptomicsKeys(Enum): BOUNDARIES_VERTEX_Y = auto() GLOBAL_X = auto() GLOBAL_Y = auto() - + # Metadata METADATA_CELL_KEY = auto() COUNTS_CELL_KEY = auto() diff --git a/src/segger/data/io.py b/src/segger/data/io.py index a369b9f..1441227 100644 --- a/src/segger/data/io.py +++ b/src/segger/data/io.py @@ -30,11 +30,8 @@ import logging import warnings -for msg in [ - r".*Geometry is in a geographic CRS.*", - r".*You did not provide metadata.*" -]: - warnings.filterwarnings('ignore', category=UserWarning, message=msg) +for msg in [r".*Geometry is in a geographic CRS.*", r".*You did not provide metadata.*"]: + warnings.filterwarnings("ignore", category=UserWarning, message=msg) class SpatialTranscriptomicsSample(ABC): @@ -60,10 +57,8 @@ def __init__( self.boundaries_graph = boundaries_graph self.keys = 
keys self.embedding_df = embedding_df - self.current_embedding = 'token' + self.current_embedding = "token" self.verbose = verbose - - @abstractmethod def filter_transcripts(self, transcripts_df: pd.DataFrame, min_qv: float = 20.0) -> pd.DataFrame: @@ -78,8 +73,7 @@ def filter_transcripts(self, transcripts_df: pd.DataFrame, min_qv: float = 20.0) pd.DataFrame: The filtered dataframe. """ pass - - + def set_file_paths(self, transcripts_path: Path, boundaries_path: Path) -> None: """ Set the paths for the transcript and boundary files. @@ -90,10 +84,11 @@ def set_file_paths(self, transcripts_path: Path, boundaries_path: Path) -> None: """ self.transcripts_path = transcripts_path self.boundaries_path = boundaries_path - - if self.verbose: print(f"Set transcripts file path to {transcripts_path}") - if self.verbose: print(f"Set boundaries file path to {boundaries_path}") + if self.verbose: + print(f"Set transcripts file path to {transcripts_path}") + if self.verbose: + print(f"Set boundaries file path to {boundaries_path}") def load_transcripts( self, @@ -153,22 +148,22 @@ def load_transcripts( self.keys.TRANSCRIPTS_X.value, self.keys.TRANSCRIPTS_Y.value, self.keys.FEATURE_NAME.value, - self.keys.CELL_ID.value + self.keys.CELL_ID.value, ] # Check if the QUALITY_VALUE key exists in the dataset, and add it to the columns list if present if self.keys.QUALITY_VALUE.value in available_columns: columns_to_read.append(self.keys.QUALITY_VALUE.value) - + if self.keys.OVERLAPS_BOUNDARY.value in available_columns: columns_to_read.append(self.keys.OVERLAPS_BOUNDARY.value) # Use filters to only load data within the specified bounding box (x_min, x_max, y_min, y_max) filters = [ - (self.keys.TRANSCRIPTS_X.value, '>=', x_min), - (self.keys.TRANSCRIPTS_X.value, '<=', x_max), - (self.keys.TRANSCRIPTS_Y.value, '>=', y_min), - (self.keys.TRANSCRIPTS_Y.value, '<=', y_max) + (self.keys.TRANSCRIPTS_X.value, ">=", x_min), + (self.keys.TRANSCRIPTS_X.value, "<=", x_max), + (self.keys.TRANSCRIPTS_Y.value, ">=", y_min), + (self.keys.TRANSCRIPTS_Y.value, "<=", y_max), ] # Load the dataset lazily with filters applied for the bounding box @@ -190,32 +185,32 @@ def load_transcripts( transcripts_df = self.filter_transcripts(transcripts_df) # Handle additional embeddings if provided - if not self.embedding_df.empty: + if self.embedding_df is not None and not self.embedding_df.empty: valid_genes = self.embedding_df.index # Lazily count the number of rows in the DataFrame before filtering initial_count = delayed(lambda df: df.shape[0])(transcripts_df) # Filter the DataFrame lazily based on valid genes from embeddings - transcripts_df = transcripts_df[ - transcripts_df[self.keys.FEATURE_NAME.value].isin(valid_genes) - ] + transcripts_df = transcripts_df[transcripts_df[self.keys.FEATURE_NAME.value].isin(valid_genes)] final_count = delayed(lambda df: df.shape[0])(transcripts_df) - if self.verbose: print(f"Dropped {initial_count - final_count} transcripts not found in {key} embedding.") + if self.verbose: + print(f"Dropped {initial_count - final_count} transcripts not found in {key} embedding.") # Ensure that the 'OVERLAPS_BOUNDARY' column is boolean if it exists if self.keys.OVERLAPS_BOUNDARY.value in transcripts_df.columns: - transcripts_df[self.keys.OVERLAPS_BOUNDARY.value] = transcripts_df[self.keys.OVERLAPS_BOUNDARY.value].astype(bool) + transcripts_df[self.keys.OVERLAPS_BOUNDARY.value] = transcripts_df[ + self.keys.OVERLAPS_BOUNDARY.value + ].astype(bool) return transcripts_df - def load_boundaries( - self, - path: Path, - 
file_format: str = "parquet", - x_min: float = None, - x_max: float = None, - y_min: float = None, - y_max: float = None + self, + path: Path, + file_format: str = "parquet", + x_min: float = None, + x_max: float = None, + y_min: float = None, + y_max: float = None, ) -> dd.DataFrame: """ Load boundaries data lazily using Dask, filtering by the specified bounding box. @@ -233,7 +228,7 @@ def load_boundaries( """ if file_format != "parquet": raise ValueError(f"Unsupported file format: {file_format}") - + self.boundaries_path = path # Use bounding box values from set_metadata if not explicitly provided @@ -246,15 +241,15 @@ def load_boundaries( columns_to_read = [ self.keys.BOUNDARIES_VERTEX_X.value, self.keys.BOUNDARIES_VERTEX_Y.value, - self.keys.CELL_ID.value + self.keys.CELL_ID.value, ] # Use filters to only load data within the specified bounding box (x_min, x_max, y_min, y_max) filters = [ - (self.keys.BOUNDARIES_VERTEX_X.value, '>=', x_min), - (self.keys.BOUNDARIES_VERTEX_X.value, '<=', x_max), - (self.keys.BOUNDARIES_VERTEX_Y.value, '>=', y_min), - (self.keys.BOUNDARIES_VERTEX_Y.value, '<=', y_max) + (self.keys.BOUNDARIES_VERTEX_X.value, ">=", x_min), + (self.keys.BOUNDARIES_VERTEX_X.value, "<=", x_max), + (self.keys.BOUNDARIES_VERTEX_Y.value, ">=", y_min), + (self.keys.BOUNDARIES_VERTEX_Y.value, "<=", y_max), ] # Load the dataset lazily with filters applied for the bounding box @@ -262,24 +257,27 @@ def load_boundaries( # Convert the cell IDs to strings lazily boundaries_df[self.keys.CELL_ID.value] = boundaries_df[self.keys.CELL_ID.value].apply( - lambda x: str(x) if pd.notnull(x) else None + lambda x: str(x) if pd.notnull(x) else None, meta=("cell_id", "object") ) - if self.verbose: print(f"Loaded boundaries from '{path}' within bounding box ({x_min}, {x_max}, {y_min}, {y_max}).") + if self.verbose: + print(f"Loaded boundaries from '{path}' within bounding box ({x_min}, {x_max}, {y_min}, {y_max}).") return boundaries_df - - - def set_metadata(self) -> None: """ Set metadata for the transcript dataset, including bounding box limits and unique gene names, - without reading the entire Parquet file. Additionally, return integer tokens for unique gene names + without reading the entire Parquet file. Additionally, return integer tokens for unique gene names instead of one-hot encodings and store the lookup table for later mapping. 
""" # Load the Parquet file metadata - parquet_file = pq.ParquetFile(self.transcripts_path) + ## + # luca's experiment + import time + + # old method + parquet_file = pq.read_table(self.transcripts_path) # Get the column names for X, Y, and feature names from the class's keys x_col = self.keys.TRANSCRIPTS_X.value @@ -287,7 +285,7 @@ def set_metadata(self) -> None: feature_col = self.keys.FEATURE_NAME.value # Initialize variables to track min/max values for X and Y - x_min, x_max, y_min, y_max = float('inf'), float('-inf'), float('inf'), float('-inf') + x_min, x_max, y_min, y_max = float("inf"), float("-inf"), float("inf"), float("-inf") # Extract unique gene names and ensure they're strings gene_set = set() @@ -299,16 +297,19 @@ def set_metadata(self) -> None: "NegControlCodeword_", "BLANK_", "DeprecatedCodeword_", - "UnassignedCodeword_" + "UnassignedCodeword_", ) - # Iterate over row groups to extract statistics and unique gene names - for row_group_idx in range(parquet_file.num_row_groups): - row_group_table = parquet_file.read_row_group(row_group_idx, columns=[x_col, y_col, feature_col]) + row_group_size = 1_000_000 + start = 0 + n = len(parquet_file) + while start < n: + chunk = parquet_file.slice(start, start + row_group_size) + start += row_group_size # Update the bounding box values (min/max) - x_values = row_group_table[x_col].to_pandas() - y_values = row_group_table[y_col].to_pandas() + x_values = chunk[x_col].to_pandas() + y_values = chunk[y_col].to_pandas() x_min = min(x_min, x_values.min()) x_max = max(x_max, x_values.max()) @@ -316,8 +317,12 @@ def set_metadata(self) -> None: y_max = max(y_max, y_values.max()) # Convert feature values (gene names) to strings and filter out unwanted codewords - feature_values = row_group_table[feature_col].to_pandas().apply( - lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), + feature_values = ( + chunk[feature_col] + .to_pandas() + .apply( + lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), + ) ) # Filter out unwanted codewords @@ -332,11 +337,15 @@ def set_metadata(self) -> None: self.y_min = y_min self.y_max = y_max - if self.verbose: print(f"Bounding box limits set: x_min={self.x_min}, x_max={self.x_max}, y_min={self.y_min}, y_max={self.y_max}") + if self.verbose: + print( + f"Bounding box limits set: x_min={self.x_min}, x_max={self.x_max}, y_min={self.y_min}, y_max={self.y_max}" + ) # Convert the set of unique genes into a sorted list for consistent ordering self.unique_genes = sorted(gene_set) - if self.verbose: print(f"Extracted {len(self.unique_genes)} unique gene names for integer tokenization.") + if self.verbose: + print(f"Extracted {len(self.unique_genes)} unique gene names for integer tokenization.") # Initialize a LabelEncoder to convert unique genes into integer tokens self.tx_encoder = LabelEncoder() @@ -345,18 +354,19 @@ def set_metadata(self) -> None: self.tx_encoder.fit(self.unique_genes) # Store the integer tokens mapping to gene names - self.gene_to_token_map = dict(zip(self.tx_encoder.classes_, self.tx_encoder.transform(self.tx_encoder.classes_))) - + self.gene_to_token_map = dict( + zip(self.tx_encoder.classes_, self.tx_encoder.transform(self.tx_encoder.classes_)) + ) - if self.verbose: print("Integer tokens have been computed and stored based on unique gene names.") + if self.verbose: + print("Integer tokens have been computed and stored based on unique gene names.") # Optional: Create a reverse mapping for lookup purposes (token to gene) self.token_to_gene_map = {v: k for k, v in 
self.gene_to_token_map.items()} + if self.verbose: + print("Lookup tables (gene_to_token_map and token_to_gene_map) have been created.") - if self.verbose: print("Lookup tables (gene_to_token_map and token_to_gene_map) have been created.") - - def set_embedding(self, embedding_name: str) -> None: """ Set the current embedding type for the transcripts. @@ -370,8 +380,7 @@ def set_embedding(self, embedding_name: str) -> None: self.current_embedding = embedding_name else: raise ValueError(f"Embedding {embedding_name} not found in embeddings_dict.") - - + @staticmethod def create_scaled_polygon(group: pd.DataFrame, scale_factor: float, keys) -> gpd.GeoDataFrame: """ @@ -386,9 +395,9 @@ def create_scaled_polygon(group: pd.DataFrame, scale_factor: float, keys) -> gpd gpd.GeoDataFrame: A GeoDataFrame containing the scaled Polygon and cell_id. """ # Extract coordinates and cell ID from the group using keys - x_coords = group[keys['vertex_x']] - y_coords = group[keys['vertex_y']] - cell_id = group[keys['cell_id']].iloc[0] + x_coords = group[keys["vertex_x"]] + y_coords = group[keys["vertex_y"]] + cell_id = group[keys["cell_id"]].iloc[0] # Ensure there are at least 3 points to form a polygon if len(x_coords) >= 3: @@ -398,19 +407,13 @@ def create_scaled_polygon(group: pd.DataFrame, scale_factor: float, keys) -> gpd # Scale the polygon by the provided factor scaled_polygon = polygon.buffer(scale_factor) if scaled_polygon.is_valid and not scaled_polygon.is_empty: - return gpd.GeoDataFrame({ - 'geometry': [scaled_polygon], - keys['cell_id']: [cell_id] - }, geometry='geometry', crs="EPSG:4326") + return gpd.GeoDataFrame( + {"geometry": [scaled_polygon], keys["cell_id"]: [cell_id]}, geometry="geometry", crs="EPSG:4326" + ) # Return an empty GeoDataFrame if no valid polygon is created - return gpd.GeoDataFrame({ - 'geometry': [None], - keys['cell_id']: [cell_id] - }, geometry='geometry', crs="EPSG:4326") - - def generate_and_scale_polygons( - self, boundaries_df: dd.DataFrame, scale_factor: float = 1.0 - ) -> dgpd.GeoDataFrame: + return gpd.GeoDataFrame({"geometry": [None], keys["cell_id"]: [cell_id]}, geometry="geometry", crs="EPSG:4326") + + def generate_and_scale_polygons(self, boundaries_df: dd.DataFrame, scale_factor: float = 1.0) -> dgpd.GeoDataFrame: """ Generate and scale polygons from boundary coordinates using Dask. Keeps class structure intact by using static method for the core polygon generation. 
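As a rough, self-contained illustration of the polygon-scaling step implemented by `create_scaled_polygon` above: the coordinates, cell id, and scale factor here are invented, and this is a sketch rather than the package's own code path.

```python
# Standalone sketch of boundary polygon scaling; all inputs are hypothetical.
import geopandas as gpd
from shapely.geometry import Polygon

x_coords = [0.0, 4.0, 4.0, 0.0]
y_coords = [0.0, 0.0, 3.0, 3.0]
cell_id = "cell_0"
scale_factor = 1.0

if len(x_coords) >= 3:  # need at least 3 vertices to form a polygon
    polygon = Polygon(list(zip(x_coords, y_coords)))
    scaled_polygon = polygon.buffer(scale_factor)  # expand the outline outward
    if scaled_polygon.is_valid and not scaled_polygon.is_empty:
        gdf = gpd.GeoDataFrame(
            {"geometry": [scaled_polygon], "cell_id": [cell_id]},
            geometry="geometry",
            crs="EPSG:4326",
        )
```

Note that shapely's `buffer` dilates the outline by a fixed distance rather than multiplying its area, which is how the scale factor is applied in the patch.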
@@ -428,40 +431,39 @@ def generate_and_scale_polygons( cell_id_column = self.keys.CELL_ID.value vertex_x_column = self.keys.BOUNDARIES_VERTEX_X.value vertex_y_column = self.keys.BOUNDARIES_VERTEX_Y.value - + create_polygon = self.create_scaled_polygon # Use a lambda to wrap the static method call and avoid passing the function object directly to Dask polygons_ddf = boundaries_df.groupby(cell_id_column).apply( lambda group: create_polygon( - group=group, scale_factor=scale_factor, + group=group, + scale_factor=scale_factor, keys={ # Pass keys as a dict for the lambda function - 'vertex_x': vertex_x_column, - 'vertex_y': vertex_y_column, - 'cell_id': cell_id_column - } + "vertex_x": vertex_x_column, + "vertex_y": vertex_y_column, + "cell_id": cell_id_column, + }, ) ) - + # Lazily compute centroids for each polygon - if self.verbose: print("Adding centroids to the polygons...") - polygons_ddf['centroid_x'] = polygons_ddf.geometry.centroid.x - polygons_ddf['centroid_y'] = polygons_ddf.geometry.centroid.y - + if self.verbose: + print("Adding centroids to the polygons...") + polygons_ddf["centroid_x"] = polygons_ddf.geometry.centroid.x + polygons_ddf["centroid_y"] = polygons_ddf.geometry.centroid.y + polygons_ddf = polygons_ddf.drop_duplicates() # polygons_ddf = polygons_ddf.to_crs("EPSG:3857") return polygons_ddf - - - def compute_transcript_overlap_with_boundaries( self, transcripts_df: dd.DataFrame, boundaries_df: dd.DataFrame = None, polygons_gdf: dgpd.GeoDataFrame = None, - scale_factor: float = 1.0 - ) -> dd.DataFrame: + scale_factor: float = 1.0, + ) -> dd.DataFrame: """ Computes the overlap of transcript locations with scaled boundary polygons and assigns corresponding cell IDs to the transcripts using Dask. @@ -479,15 +481,16 @@ def compute_transcript_overlap_with_boundaries( if polygons_gdf is None: if boundaries_df is None: raise ValueError("Both boundaries_df and polygons_gdf cannot be None. Provide at least one.") - + # Generate polygons from boundaries_df if polygons_gdf is None # if self.verbose: print(f"No precomputed polygons provided. Computing polygons from boundaries with a scale factor of {scale_factor}.") polygons_gdf = self.generate_and_scale_polygons(boundaries_df, scale_factor) - + if polygons_gdf.empty(): raise ValueError("No valid polygons were generated from the boundaries.") else: - if self.verbose: print(f"Polygons are available. Proceeding with overlap computation.") + if self.verbose: + print(f"Polygons are available. 
Proceeding with overlap computation.") # Create a delayed function to check if a point is within any polygon def check_overlap(transcript, polygons_gdf): @@ -508,11 +511,14 @@ def check_overlap(transcript, polygons_gdf): return overlap, cell_id # Apply the check_overlap function in parallel to each row using Dask's map_partitions - if self.verbose: print(f"Starting overlap computation for transcripts with the boundary polygons.") + if self.verbose: + print(f"Starting overlap computation for transcripts with the boundary polygons.") transcripts_df = transcripts_df.map_partitions( lambda df: df.assign( **{ - self.keys.OVERLAPS_BOUNDARY.value: df.apply(lambda row: delayed(check_overlap)(row, polygons_gdf)[0], axis=1), + self.keys.OVERLAPS_BOUNDARY.value: df.apply( + lambda row: delayed(check_overlap)(row, polygons_gdf)[0], axis=1 + ), self.keys.CELL_ID.value: df.apply(lambda row: delayed(check_overlap)(row, polygons_gdf)[1], axis=1), } ) @@ -520,9 +526,6 @@ def check_overlap(transcript, polygons_gdf): return transcripts_df - - - def compute_boundaries_geometries( self, boundaries_df: dd.DataFrame = None, @@ -552,38 +555,47 @@ def compute_boundaries_geometries( if polygons_gdf is None: if boundaries_df is None: raise ValueError("Both boundaries_df and polygons_gdf cannot be None. Provide at least one.") - + # Generate polygons from boundaries_df if polygons_gdf is None - if self.verbose: print(f"No precomputed polygons provided. Computing polygons from boundaries with a scale factor of {scale_factor}.") + if self.verbose: + print( + f"No precomputed polygons provided. Computing polygons from boundaries with a scale factor of {scale_factor}." + ) polygons_gdf = self.generate_and_scale_polygons(boundaries_df, scale_factor) - + # Check if the generated polygons_gdf is empty if polygons_gdf.shape[0] == 0: raise ValueError("No valid polygons were generated from the boundaries.") else: - if self.verbose: print(f"Polygons are available. Proceeding with geometrical computations.") - + if self.verbose: + print(f"Polygons are available. 
Proceeding with geometrical computations.") + # Compute additional geometrical properties polygons = polygons_gdf.geometry # Compute additional geometrical properties if area: - if self.verbose: print("Computing area...") - polygons_gdf['area'] = polygons.area + if self.verbose: + print("Computing area...") + polygons_gdf["area"] = polygons.area if convexity: - if self.verbose: print("Computing convexity...") - polygons_gdf['convexity'] = polygons.convex_hull.area / polygons.area + if self.verbose: + print("Computing convexity...") + polygons_gdf["convexity"] = polygons.convex_hull.area / polygons.area if elongation: - if self.verbose: print("Computing elongation...") + if self.verbose: + print("Computing elongation...") r = polygons.minimum_rotated_rectangle() - polygons_gdf['elongation'] = (r.length * r.length) / r.area + polygons_gdf["elongation"] = (r.length * r.length) / r.area if circularity: - if self.verbose: print("Computing circularity...") + if self.verbose: + print("Computing circularity...") r = polygons_gdf.minimum_bounding_radius() - polygons_gdf['circularity'] = polygons.area / (r * r) + polygons_gdf["circularity"] = polygons.area / (r * r) + + if self.verbose: + print("Geometrical computations completed.") - if self.verbose: print("Geometrical computations completed.") - return polygons_gdf.reset_index(drop=True) def save_dataset_for_segger( @@ -604,9 +616,9 @@ def save_dataset_for_segger( sampling_rate: float = 1, num_workers: int = 1, scale_boundaries: float = 1.0, - method: str = 'kd_tree', + method: str = "kd_tree", gpu: bool = False, - workers: int = 1 + workers: int = 1, ) -> None: """ Saves the dataset for Segger in a processed format using Dask for parallel and lazy processing. @@ -631,49 +643,64 @@ def save_dataset_for_segger( method (str, optional): Method for computing edge indices (e.g., 'kd_tree', 'faiss'). gpu (bool, optional): Whether to use GPU acceleration for edge index computation. workers (int, optional): Number of workers to use to compute the neighborhood graph (per tile). 
- + """ # Prepare directories for storing processed tiles self._prepare_directories(processed_dir) - + # Get x and y coordinate ranges for tiling x_range, y_range = self._get_ranges(d_x, d_y) - + # Generate parameters for each tile tile_params = self._generate_tile_params( - x_range, y_range, x_size, y_size, margin_x, margin_y, compute_labels, - r_tx, k_tx, val_prob, test_prob, neg_sampling_ratio_approx, sampling_rate, - processed_dir, scale_boundaries, method, gpu, workers + x_range, + y_range, + x_size, + y_size, + margin_x, + margin_y, + compute_labels, + r_tx, + k_tx, + val_prob, + test_prob, + neg_sampling_ratio_approx, + sampling_rate, + processed_dir, + scale_boundaries, + method, + gpu, + workers, ) # Process each tile using Dask to parallelize the task - if self.verbose: print("Starting tile processing...") + if self.verbose: + print("Starting tile processing...") tasks = [delayed(self._process_tile)(params) for params in tile_params] - + with ProgressBar(): - # Use Dask to process all tiles in parallel + # Use Dask to process all tiles in parallel dask.compute(*tasks, num_workers=num_workers) - if self.verbose: print("Tile processing completed.") - + if self.verbose: + print("Tile processing completed.") def _prepare_directories(self, processed_dir: Path) -> None: """Prepares directories for saving tiles.""" processed_dir = Path(processed_dir) # by default, convert to Path object - for data_type in ['train', 'test', 'val']: - for data_stage in ['raw', 'processed']: - tile_dir = processed_dir / f'{data_type}_tiles' / data_stage + for data_type in ["train", "test", "val"]: + for data_stage in ["raw", "processed"]: + tile_dir = processed_dir / f"{data_type}_tiles" / data_stage tile_dir.mkdir(parents=True, exist_ok=True) if os.listdir(tile_dir): msg = f"Directory '{tile_dir}' must be empty." raise AssertionError(msg) - def _get_ranges(self, d_x: float, d_y: float) -> Tuple[np.ndarray, np.ndarray]: """Generates ranges for tiling.""" x_range = np.arange(self.x_min // 1000 * 1000, self.x_max, d_x) y_range = np.arange(self.y_min // 1000 * 1000, self.y_max, d_y) return x_range, y_range - + def _generate_tile_params( self, x_range: np.ndarray, @@ -693,7 +720,7 @@ def _generate_tile_params( scale_boundaries: float, method: str, gpu: bool, - workers: int + workers: int, ) -> List[Tuple]: """ Generates parameters for processing tiles using the bounding box approach. @@ -707,22 +734,36 @@ def _generate_tile_params( # Generate tile parameters based on ranges and margins tile_params = [ ( - i, j, x_size, y_size, x_range[i], y_range[j], margin_x, margin_y, - compute_labels, r_tx, k_tx, neg_sampling_ratio_approx, val_prob, - test_prob, processed_dir, scale_boundaries, sampling_rate, - method, gpu, workers + i, + j, + x_size, + y_size, + x_range[i], + y_range[j], + margin_x, + margin_y, + compute_labels, + r_tx, + k_tx, + neg_sampling_ratio_approx, + val_prob, + test_prob, + processed_dir, + scale_boundaries, + sampling_rate, + method, + gpu, + workers, ) - for i in range(len(x_range)) + for i in range(len(x_range)) for j in range(len(y_range)) ] return tile_params - - # def _process_tiles(self, tile_params: List[Tuple], num_workers: int) -> None: # """ # Processes the tiles using Dask's parallelization utilities. 
- + # Parameters: # ----------- # tile_params : List[Tuple] @@ -741,7 +782,6 @@ def _generate_tile_params( # if self.verbose: print("Tile processing completed.") - def _process_tile(self, tile_params: Tuple) -> None: """ Process a single tile using Dask for parallelism and lazy evaluation, and save the data. @@ -751,33 +791,54 @@ def _process_tile(self, tile_params: Tuple) -> None: Parameters for the tile processing. """ ( - i, j, x_size, y_size, x_loc, y_loc, margin_x, margin_y, compute_labels, - r_tx, k_tx, neg_sampling_ratio_approx, val_prob, test_prob, processed_dir, - scale_boundaries, sampling_rate, method, gpu, workers + i, + j, + x_size, + y_size, + x_loc, + y_loc, + margin_x, + margin_y, + compute_labels, + r_tx, + k_tx, + neg_sampling_ratio_approx, + val_prob, + test_prob, + processed_dir, + scale_boundaries, + sampling_rate, + method, + gpu, + workers, ) = tile_params - if self.verbose: print(f"Processing tile at location (x_min: {x_loc}, y_min: {y_loc}), size (width: {x_size}, height: {y_size})") + if self.verbose: + print( + f"Processing tile at location (x_min: {x_loc}, y_min: {y_loc}), size (width: {x_size}, height: {y_size})" + ) # Sampling rate to decide if the tile should be processed if random.random() > sampling_rate: - if self.verbose: print(f"Skipping tile at (x_min: {x_loc}, y_min: {y_loc}) due to sampling rate.") + if self.verbose: + print(f"Skipping tile at (x_min: {x_loc}, y_min: {y_loc}) due to sampling rate.") return # Read only the required boundaries and transcripts for this tile using delayed loading boundaries_df = delayed(self.load_boundaries)( path=self.boundaries_path, - x_min=x_loc - margin_x, - x_max=x_loc + x_size + margin_x, - y_min=y_loc - margin_y, - y_max=y_loc + y_size + margin_y + x_min=x_loc - margin_x, + x_max=x_loc + x_size + margin_x, + y_min=y_loc - margin_y, + y_max=y_loc + y_size + margin_y, ).compute() - + transcripts_df = delayed(self.load_transcripts)( path=self.transcripts_path, x_min=x_loc - margin_x, - x_max=x_loc + x_size , + x_max=x_loc + x_size, y_min=y_loc - margin_y, - y_max=y_loc + y_size + y_max=y_loc + y_size, ).compute() # If no data is found in transcripts or boundaries, skip the tile @@ -788,62 +849,78 @@ def _process_tile(self, tile_params: Tuple) -> None: # If the number of transcripts is less than 20 or the number of nuclei is less than 2, skip the tile if transcripts_df_count < 20 or boundaries_df_count < 2: - if self.verbose: print(f"Dropping tile (x_min: {x_loc}, y_min: {y_loc}) due to insufficient data (transcripts: {transcripts_df_count}, boundaries: {boundaries_df_count}).") + if self.verbose: + print( + f"Dropping tile (x_min: {x_loc}, y_min: {y_loc}) due to insufficient data (transcripts: {transcripts_df_count}, boundaries: {boundaries_df_count})." 
+ ) return # Build PyG data structure from tile-specific data - if self.verbose: print(f"Building PyG data for tile at (x_min: {x_loc}, y_min: {y_loc})...") + if self.verbose: + print(f"Building PyG data for tile at (x_min: {x_loc}, y_min: {y_loc})...") data = delayed(self.build_pyg_data_from_tile)( - boundaries_df, transcripts_df, r_tx=r_tx, k_tx=k_tx, method=method, gpu=gpu, workers=workers, scale_boundaries=scale_boundaries + boundaries_df, + transcripts_df, + r_tx=r_tx, + k_tx=k_tx, + method=method, + gpu=gpu, + workers=workers, + scale_boundaries=scale_boundaries, ) - + data = data.compute() - if self.verbose: print(data) + if self.verbose: + print(data) try: # Probability to assign to train-val-test split prob = random.random() if compute_labels and (prob > test_prob): - if self.verbose: print(f"Computing labels for tile at (x_min: {x_loc}, y_min: {y_loc})...") + if self.verbose: + print(f"Computing labels for tile at (x_min: {x_loc}, y_min: {y_loc})...") transform = RandomLinkSplit( - num_val=0, num_test=0, is_undirected=True, edge_types=[('tx', 'belongs', 'bd')], + num_val=0, + num_test=0, + is_undirected=True, + edge_types=[("tx", "belongs", "bd")], neg_sampling_ratio=neg_sampling_ratio_approx * 2, ) data = delayed(transform)(data).compute()[0] - + # if self.verbose: print(data) # Save the tile data to the appropriate directory based on split - if self.verbose: print(f"Saving data for tile at (x_min: {x_loc}, y_min: {y_loc})...") + if self.verbose: + print(f"Saving data for tile at (x_min: {x_loc}, y_min: {y_loc})...") filename = f"tiles_x={x_loc}_y={y_loc}_w={x_size}_h={y_size}.pt" if prob > val_prob + test_prob: - torch.save(data, processed_dir / 'train_tiles' / 'processed' / filename) + torch.save(data, processed_dir / "train_tiles" / "processed" / filename) elif prob > test_prob: - torch.save(data, processed_dir / 'val_tiles' / 'processed' / filename) + torch.save(data, processed_dir / "val_tiles" / "processed" / filename) else: - torch.save(data, processed_dir / 'test_tiles' / 'processed' / filename) + torch.save(data, processed_dir / "test_tiles" / "processed" / filename) # Use Dask to save the file in parallel # save_task.compute() - if self.verbose: print(f"Tile at (x_min: {x_loc}, y_min: {y_loc}) processed and saved successfully.") + if self.verbose: + print(f"Tile at (x_min: {x_loc}, y_min: {y_loc}) processed and saved successfully.") except Exception as e: - if self.verbose: print(f"Error processing tile at (x_min: {x_loc}, y_min: {y_loc}): {e}") - - + if self.verbose: + print(f"Error processing tile at (x_min: {x_loc}, y_min: {y_loc}): {e}") def build_pyg_data_from_tile( - self, - boundaries_df: dd.DataFrame, - transcripts_df: dd.DataFrame, - r_tx: float = 5.0, - k_tx: int = 3, - method: str = 'kd_tree', - gpu: bool = False, + self, + boundaries_df: dd.DataFrame, + transcripts_df: dd.DataFrame, + r_tx: float = 5.0, + k_tx: int = 3, + method: str = "kd_tree", + gpu: bool = False, workers: int = 1, - scale_boundaries: float = 1.0 - + scale_boundaries: float = 1.0, ) -> HeteroData: """ Builds PyG data from a tile of boundaries and transcripts data using Dask utilities for efficient processing. @@ -857,7 +934,7 @@ def build_pyg_data_from_tile( gpu (bool, optional): Whether to use GPU acceleration for edge index computation. workers (int, optional): Number of workers to use for parallel processing. scale_boundaries (float, optional): The factor by which to scale the boundary polygons. Default is 1.0. - + Returns: HeteroData: PyG Heterogeneous Data object. 
""" @@ -865,100 +942,94 @@ def build_pyg_data_from_tile( data = HeteroData() # Lazily compute boundaries geometries using Dask - if self.verbose: print("Computing boundaries geometries...") + if self.verbose: + print("Computing boundaries geometries...") bd_gdf = self.compute_boundaries_geometries(boundaries_df, scale_factor=scale_boundaries) - bd_gdf = bd_gdf[bd_gdf['geometry'].notnull()] - + bd_gdf = bd_gdf[bd_gdf["geometry"].notnull()] + # Add boundary node data to PyG HeteroData lazily - data['bd'].id = bd_gdf[self.keys.CELL_ID.value].values - data['bd'].pos = torch.as_tensor(bd_gdf[['centroid_x', 'centroid_y']].values.astype(float)) - - if data['bd'].pos.isnan().any(): - raise ValueError(data['bd'].id[data['bd'].pos.isnan().any(1)]) - - bd_x = bd_gdf.iloc[:, 4:] - data['bd'].x = torch.as_tensor(bd_x.to_numpy(), dtype=torch.float32) + data["bd"].id = bd_gdf[self.keys.CELL_ID.value].values + data["bd"].pos = torch.as_tensor(bd_gdf[["centroid_x", "centroid_y"]].values.astype(float)) + if data["bd"].pos.isnan().any(): + raise ValueError(data["bd"].id[data["bd"].pos.isnan().any(1)]) + + bd_x = bd_gdf.iloc[:, 4:] + data["bd"].x = torch.as_tensor(bd_x.to_numpy(), dtype=torch.float32) # Extract the transcript coordinates lazily - if self.verbose: print("Preparing transcript features and positions...") + if self.verbose: + print("Preparing transcript features and positions...") x_xyz = transcripts_df[[self.keys.TRANSCRIPTS_X.value, self.keys.TRANSCRIPTS_Y.value]].to_numpy() - data['tx'].id = torch.as_tensor(transcripts_df[self.keys.TRANSCRIPTS_ID.value].values.astype(int)) - data['tx'].pos = torch.tensor(x_xyz, dtype=torch.float32) - - + data["tx"].id = torch.as_tensor(transcripts_df[self.keys.TRANSCRIPTS_ID.value].values.astype(int)) + data["tx"].pos = torch.tensor(x_xyz, dtype=torch.float32) - # Lazily prepare transcript embeddings (if available) - if self.verbose: print("Preparing transcript embeddings..") + if self.verbose: + print("Preparing transcript embeddings..") token_encoding = self.tx_encoder.transform(transcripts_df[self.keys.FEATURE_NAME.value]) - transcripts_df['token'] = token_encoding # Store the integer tokens in the 'token' column - data['tx'].token = torch.as_tensor(token_encoding).int() + transcripts_df["token"] = token_encoding # Store the integer tokens in the 'token' column + data["tx"].token = torch.as_tensor(token_encoding).int() # Handle additional embeddings lazily as well - if not self.embedding_df.empty: - embeddings = delayed(lambda df: self.embedding_df.loc[ - df[self.keys.FEATURE_NAME.value].values - ].values)(transcripts_df) - else: + if self.embedding_df is not None and not self.embedding_df.empty: + embeddings = delayed(lambda df: self.embedding_df.loc[df[self.keys.FEATURE_NAME.value].values].values)( + transcripts_df + ) + else: embeddings = token_encoding - embeddings = embeddings.compute() + if hasattr(embeddings, "compute"): + embeddings = embeddings.compute() x_features = torch.as_tensor(embeddings).int() - data['tx'].x = x_features + data["tx"].x = x_features # Check if the overlap column exists, if not, compute it lazily using Dask if self.keys.OVERLAPS_BOUNDARY.value not in transcripts_df.columns: - if self.verbose: print(f"Computing overlaps for transcripts...") - transcripts_df = self.compute_transcript_overlap_with_boundaries( - transcripts_df, bd_gdf, scale_factor=1.0 - ) + if self.verbose: + print(f"Computing overlaps for transcripts...") + transcripts_df = self.compute_transcript_overlap_with_boundaries(transcripts_df, bd_gdf, scale_factor=1.0) 
# Connect transcripts with their corresponding boundaries (e.g., nuclei, cells) - if self.verbose: print("Connecting transcripts with boundaries...") + if self.verbose: + print("Connecting transcripts with boundaries...") overlaps = transcripts_df[self.keys.OVERLAPS_BOUNDARY.value].values valid_cell_ids = bd_gdf[self.keys.CELL_ID.value].values - ind = np.where( - overlaps & transcripts_df[self.keys.CELL_ID.value].isin(valid_cell_ids) - )[0] - tx_bd_edge_index = np.column_stack(( - ind, - np.searchsorted( - valid_cell_ids, - transcripts_df.iloc[ind][self.keys.CELL_ID.value] - ) - )) + ind = np.where(overlaps & transcripts_df[self.keys.CELL_ID.value].isin(valid_cell_ids))[0] + tx_bd_edge_index = np.column_stack( + (ind, np.searchsorted(valid_cell_ids, transcripts_df.iloc[ind][self.keys.CELL_ID.value])) + ) # Add transcript-boundary edge index to PyG HeteroData - data['tx', 'belongs', 'bd'].edge_index = torch.as_tensor(tx_bd_edge_index.T, dtype=torch.long) + data["tx", "belongs", "bd"].edge_index = torch.as_tensor(tx_bd_edge_index.T, dtype=torch.long) # Compute transcript-to-transcript (tx-tx) edges using Dask (lazy computation) - if self.verbose: print("Computing tx-tx edges...") + if self.verbose: + print("Computing tx-tx edges...") tx_positions = transcripts_df[[self.keys.TRANSCRIPTS_X.value, self.keys.TRANSCRIPTS_Y.value]].values delayed_tx_edge_index = delayed(get_edge_index)( - tx_positions, - tx_positions, - k=k_tx, - dist=r_tx, - method=method, - gpu=gpu, - workers=workers + tx_positions, tx_positions, k=k_tx, dist=r_tx, method=method, gpu=gpu, workers=workers ) tx_edge_index = delayed_tx_edge_index.compute() # Add the tx-tx edge index to the PyG HeteroData object - data['tx', 'neighbors', 'tx'].edge_index = torch.as_tensor(tx_edge_index.T, dtype=torch.long) - - - if self.verbose: print("Finished building PyG data for the tile.") - return data - - - + data["tx", "neighbors", "tx"].edge_index = torch.as_tensor(tx_edge_index.T, dtype=torch.long) + if self.verbose: + print("Finished building PyG data for the tile.") + return data class XeniumSample(SpatialTranscriptomicsSample): - def __init__(self, transcripts_df: dd.DataFrame = None, transcripts_radius: int = 10, boundaries_graph: bool = False, embedding_df: pd.DataFrame = None, verbose: bool = True): - super().__init__(transcripts_df, transcripts_radius, boundaries_graph, embedding_df, XeniumKeys, verbose=verbose) + def __init__( + self, + transcripts_df: dd.DataFrame = None, + transcripts_radius: int = 10, + boundaries_graph: bool = False, + embedding_df: pd.DataFrame = None, + verbose: bool = True, + ): + super().__init__( + transcripts_df, transcripts_radius, boundaries_graph, embedding_df, XeniumKeys, verbose=verbose + ) def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0) -> dd.DataFrame: """ @@ -977,14 +1048,14 @@ def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0) "NegControlCodeword_", "BLANK_", "DeprecatedCodeword_", - "UnassignedCodeword_" + "UnassignedCodeword_", ) # Ensure FEATURE_NAME is a string type for proper filtering (compatible with Dask) # Handle potential bytes to string conversion for Dask DataFrame if pd.api.types.is_object_dtype(transcripts_df[self.keys.FEATURE_NAME.value]): transcripts_df[self.keys.FEATURE_NAME.value] = transcripts_df[self.keys.FEATURE_NAME.value].apply( - lambda x: x.decode('utf-8') if isinstance(x, bytes) else x + lambda x: x.decode("utf-8") if isinstance(x, bytes) else x ) # Apply the quality value filter using Dask @@ -1001,7 
+1072,14 @@ def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0) class MerscopeSample(SpatialTranscriptomicsSample): - def __init__(self, transcripts_df: dd.DataFrame = None, transcripts_radius: int = 10, boundaries_graph: bool = False, embedding_df: pd.DataFrame = None, verbose: bool = True): + def __init__( + self, + transcripts_df: dd.DataFrame = None, + transcripts_radius: int = 10, + boundaries_graph: bool = False, + embedding_df: pd.DataFrame = None, + verbose: bool = True, + ): super().__init__(transcripts_df, transcripts_radius, boundaries_graph, embedding_df, MerscopeKeys) def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0) -> dd.DataFrame: @@ -1021,5 +1099,3 @@ def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0) # Add custom Merscope-specific filtering logic if needed # For now, apply only the quality value filter return transcripts_df[transcripts_df[self.keys.QUALITY_VALUE.value] >= min_qv] - - diff --git a/src/segger/data/parquet/__init__.py b/src/segger/data/parquet/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/segger/data/parquet/_experimental.py b/src/segger/data/parquet/_experimental.py index f8af0f1..739ff23 100644 --- a/src/segger/data/parquet/_experimental.py +++ b/src/segger/data/parquet/_experimental.py @@ -1,9 +1,9 @@ - from typing import TYPE_CHECKING -if TYPE_CHECKING: # False at runtime +if TYPE_CHECKING: # False at runtime import dask, cudf, dask_cudf, pandas as pd + class BackendHandler: """ A class to handle different DataFrame backends for reading and processing @@ -19,15 +19,15 @@ class BackendHandler: Methods ------- read_parquet(): - Returns the function to read Parquet files according to the selected + Returns the function to read Parquet files according to the selected backend. """ _valid_backends = { - 'pandas', - 'dask', - 'cudf', - 'dask_cudf', + "pandas", + "dask", + "cudf", + "dask_cudf", } def __init__(self, backend): @@ -35,31 +35,31 @@ def __init__(self, backend): if backend in self._valid_backends: self.backend = backend else: - valid = ', '.join(map(lambda o: f"'{o}'", self._valid_backends)) + valid = ", ".join(map(lambda o: f"'{o}'", self._valid_backends)) msg = f"Unsupported backend: {backend}. Valid options are {valid}." 
raise ValueError(msg) # Dynamically import packages only if requested - if self.backend == 'pandas': + if self.backend == "pandas": import pandas as pd - elif self.backend == 'dask': + elif self.backend == "dask": import dask - elif self.backend == 'cudf': + elif self.backend == "cudf": import cudf - elif self.backend == 'dask_cudf': + elif self.backend == "dask_cudf": import dask_cudf else: - raise ValueError('Internal Error') + raise ValueError("Internal Error") @property def read_parquet(self): - if self.backend == 'pandas': + if self.backend == "pandas": return pd.read_parquet - elif self.backend == 'dask': + elif self.backend == "dask": return dask.dataframe.read_parquet - elif self.backend == 'cudf': + elif self.backend == "cudf": return cudf.read_parquet - elif self.backend == 'dask_cudf': + elif self.backend == "dask_cudf": return dask_cudf.read_parquet else: - raise ValueError('Internal Error') \ No newline at end of file + raise ValueError("Internal Error") diff --git a/src/segger/data/parquet/_ndtree.py b/src/segger/data/parquet/_ndtree.py index cc68ef0..bad3ee5 100644 --- a/src/segger/data/parquet/_ndtree.py +++ b/src/segger/data/parquet/_ndtree.py @@ -3,10 +3,11 @@ import numpy as np import math -class NDTree(): + +class NDTree: """ - NDTree is a data structure for recursively splitting multi-dimensional data - into smaller regions until each leaf node contains less than or equal to a + NDTree is a data structure for recursively splitting multi-dimensional data + into smaller regions until each leaf node contains less than or equal to a specified number of points. It stores these regions in a balanced binary tree. @@ -19,7 +20,7 @@ class NDTree(): idx : np.ndarray The indices of the input data points. boxes : list - A list to store the bounding boxes (as shapely polygons) of each region + A list to store the bounding boxes (as shapely polygons) of each region in the tree. rect : Rectangle The bounding box of the entire input data space. @@ -46,7 +47,8 @@ def __init__(self, data, n): self.rect = Rectangle(data.min(0), data.max(0)) self.tree = innernode(self.n, self.idx, self.rect, self) -class innernode(): + +class innernode: """ Represents a node in the NDTree. Each node either stores a bounding box for the data it contains (leaf nodes) or splits the data into two child nodes. @@ -66,7 +68,7 @@ class innernode(): split_point : float The value along the split dimension used to divide the data. less : innernode - The child node containing data points less than or equal to the split + The child node containing data points less than or equal to the split point. greater : innernode The child node containing data points greater than the split point. @@ -85,10 +87,10 @@ def __init__(self, n, idx, rect, tree): else: box = shapely.box(*self.rect.mins, *self.rect.maxes) self.tree.boxes.append(box) - + def split(self): """ - Recursively splits the node's data into two child nodes along the + Recursively splits the node's data into two child nodes along the dimension with the largest spread. 
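# [Editor's example - not part of the patch] A sketch of how the NDTree above is
# meant to be used: splitting a point cloud into balanced regions, one per worker,
# as in STSampleParquet._get_balanced_regions later in this patch. The point cloud
# and the region count of 4 are arbitrary choices for illustration.
import numpy as np

points = np.random.rand(1000, 2) * 100   # 2D coordinates, e.g. boundary vertices
tree = NDTree(points, 4)                  # recursively split toward 4 balanced leaves
regions = tree.boxes                      # shapely boxes, one per leaf region
# Each region can then be processed independently by a separate worker.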
""" less = math.floor(self.n // 2) @@ -98,19 +100,6 @@ def split(self): data = data[:, self.split_dim] self.split_point = np.quantile(data, less / (less + greater)) mask = data <= self.split_point - less_rect, greater_rect = self.rect.split( - self.split_dim, - self.split_point - ) - self.less = innernode( - less, - self.idx[mask], - less_rect, - self.tree - ) - self.greater = innernode( - greater, - self.idx[~mask], - greater_rect, - self.tree - ) \ No newline at end of file + less_rect, greater_rect = self.rect.split(self.split_dim, self.split_point) + self.less = innernode(less, self.idx[mask], less_rect, self.tree) + self.greater = innernode(greater, self.idx[~mask], greater_rect, self.tree) diff --git a/src/segger/data/parquet/_settings/xenium.yaml b/src/segger/data/parquet/_settings/xenium.yaml index 7304aa7..6c5333e 100644 --- a/src/segger/data/parquet/_settings/xenium.yaml +++ b/src/segger/data/parquet/_settings/xenium.yaml @@ -13,14 +13,14 @@ transcripts: - "BLANK_" - "DeprecatedCodeword_" - "UnassignedCodeword_" - xy: + xy: - "x_location" - "y_location" - xyz: + xyz: - "x_location" - "y_location" - "z_location" - columns: + columns: - "x_location" - "y_location" - "z_location" @@ -36,10 +36,10 @@ boundaries: y: "vertex_y" id: "cell_id" label: "cell_id" - xy: + xy: - "vertex_x" - "vertex_y" - columns: + columns: - "vertex_x" - "vertex_y" - "cell_id" diff --git a/src/segger/data/parquet/_utils.py b/src/segger/data/parquet/_utils.py index 6f29cec..8c3ffec 100644 --- a/src/segger/data/parquet/_utils.py +++ b/src/segger/data/parquet/_utils.py @@ -10,6 +10,7 @@ from pathlib import Path import yaml + def get_xy_extents( filepath, x: str, @@ -50,6 +51,7 @@ def get_xy_extents( bounds = shapely.box(x_min, y_min, x_max, y_max) return bounds + def read_parquet_region( filepath, x: str, @@ -89,14 +91,17 @@ def read_parquet_region( # Find bounds of full file if not supplied if bounds is None: bounds = get_xy_bounds(filepath, x, y) - + # Load pre-filtered data from Parquet file - filters = [[ - (x, '>', bounds.bounds[0]), - (y, '>', bounds.bounds[1]), - (x, '<', bounds.bounds[2]), - (y, '<', bounds.bounds[3]), - ] + extra_filters] + filters = [ + [ + (x, ">", bounds.bounds[0]), + (y, ">", bounds.bounds[1]), + (x, "<", bounds.bounds[2]), + (y, "<", bounds.bounds[3]), + ] + + extra_filters + ] columns = list({x, y} | set(extra_columns)) @@ -107,6 +112,7 @@ def read_parquet_region( ) return region + def get_polygons_from_xy( boundaries: pd.DataFrame, x: str, @@ -114,13 +120,13 @@ def get_polygons_from_xy( label: str, ) -> gpd.GeoSeries: """ - Convert boundary coordinates from a cuDF DataFrame to a GeoSeries of + Convert boundary coordinates from a cuDF DataFrame to a GeoSeries of polygons. Parameters ---------- boundaries : pd.DataFrame - A DataFrame containing the boundary data with x and y coordinates + A DataFrame containing the boundary data with x and y coordinates and identifiers. x : str The name of the column representing the x-coordinate. @@ -133,7 +139,7 @@ def get_polygons_from_xy( Returns ------- gpd.GeoSeries - A GeoSeries containing the polygons created from the boundary + A GeoSeries containing the polygons created from the boundary coordinates. 
""" # Polygon offsets in coords @@ -152,6 +158,7 @@ def get_polygons_from_xy( return gs + def filter_boundaries( boundaries: pd.DataFrame, inset: shapely.Polygon, @@ -161,13 +168,13 @@ def filter_boundaries( label: str, ): """ - Filter boundary polygons based on their overlap with specified inset and + Filter boundary polygons based on their overlap with specified inset and outset regions. Parameters ---------- boundaries : cudf.DataFrame - A DataFrame containing the boundary data with x and y coordinates and + A DataFrame containing the boundary data with x and y coordinates and identifiers. inset : shapely.Polygon A polygon representing the inner region to filter the boundaries. @@ -187,43 +194,46 @@ def filter_boundaries( Notes ----- - The function determines overlaps of boundary polygons with the specified - inset and outset regions. It creates boolean masks for overlaps with the - top, left, right, and bottom sides of the outset region, as well as the - center region defined by the inset polygon. The filtering logic includes + The function determines overlaps of boundary polygons with the specified + inset and outset regions. It creates boolean masks for overlaps with the + top, left, right, and bottom sides of the outset region, as well as the + center region defined by the inset polygon. The filtering logic includes polygons that: - Are completely within the center region. - Overlap with the center and the left side, but not the bottom side. - Overlap with the center and the top side, but not the right side. """ + # Determine overlaps of boundary polygons def in_region(region): in_x = boundaries[x].between(region.bounds[0], region.bounds[2]) in_y = boundaries[y].between(region.bounds[1], region.bounds[3]) return in_x & in_y + x1, y1, x4, y4 = outset.bounds x2, y2, x3, y3 = inset.bounds - boundaries['top'] = in_region(shapely.box(x1, y1, x4, y2)) - boundaries['left'] = in_region(shapely.box(x1, y1, x2, y4)) - boundaries['right'] = in_region(shapely.box(x3, y1, x4, y4)) - boundaries['bottom'] = in_region(shapely.box(x1, y3, x4, y4)) - boundaries['center'] = in_region(inset) + boundaries["top"] = in_region(shapely.box(x1, y1, x4, y2)) + boundaries["left"] = in_region(shapely.box(x1, y1, x2, y4)) + boundaries["right"] = in_region(shapely.box(x3, y1, x4, y4)) + boundaries["bottom"] = in_region(shapely.box(x1, y3, x4, y4)) + boundaries["center"] = in_region(inset) # Filter boundary polygons # Include overlaps with top and left, not bottom and right gb = boundaries.groupby(label, sort=False) - total = gb['center'].transform('size') - in_top = gb['top'].transform('sum') - in_left = gb['left'].transform('sum') - in_right = gb['right'].transform('sum') - in_bottom = gb['bottom'].transform('sum') - in_center = gb['center'].transform('sum') + total = gb["center"].transform("size") + in_top = gb["top"].transform("sum") + in_left = gb["left"].transform("sum") + in_right = gb["right"].transform("sum") + in_bottom = gb["bottom"].transform("sum") + in_center = gb["center"].transform("sum") keep = in_center == total - keep |= ((in_center > 0) & (in_left > 0) & (in_bottom == 0)) - keep |= ((in_center > 0) & (in_top > 0) & (in_right == 0)) + keep |= (in_center > 0) & (in_left > 0) & (in_bottom == 0) + keep |= (in_center > 0) & (in_top > 0) & (in_right == 0) inset_boundaries = boundaries.loc[keep] return inset_boundaries + def filter_transcripts( transcripts_df: pd.DataFrame, label: Optional[str] = None, @@ -256,9 +266,10 @@ def filter_transcripts( mask &= transcripts_df["qv"].ge(min_qv) return 
transcripts_df[mask] + def load_settings(sample_type: str) -> SimpleNamespace: """ - Loads a matching YAML file from the _settings/ directory and converts its + Loads a matching YAML file from the _settings/ directory and converts its contents into a SimpleNamespace. Parameters @@ -276,25 +287,23 @@ def load_settings(sample_type: str) -> SimpleNamespace: ValueError If `sample_type` does not match any filenames. """ - settings_dir = Path(__file__).parent.resolve() / '_settings' + settings_dir = Path(__file__).parent.resolve() / "_settings" # Get a list of YAML filenames (without extensions) in the _settings dir - filenames = [file.stem for file in settings_dir.glob('*.yaml')] + filenames = [file.stem for file in settings_dir.glob("*.yaml")] # Convert sample_type to lowercase and check if it matches any filename sample_type = sample_type.lower() if sample_type not in filenames: - msg = ( - f"Sample type '{sample_type}' not found in settings. " - f"Available options: {', '.join(filenames)}" - ) + msg = f"Sample type '{sample_type}' not found in settings. " f"Available options: {', '.join(filenames)}" raise FileNotFoundError(msg) # Load the matching YAML file yaml_file_path = settings_dir / f"{sample_type}.yaml" - with yaml_file_path.open('r') as file: + with yaml_file_path.open("r") as file: data = yaml.safe_load(file) - + # Convert the YAML data into a SimpleNamespace recursively return _dict_to_namespace(data) + def _dict_to_namespace(d): """ Recursively converts a dictionary to a SimpleNamespace. @@ -302,4 +311,4 @@ def _dict_to_namespace(d): if isinstance(d, dict): d = {k: _dict_to_namespace(v) for k, v in d.items()} return SimpleNamespace(**d) - return d \ No newline at end of file + return d diff --git a/src/segger/data/parquet/pyg_dataset.py b/src/segger/data/parquet/pyg_dataset.py index 5599cb3..d64b9e6 100644 --- a/src/segger/data/parquet/pyg_dataset.py +++ b/src/segger/data/parquet/pyg_dataset.py @@ -5,17 +5,19 @@ from pathlib import Path import torch + class STPyGDataset(InMemoryDataset): """ - An in-memory dataset class for handling training using spatial + An in-memory dataset class for handling training using spatial transcriptomics data. """ + def __init__( self, root: str, transform: Optional[Callable] = None, pre_transform: Optional[Callable] = None, - pre_filter: Optional[Callable] = None + pre_filter: Optional[Callable] = None, ): super().__init__(root, transform, pre_transform, pre_filter) @@ -37,7 +39,7 @@ def processed_file_names(self) -> List[str]: Returns: List[str]: List of processed file names. 
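# [Editor's example - not part of the patch] A sketch of loading saved tiles with
# the STPyGDataset above. The root path is hypothetical; it is assumed to be the
# split directory that contains the 'processed/' folder of saved .pt tiles.
ds = STPyGDataset(root="path/to/segger_data/train_tiles")
tile = ds.get(0)   # a HeteroData tile with 'tx'/'bd' nodes and 'belongs' edges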
""" - paths = glob.glob(f'{self.processed_dir}/tiles_x*_y*_*_*.pt') + paths = glob.glob(f"{self.processed_dir}/tiles_x*_y*_*_*.pt") # paths = paths.append(paths = glob.glob(f'{self.processed_dir}/tiles_x*_y*_*_*.pt')) file_names = list(map(os.path.basename, paths)) return file_names @@ -63,13 +65,13 @@ def get(self, idx: int) -> Data: """ filepath = Path(self.processed_dir) / self.processed_file_names[idx] data = torch.load(filepath) - data['tx'].x = data['tx'].x.to_dense() - if data['tx'].x.dim() == 1: - data['tx'].x = data['tx'].x.unsqueeze(1) - assert data['tx'].x.dim() == 2 + data["tx"].x = data["tx"].x.to_dense() + if data["tx"].x.dim() == 1: + data["tx"].x = data["tx"].x.unsqueeze(1) + assert data["tx"].x.dim() == 2 # this is an issue in PyG's RandomLinkSplit, dimensions are not consistent if there is only one edge in the graph - if data['tx', 'belongs', 'bd'].edge_label_index.dim() == 1: - data['tx', 'belongs', 'bd'].edge_label_index = data['tx', 'belongs', 'bd'].edge_label_index.unsqueeze(1) - data['tx', 'belongs', 'bd'].edge_label = data['tx', 'belongs', 'bd'].edge_label.unsqueeze(0) - assert data['tx', 'belongs', 'bd'].edge_label_index.dim() == 2 + if data["tx", "belongs", "bd"].edge_label_index.dim() == 1: + data["tx", "belongs", "bd"].edge_label_index = data["tx", "belongs", "bd"].edge_label_index.unsqueeze(1) + data["tx", "belongs", "bd"].edge_label = data["tx", "belongs", "bd"].edge_label.unsqueeze(0) + assert data["tx", "belongs", "bd"].edge_label_index.dim() == 2 return data diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index 5e21366..6937a72 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -21,12 +21,12 @@ # TODO: Add documentation for settings -class STSampleParquet(): +class STSampleParquet: """ A class to manage spatial transcriptomics data stored in parquet files. - This class provides methods for loading, processing, and saving data related - to ST samples. It supports parallel processing and efficient handling of + This class provides methods for loading, processing, and saving data related + to ST samples. It supports parallel processing and efficient handling of transcript and boundary data. """ @@ -51,7 +51,7 @@ def __init__( Raises ------ FileNotFoundError - If the base directory does not exist or the required files are + If the base directory does not exist or the required files are missing. """ # Setup paths and resource constraints @@ -65,18 +65,17 @@ def __init__( # Setup logging logging.basicConfig(level=logging.INFO) - self.logger = logging.Logger(f'STSample@{base_dir}') + self.logger = logging.Logger(f"STSample@{base_dir}") # Internal caches self._extents = None self._transcripts_metadata = None self._boundaries_metadata = None - # Setup default embedding for transcripts - classes = self.transcripts_metadata['feature_names'] + # Setup default embedding for transcripts + classes = self.transcripts_metadata["feature_names"] self._transcript_embedding = TranscriptEmbedding(np.array(classes)) - @classmethod def _get_parquet_metadata( cls, @@ -91,7 +90,7 @@ def _get_parquet_metadata( filepath : os.PathLike The path to the parquet file. columns : Optional[List[str]], default None - List of columns to extract metadata for. If None, all columns + List of columns to extract metadata for. If None, all columns are used. 
Returns @@ -109,13 +108,13 @@ def _get_parquet_metadata( """ # Size in bytes of field dtypes size_map = { - 'BOOLEAN': 1, - 'INT32': 4, - 'FLOAT': 4, - 'INT64': 8, - 'DOUBLE': 8, - 'BYTE_ARRAY': 8, - 'INT96': 12, + "BOOLEAN": 1, + "INT32": 4, + "FLOAT": 4, + "INT64": 8, + "DOUBLE": 8, + "BYTE_ARRAY": 8, + "INT96": 12, } # Read in metadata @@ -129,21 +128,20 @@ def _get_parquet_metadata( # Grab important fields from metadata summary = dict() - summary['n_rows'] = metadata.num_rows - summary['n_columns'] = len(columns) - summary['column_sizes'] = dict() + summary["n_rows"] = metadata.num_rows + summary["n_columns"] = len(columns) + summary["column_sizes"] = dict() for c in columns: # Error where 10X saved BOOLEAN field as INT32 in schema - if c == 'overlaps_nucleus': - dtype = 'BOOLEAN' + if c == "overlaps_nucleus": + dtype = "BOOLEAN" else: i = metadata.schema.names.index(c) dtype = metadata.schema[i].physical_type - summary['column_sizes'][c] = size_map[dtype] + summary["column_sizes"][c] = size_map[dtype] return summary - @cached_property def transcripts_metadata(self) -> dict: """ @@ -152,7 +150,7 @@ def transcripts_metadata(self) -> dict: Returns ------- dict - Metadata dictionary for transcripts including column sizes and + Metadata dictionary for transcripts including column sizes and feature names. Raises @@ -169,13 +167,12 @@ def transcripts_metadata(self) -> dict: # Get filtered unique feature names table = pq.read_table(self._transcripts_filepath) names = pc.unique(table[self.settings.transcripts.label]) - pattern = '|'.join(self.settings.transcripts.filter_substrings) + pattern = "|".join(self.settings.transcripts.filter_substrings) mask = pc.invert(pc.match_substring_regex(names, pattern)) - metadata['feature_names'] = pc.filter(names, mask).tolist() + metadata["feature_names"] = pc.filter(names, mask).tolist() self._transcripts_metadata = metadata return self._transcripts_metadata - @cached_property def boundaries_metadata(self) -> dict: """ @@ -199,7 +196,6 @@ def boundaries_metadata(self) -> dict: self._boundaries_metadata = metadata return self._boundaries_metadata - @property def n_transcripts(self) -> int: """ @@ -210,8 +206,7 @@ def n_transcripts(self) -> int: int The number of transcripts. """ - return self.transcripts_metadata['n_rows'] - + return self.transcripts_metadata["n_rows"] @cached_property def extents(self) -> shapely.Polygon: @@ -236,7 +231,6 @@ def extents(self) -> shapely.Polygon: return self._extents - def _get_balanced_regions( self, ) -> List[shapely.Polygon]: @@ -252,10 +246,10 @@ def _get_balanced_regions( # If no. workers is 1, return full extents if self.n_workers == 1: return [self.extents] - + # Otherwise, split based on boundary distribution which is much smaller # than transcripts DataFrame. - # Note: Assumes boundaries are distributed similarly to transcripts at + # Note: Assumes boundaries are distributed similarly to transcripts at # a coarse level. data = pd.read_parquet( self._boundaries_filepath, @@ -265,7 +259,6 @@ def _get_balanced_regions( return ndtree.boxes - @staticmethod def _setup_directory( data_dir: os.PathLike, @@ -273,8 +266,8 @@ def _setup_directory( """ Sets up the directory structure for saving processed tiles. - Ensures that the necessary subdirectories for 'train', 'test', and - 'val' are created under the provided base directory. If any of these + Ensures that the necessary subdirectories for 'train', 'test', and + 'val' are created under the provided base directory. 
If any of these subdirectories already exist and are not empty, an error is raised. Directory structure created: @@ -298,15 +291,14 @@ def _setup_directory( If any of the 'processed' directories already contain files. """ data_dir = Path(data_dir) # by default, convert to Path object - for tile_type in ['train_tiles', 'test_tiles', 'val_tiles']: - for stage in ['raw', 'processed']: + for tile_type in ["train_tiles", "test_tiles", "val_tiles"]: + for stage in ["raw", "processed"]: tile_dir = data_dir / tile_type / stage tile_dir.mkdir(parents=True, exist_ok=True) if os.listdir(tile_dir): msg = f"Directory '{tile_dir}' must be empty." raise AssertionError(msg) - def set_transcript_embedding(self, weights: pd.DataFrame): """ Sets the transcript embedding for the sample. @@ -319,33 +311,32 @@ def set_transcript_embedding(self, weights: pd.DataFrame): Raises ------ ValueError - If the provided weights do not match the number of transcript + If the provided weights do not match the number of transcript features. """ - classes = self._transcripts_metadata['feature_names'] + classes = self._transcripts_metadata["feature_names"] self._transcript_embedding = TranscriptEmbedding(classes, weights) - def save( self, data_dir: os.PathLike, k_bd: int = 3, - dist_bd: float = 15., + dist_bd: float = 15.0, k_tx: int = 3, - dist_tx: float = 5., + dist_tx: float = 5.0, tile_size: Optional[int] = None, tile_width: Optional[float] = None, tile_height: Optional[float] = None, - neg_sampling_ratio: float = 5., - frac: float = 1., + neg_sampling_ratio: float = 5.0, + frac: float = 1.0, val_prob: float = 0.1, test_prob: float = 0.2, ): """ - Saves the tiles of the sample as PyTorch geometric datasets. See + Saves the tiles of the sample as PyTorch geometric datasets. See documentation for 'STTile' for more information on dataset contents. - Note: This function requires either 'tile_size' OR both 'tile_width' and + Note: This function requires either 'tile_size' OR both 'tile_width' and 'tile_height' to be provided. Parameters @@ -361,7 +352,7 @@ def save( dist_tx : float, optional, default 5.0 Maximum distance for transcript neighbors. tile_size : int, optional - If provided, specifies the size of the tile. Overrides `tile_width` + If provided, specifies the size of the tile. Overrides `tile_width` and `tile_height`. tile_width : int, optional Width of the tiles in pixels. Ignored if `tile_size` is provided. @@ -379,7 +370,7 @@ def save( Raises ------ ValueError - If the 'frac' parameter is greater than 1.0 or if the calculated + If the 'frac' parameter is greater than 1.0 or if the calculated number of tiles is zero. AssertionError If the specified directory structure is not properly set up. 
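# [Editor's example - not part of the patch] A sketch of calling save() with the
# parameters documented above. `sample` is assumed to be an already-constructed
# STSampleParquet pointing at a run; the path and tile sizes are hypothetical.
sample.save(
    data_dir="path/to/segger_data",
    k_bd=3,
    dist_bd=15.0,      # max distance for boundary neighbors
    k_tx=3,
    dist_tx=5.0,       # max distance for transcript neighbors
    tile_width=200,    # either tile_width + tile_height or tile_size is required
    tile_height=200,
    val_prob=0.1,
    test_prob=0.2,
)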
@@ -412,7 +403,7 @@ def func(region): for tile in tiles: # Choose training, test, or validation datasets data_type = np.random.choice( - a=['train_tiles', 'test_tiles', 'val_tiles'], + a=["train_tiles", "test_tiles", "val_tiles"], p=[1 - (test_prob + val_prob), test_prob, val_prob], ) xt = STTile(dataset=xm, extents=tile) @@ -425,9 +416,9 @@ def func(region): ) if pyg_data is not None: if pyg_data["tx", "belongs", "bd"].edge_index.numel() == 0: - # this tile is only for testing - data_type = 'test_tiles' - filepath = data_dir / data_type / 'processed' / f'{xt.uid}.pt' + # this tile is only for testing + data_type = "test_tiles" + filepath = data_dir / data_type / "processed" / f"{xt.uid}.pt" torch.save(pyg_data, filepath) # TODO: Add Dask backend @@ -436,12 +427,12 @@ def func(region): # TODO: Add documentation for settings -class STInMemoryDataset(): +class STInMemoryDataset: """ A class for handling in-memory representations of ST data. This class is used to load and manage ST sample data from parquet files, - filter boundaries and transcripts, and provide spatial tiling for further + filter boundaries and transcripts, and provide spatial tiling for further analysis. The class also pre-loads KDTrees for efficient spatial queries. Parameters @@ -467,7 +458,7 @@ class STInMemoryDataset(): The filtered boundaries within the dataset extents. kdtree_tx : KDTree The KDTree for fast spatial queries on the transcripts. - + Raises ------ ValueError @@ -482,7 +473,7 @@ def __init__( ): """ Initializes the STInMemoryDataset instance by loading transcripts - and boundaries from parquet files and pre-loading a KDTree for fast + and boundaries from parquet files and pre-loading a KDTree for fast spatial queries. Parameters @@ -505,11 +496,7 @@ def __init__( self._load_boundaries(self.sample._boundaries_filepath) # Pre-load KDTrees - self.kdtree_tx = KDTree( - self.transcripts[self.settings.transcripts.xy], - leafsize=100 - ) - + self.kdtree_tx = KDTree(self.transcripts[self.settings.transcripts.xy], leafsize=100) def _load_transcripts(self, path: os.PathLike, min_qv: float = 30.0): """ @@ -528,7 +515,7 @@ def _load_transcripts(self, path: os.PathLike, min_qv: float = 30.0): If the transcripts dataframe cannot be loaded or filtered. """ # Load and filter transcripts dataframe - bounds = self.extents.buffer(self.margin, join_style='mitre') + bounds = self.extents.buffer(self.margin, join_style="mitre") transcripts = utils.read_parquet_region( path, x=self.settings.transcripts.x, @@ -542,11 +529,10 @@ def _load_transcripts(self, path: os.PathLike, min_qv: float = 30.0): self.settings.transcripts.filter_substrings, min_qv, ) - + # Only set object properties once everything finishes successfully self.transcripts = transcripts - def _load_boundaries(self, path: os.PathLike): """ Loads and filters the boundaries dataframe for the dataset. @@ -562,7 +548,7 @@ def _load_boundaries(self, path: os.PathLike): If the boundaries dataframe cannot be loaded or filtered. 
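# [Editor's example - not part of the patch] The grid construction used by
# _get_rectangular_tile_bounds above, worked for a 250 x 100 extent with 100-unit
# tiles; the last column is narrower so the extent is still fully covered.
import numpy as np
import shapely

x_min, y_min, x_max, y_max = 0.0, 0.0, 250.0, 100.0
x_coords = np.append(np.arange(x_min, x_max, 100.0), x_max)   # [0, 100, 200, 250]
y_coords = np.append(np.arange(y_min, y_max, 100.0), y_max)   # [0, 100]
tiles = [
    shapely.box(x0, y0, x1, y1)
    for x0, x1 in zip(x_coords[:-1], x_coords[1:])
    for y0, y1 in zip(y_coords[:-1], y_coords[1:])
]
# 3 tiles: x spans (0-100), (100-200), (200-250), each covering the full height.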
""" # Load and filter boundaries dataframe - outset = self.extents.buffer(self.margin, join_style='mitre') + outset = self.extents.buffer(self.margin, join_style="mitre") boundaries = utils.read_parquet_region( path, x=self.settings.boundaries.x, @@ -580,7 +566,6 @@ def _load_boundaries(self, path: os.PathLike): ) self.boundaries = boundaries - def _get_rectangular_tile_bounds( self, tile_width: float, @@ -607,7 +592,7 @@ def _get_rectangular_tile_bounds( x_coords = np.append(x_coords, x_max) y_coords = np.arange(y_min, y_max, tile_height) y_coords = np.append(y_coords, y_max) - + # Generate tiles from grid points tiles = [] for x_min, x_max in zip(x_coords[:-1], x_coords[1:]): @@ -616,7 +601,6 @@ def _get_rectangular_tile_bounds( return tiles - def _get_balanced_tile_bounds( self, max_size: Optional[int], @@ -657,14 +641,14 @@ def recurse(node, bounds): bounds = Rectangle(self.kdtree_tx.mins, self.kdtree_tx.maxes) return recurse(node, bounds) - - def _tile(self, + def _tile( + self, width: Optional[float] = None, height: Optional[float] = None, max_size: Optional[int] = None, - ) -> List[shapely.Polygon]: + ) -> List[shapely.Polygon]: """ - Generates tiles based on either fixed dimensions or balanced + Generates tiles based on either fixed dimensions or balanced partitioning. Parameters @@ -674,7 +658,7 @@ def _tile(self, height : Optional[float] The height of each tile. Required if `max_size` is not provided. max_size : Optional[int] - The maximum number of points in each tile. Required if `width` and + The maximum number of points in each tile. Required if `width` and `height` are not provided. Returns @@ -685,7 +669,7 @@ def _tile(self, Raises ------ ValueError - If both `width`/`height` and `max_size` are provided or none are + If both `width`/`height` and `max_size` are provided or none are provided. """ # Square tiling kwargs provided @@ -697,11 +681,8 @@ def _tile(self, # Bad set of kwargs else: args = list(compress(locals().keys(), locals().values())) - args.remove('self') - msg = ( - "Function requires either 'max_size' or both " - f"'width' and 'height'. Found: {', '.join(args)}." - ) + args.remove("self") + msg = "Function requires either 'max_size' or both " f"'width' and 'height'. Found: {', '.join(args)}." logging.error(msg) raise ValueError @@ -740,9 +721,9 @@ def __init__( Notes ----- - The `boundaries` and `transcripts` attributes are cached to avoid the - overhead of filtering when tiles are instantiated. This is particularly - useful in multiprocessing settings where generating tiles in parallel + The `boundaries` and `transcripts` attributes are cached to avoid the + overhead of filtering when tiles are instantiated. This is particularly + useful in multiprocessing settings where generating tiles in parallel could lead to high overhead. Internal Attributes @@ -761,22 +742,21 @@ def __init__( self._boundaries = None self._transcripts = None - @property def uid(self) -> str: """ - Generates a unique identifier for the tile based on its extents. This - UID is particularly useful for saving or indexing tiles in distributed + Generates a unique identifier for the tile based on its extents. This + UID is particularly useful for saving or indexing tiles in distributed processing environments. The UID is constructed using the minimum and maximum x and y coordinates - of the tile's bounding box, representing its position and size in the + of the tile's bounding box, representing its position and size in the sample. 
Returns ------- str - A unique identifier string in the format + A unique identifier string in the format 'x=_y=_w=_h=' where: - ``: Minimum x-coordinate of the tile's extents. - ``: Minimum y-coordinate of the tile's extents. @@ -790,52 +770,49 @@ def uid(self) -> str: 'x=100_y=200_w=50_h=50' """ x_min, y_min, x_max, y_max = map(int, self.extents.bounds) - uid = f'tiles_x={x_min}_y={y_min}_w={x_max-x_min}_h={y_max-y_min}' + uid = f"tiles_x={x_min}_y={y_min}_w={x_max-x_min}_h={y_max-y_min}" return uid - @cached_property def boundaries(self) -> pd.DataFrame: """ Returns the filtered boundaries within the tile extents, cached for efficiency. - The boundaries are computed only once and cached. If the boundaries - have not been computed yet, they are computed using + The boundaries are computed only once and cached. If the boundaries + have not been computed yet, they are computed using `get_filtered_boundaries()`. Returns ------- pd.DataFrame - A DataFrame containing the filtered boundaries within the tile + A DataFrame containing the filtered boundaries within the tile extents. """ if self._boundaries is None: self._boundaries = self.get_filtered_boundaries() return self._boundaries - @cached_property def transcripts(self) -> pd.DataFrame: """ Returns the filtered transcripts within the tile extents, cached for efficiency. - The transcripts are computed only once and cached. If the transcripts - have not been computed yet, they are computed using + The transcripts are computed only once and cached. If the transcripts + have not been computed yet, they are computed using `get_filtered_transcripts()`. Returns ------- pd.DataFrame - A DataFrame containing the filtered transcripts within the tile + A DataFrame containing the filtered transcripts within the tile extents. """ if self._transcripts is None: self._transcripts = self.get_filtered_transcripts() return self._transcripts - def get_filtered_boundaries(self) -> pd.DataFrame: """ Filters the boundaries in the sample to include only those within @@ -844,20 +821,19 @@ def get_filtered_boundaries(self) -> pd.DataFrame: Returns ------- pd.DataFrame - A DataFrame containing the filtered boundaries within the tile + A DataFrame containing the filtered boundaries within the tile extents. """ filtered_boundaries = utils.filter_boundaries( boundaries=self.dataset.boundaries, inset=self.extents, - outset=self.extents.buffer(self.margin, join_style='mitre'), + outset=self.extents.buffer(self.margin, join_style="mitre"), x=self.settings.boundaries.x, y=self.settings.boundaries.y, label=self.settings.boundaries.label, ) return filtered_boundaries - def get_filtered_transcripts(self) -> pd.DataFrame: """ Filters the transcripts in the sample to include only those within @@ -866,13 +842,13 @@ def get_filtered_transcripts(self) -> pd.DataFrame: Returns ------- pd.DataFrame - A DataFrame containing the filtered transcripts within the tile + A DataFrame containing the filtered transcripts within the tile extents. """ # Buffer tile bounds to include transcripts around boundary - outset = self.extents.buffer(self.margin, join_style='mitre') - xmin, ymin, xmax, ymax = outset.bounds + outset = self.extents.buffer(self.margin, join_style="mitre") + xmin, ymin, xmax, ymax = outset.bounds # Get transcripts inside buffered region x, y = self.settings.transcripts.xy @@ -882,7 +858,6 @@ def get_filtered_transcripts(self) -> pd.DataFrame: return filtered_transcripts - def get_transcript_props(self) -> torch.Tensor: """ Encodes transcript features in a sparse format. 
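# [Editor's example - not part of the patch] The uid property defined earlier in
# this class builds its identifier from the tile bounds; for bounds (100, 200, 150, 250):
x_min, y_min, x_max, y_max = 100, 200, 150, 250
uid = f"tiles_x={x_min}_y={y_min}_w={x_max - x_min}_h={y_max - y_min}"
# -> "tiles_x=100_y=200_w=50_h=50"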
@@ -894,9 +869,9 @@ def get_transcript_props(self) -> torch.Tensor: Notes ----- - The intention is for this function to simplify testing new strategies + The intention is for this function to simplify testing new strategies for 'tx' node representations. For example, the encoder can be any type - of encoder that transforms the transcript labels into a numerical + of encoder that transforms the transcript labels into a numerical matrix (in sparse format). """ # Encode transcript features in sparse format @@ -906,7 +881,6 @@ def get_transcript_props(self) -> torch.Tensor: return props - @staticmethod def get_polygon_props( polygons: gpd.GeoSeries, @@ -938,18 +912,17 @@ def get_polygon_props( """ props = pd.DataFrame(index=polygons.index, dtype=float) if area: - props['area'] = polygons.area + props["area"] = polygons.area if convexity: - props['convexity'] = polygons.convex_hull.area / polygons.area + props["convexity"] = polygons.convex_hull.area / polygons.area if elongation: rects = polygons.minimum_rotated_rectangle() - props['elongation'] = rects.area / polygons.envelope.area + props["elongation"] = rects.area / polygons.envelope.area if circularity: r = polygons.minimum_bounding_radius() - props["circularity"] = polygons.area / r ** 2 - - return props + props["circularity"] = polygons.area / r**2 + return props @staticmethod def get_kdtree_edge_index( @@ -993,7 +966,6 @@ def get_kdtree_edge_index( return edge_index - def get_boundary_props( self, area: bool = True, @@ -1007,29 +979,29 @@ def get_boundary_props( Parameters ---------- area : bool, optional - If True, compute the area of each boundary polygon (default is + If True, compute the area of each boundary polygon (default is True). convexity : bool, optional - If True, compute the convexity of each boundary polygon (default is + If True, compute the convexity of each boundary polygon (default is True). elongation : bool, optional If True, compute the elongation of each boundary polygon (default is True). circularity : bool, optional - If True, compute the circularity of each boundary polygon (default + If True, compute the circularity of each boundary polygon (default is True). Returns ------- torch.Tensor - A tensor containing the computed properties for each boundary + A tensor containing the computed properties for each boundary polygon. Notes ----- - The intention is for this function to simplify testing new strategies + The intention is for this function to simplify testing new strategies for 'bd' node representations. You can just change the function body to - return another torch.Tensor without worrying about changes to the rest + return another torch.Tensor without worrying about changes to the rest of the code. """ # Get polygons from coordinates @@ -1045,10 +1017,9 @@ def get_boundary_props( return props - def to_pyg_dataset( self, - #train: bool, + # train: bool, neg_sampling_ratio: float = 5, k_bd: int = 3, dist_bd: float = 15, @@ -1066,7 +1037,7 @@ def to_pyg_dataset( Parameters ---------- train: bool - Whether a sample is part of the training dataset. If True, add + Whether a sample is part of the training dataset. If True, add negative edges to dataset. k_bd : int, optional The number of nearest neighbors for the 'bd' nodes (default is 4). @@ -1142,7 +1113,7 @@ def to_pyg_dataset( Edge indices in COO format between transcripts and boundaries 3. ("tx", "neighbors", "tx") - Represents the relationship where a transcript is nearby another + Represents the relationship where a transcript is nearby another transcript. 
Attributes @@ -1154,15 +1125,15 @@ def to_pyg_dataset( pyg_data = HeteroData() # Set up Transcript nodes - pyg_data['tx'].id = torch.tensor( + pyg_data["tx"].id = torch.tensor( self.transcripts[self.settings.transcripts.id].values.astype(int), dtype=torch.int, ) - pyg_data['tx'].pos = torch.tensor( + pyg_data["tx"].pos = torch.tensor( self.transcripts[self.settings.transcripts.xyz].values, dtype=torch.float32, ) - pyg_data['tx'].x = self.get_transcript_props() + pyg_data["tx"].x = self.get_transcript_props() # Set up Transcript-Transcript neighbor edges nbrs_edge_idx = self.get_kdtree_edge_index( @@ -1187,11 +1158,9 @@ def to_pyg_dataset( self.settings.boundaries.label, ) centroids = polygons.centroid.get_coordinates() - pyg_data['bd'].id = polygons.index.to_numpy() - pyg_data['bd'].pos = torch.tensor(centroids.values, dtype=torch.float32) - pyg_data['bd'].x = self.get_boundary_props( - area, convexity, elongation, circularity - ) + pyg_data["bd"].id = polygons.index.to_numpy() + pyg_data["bd"].pos = torch.tensor(centroids.values, dtype=torch.float32) + pyg_data["bd"].x = self.get_boundary_props(area, convexity, elongation, circularity) # Set up Boundary-Transcript neighbor edges dist = np.sqrt(polygons.area.max()) * 10 # heuristic distance @@ -1208,16 +1177,14 @@ def to_pyg_dataset( logging.warning(f"No tx-neighbors-bd edges found in tile {self.uid}.") pyg_data["tx", "belongs", "bd"].edge_index = torch.tensor([], dtype=torch.long) return pyg_data - + # Now we identify and split the tx-belongs-bd edges - edge_type = ('tx', 'belongs', 'bd') + edge_type = ("tx", "belongs", "bd") # Find nuclear transcripts tx_cell_ids = self.transcripts[self.settings.boundaries.id] cell_ids_map = {idx: i for (i, idx) in enumerate(polygons.index)} - is_nuclear = self.transcripts[ - self.settings.transcripts.nuclear - ].astype(bool) + is_nuclear = self.transcripts[self.settings.transcripts.nuclear].astype(bool) is_nuclear &= tx_cell_ids.isin(polygons.index) # Set up overlap edges @@ -1242,11 +1209,10 @@ def to_pyg_dataset( ) pyg_data, _, _ = transform(pyg_data) - # Refilter negative edges to include only transcripts in the + # Refilter negative edges to include only transcripts in the # original positive edges (still need a memory-efficient solution) edges = pyg_data[edge_type] - mask = edges.edge_label_index[0].unsqueeze(1) == \ - edges.edge_index[0].unsqueeze(0) + mask = edges.edge_label_index[0].unsqueeze(1) == edges.edge_index[0].unsqueeze(0) mask = torch.nonzero(torch.any(mask, 1)).squeeze() edges.edge_label_index = edges.edge_label_index[:, mask] edges.edge_label = edges.edge_label[mask] diff --git a/src/segger/data/parquet/transcript_embedding.py b/src/segger/data/parquet/transcript_embedding.py index 2f8085c..8abeebc 100644 --- a/src/segger/data/parquet/transcript_embedding.py +++ b/src/segger/data/parquet/transcript_embedding.py @@ -6,14 +6,15 @@ from numpy.typing import ArrayLike import pandas as pd + # TODO: Add documentation class TranscriptEmbedding(torch.nn.Module): - ''' + """ Utility class to handle transcript embeddings in PyTorch so that they are optionally learnable in the future. - + Default behavior is to use the index of gene names. - ''' + """ # TODO: Add documentation @staticmethod @@ -23,26 +24,17 @@ def _check_inputs( ): # Classes is a 1D array if len(classes.shape) > 1: - msg = ( - "'classes' should be a 1D array, got an array of shape " - f"{classes.shape} instead." - ) + msg = "'classes' should be a 1D array, got an array of shape " f"{classes.shape} instead." 
raise ValueError(msg) # Items appear exactly once if len(classes) != len(set(classes)): - msg = ( - "All embedding classes must be unique. One or more items in " - "'classes' appears twice." - ) + msg = "All embedding classes must be unique. One or more items in " "'classes' appears twice." raise ValueError(msg) # All classes have an entry in weights elif weights is not None: missing = set(classes).difference(weights.index) if len(missing) > 0: - msg = ( - f"Index of 'weights' DataFrame is missing {len(missing)} " - "entries compared to classes." - ) + msg = f"Index of 'weights' DataFrame is missing {len(missing)} " "entries compared to classes." raise ValueError(msg) # TODO: Add documentation @@ -66,6 +58,6 @@ def embed(self, classes: ArrayLike): indices = LongTensor(self._encoder.transform(classes)) # Default, one-hot encoding if self._weights is None: - return indices #F.one_hot(indices, len(self._encoder.classes_)) + return indices # F.one_hot(indices, len(self._encoder.classes_)) else: return F.embedding(indices, self._weights) diff --git a/src/segger/data/utils.py b/src/segger/data/utils.py index 3abd5b1..b673a87 100644 --- a/src/segger/data/utils.py +++ b/src/segger/data/utils.py @@ -5,6 +5,7 @@ def try_import(module_name): except ImportError: print(f"Warning: {module_name} is not installed. Please install it to use this functionality.") + # Standard imports import pandas as pd import numpy as np @@ -20,6 +21,7 @@ def try_import(module_name): from torch_geometric.nn import radius_graph import os from scipy.spatial import cKDTree + # import hnswlib from shapely.geometry import Polygon from shapely.affinity import scale @@ -28,10 +30,10 @@ def try_import(module_name): import sys # Attempt to import specific modules with try_import function -try_import('multiprocessing') -try_import('joblib') -try_import('faiss') -try_import('cuvs') +try_import("multiprocessing") +try_import("joblib") +try_import("faiss") +try_import("cuvs") try: import cupy as cp from cuvs.neighbors import cagra @@ -42,8 +44,6 @@ def try_import(module_name): from datetime import timedelta - - def filter_transcripts( transcripts_df: pd.DataFrame, min_qv: float = 20.0, @@ -64,7 +64,7 @@ def filter_transcripts( "NegControlCodeword_", "BLANK_", "DeprecatedCodeword_", - "UnassignedCodeword_" + "UnassignedCodeword_", ) mask = transcripts_df["qv"].ge(min_qv) mask &= ~transcripts_df["feature_name"].str.startswith(filter_codewords) @@ -72,9 +72,7 @@ def filter_transcripts( def compute_transcript_metrics( - df: pd.DataFrame, - qv_threshold: float = 30, - cell_id_col: str = 'cell_id' + df: pd.DataFrame, qv_threshold: float = 30, cell_id_col: str = "cell_id" ) -> Dict[str, Any]: """ Computes various metrics for a given dataframe of transcript data filtered by quality value threshold. @@ -92,44 +90,48 @@ def compute_transcript_metrics( - 'percent_non_assigned_cytoplasmic' (float): The percentage of non-assigned cytoplasmic transcripts. - 'gene_metrics' (pd.DataFrame): A dataframe containing gene-level metrics. 
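# [Editor's example - not part of the patch] A toy call to compute_transcript_metrics
# with the Xenium-style columns it expects; values are made up.
import pandas as pd

toy = pd.DataFrame({
    "qv": [40.0, 35.0, 32.0, 31.0],
    "cell_id": [1, 1, -1, 2],
    "overlaps_nucleus": [1, 0, 0, 1],
    "feature_name": ["GeneA", "GeneA", "GeneB", "GeneB"],
})
metrics = compute_transcript_metrics(toy, qv_threshold=30, cell_id_col="cell_id")
# metrics["percent_assigned"] etc. are floats; metrics["gene_metrics"] is a per-gene DataFrame.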
""" - df_filtered = df[df['qv'] > qv_threshold] + df_filtered = df[df["qv"] > qv_threshold] total_transcripts = len(df_filtered) assigned_transcripts = df_filtered[df_filtered[cell_id_col] != -1] - percent_assigned = len(assigned_transcripts) / (total_transcripts+1) * 100 - cytoplasmic_transcripts = assigned_transcripts[assigned_transcripts['overlaps_nucleus'] != 1] - percent_cytoplasmic = len(cytoplasmic_transcripts) / (len(assigned_transcripts) + 1)* 100 + percent_assigned = len(assigned_transcripts) / (total_transcripts + 1) * 100 + cytoplasmic_transcripts = assigned_transcripts[assigned_transcripts["overlaps_nucleus"] != 1] + percent_cytoplasmic = len(cytoplasmic_transcripts) / (len(assigned_transcripts) + 1) * 100 percent_nucleus = 100 - percent_cytoplasmic non_assigned_transcripts = df_filtered[df_filtered[cell_id_col] == -1] - non_assigned_cytoplasmic = non_assigned_transcripts[non_assigned_transcripts['overlaps_nucleus'] != 1] - percent_non_assigned_cytoplasmic = len(non_assigned_cytoplasmic) / (len(non_assigned_transcripts)+1) * 100 - gene_group_assigned = assigned_transcripts.groupby('feature_name') - gene_group_all = df_filtered.groupby('feature_name') - gene_percent_assigned = (gene_group_assigned.size() / (gene_group_all.size()+1) * 100).reset_index(names='percent_assigned') - cytoplasmic_gene_group = cytoplasmic_transcripts.groupby('feature_name') - gene_percent_cytoplasmic = (cytoplasmic_gene_group.size() / (len(cytoplasmic_transcripts)+1) * 100).reset_index(name='percent_cytoplasmic') - gene_metrics = pd.merge(gene_percent_assigned, gene_percent_cytoplasmic, on='feature_name', how='outer').fillna(0) + non_assigned_cytoplasmic = non_assigned_transcripts[non_assigned_transcripts["overlaps_nucleus"] != 1] + percent_non_assigned_cytoplasmic = len(non_assigned_cytoplasmic) / (len(non_assigned_transcripts) + 1) * 100 + gene_group_assigned = assigned_transcripts.groupby("feature_name") + gene_group_all = df_filtered.groupby("feature_name") + gene_percent_assigned = (gene_group_assigned.size() / (gene_group_all.size() + 1) * 100).reset_index( + names="percent_assigned" + ) + cytoplasmic_gene_group = cytoplasmic_transcripts.groupby("feature_name") + gene_percent_cytoplasmic = (cytoplasmic_gene_group.size() / (len(cytoplasmic_transcripts) + 1) * 100).reset_index( + name="percent_cytoplasmic" + ) + gene_metrics = pd.merge(gene_percent_assigned, gene_percent_cytoplasmic, on="feature_name", how="outer").fillna(0) results = { - 'percent_assigned': percent_assigned, - 'percent_cytoplasmic': percent_cytoplasmic, - 'percent_nucleus': percent_nucleus, - 'percent_non_assigned_cytoplasmic': percent_non_assigned_cytoplasmic, - 'gene_metrics': gene_metrics + "percent_assigned": percent_assigned, + "percent_cytoplasmic": percent_cytoplasmic, + "percent_nucleus": percent_nucleus, + "percent_non_assigned_cytoplasmic": percent_non_assigned_cytoplasmic, + "gene_metrics": gene_metrics, } return results def create_anndata( - df: pd.DataFrame, - panel_df: Optional[pd.DataFrame] = None, - min_transcripts: int = 5, - cell_id_col: str = 'cell_id', - qv_threshold: float = 30, - min_cell_area: float = 10.0, - max_cell_area: float = 1000.0 + df: pd.DataFrame, + panel_df: Optional[pd.DataFrame] = None, + min_transcripts: int = 5, + cell_id_col: str = "cell_id", + qv_threshold: float = 30, + min_cell_area: float = 10.0, + max_cell_area: float = 1000.0, ) -> ad.AnnData: """ Generates an AnnData object from a dataframe of segmented transcriptomics data. 
- + Parameters: df (pd.DataFrame): The dataframe containing segmented transcriptomics data. panel_df (Optional[pd.DataFrame]): The dataframe containing panel information. @@ -138,24 +140,23 @@ def create_anndata( qv_threshold (float): The quality value threshold for filtering transcripts. min_cell_area (float): The minimum cell area to include a cell. max_cell_area (float): The maximum cell area to include a cell. - + Returns: ad.AnnData: The generated AnnData object containing the transcriptomics data and metadata. """ # df_filtered = filter_transcripts(df, min_qv=qv_threshold) df_filtered = df # metrics = compute_transcript_metrics(df_filtered, qv_threshold, cell_id_col) - df_filtered = df_filtered[df_filtered[cell_id_col].astype(str) != '-1'] - pivot_df = df_filtered.rename(columns={ - cell_id_col: "cell", - "feature_name": "gene" - })[['cell', 'gene']].pivot_table(index='cell', columns='gene', aggfunc='size', fill_value=0) + df_filtered = df_filtered[df_filtered[cell_id_col].astype(str) != "-1"] + pivot_df = df_filtered.rename(columns={cell_id_col: "cell", "feature_name": "gene"})[["cell", "gene"]].pivot_table( + index="cell", columns="gene", aggfunc="size", fill_value=0 + ) pivot_df = pivot_df[pivot_df.sum(axis=1) >= min_transcripts] cell_summary = [] for cell_id, cell_data in df_filtered.groupby(cell_id_col): if len(cell_data) < min_transcripts: continue - cell_convex_hull = ConvexHull(cell_data[['x_location', 'y_location']], qhull_options='QJ') + cell_convex_hull = ConvexHull(cell_data[["x_location", "y_location"]], qhull_options="QJ") cell_area = cell_convex_hull.area if cell_area < min_cell_area or cell_area > max_cell_area: continue @@ -167,47 +168,50 @@ def create_anndata( # nucleus_convex_hull = ConvexHull(nucleus_data[['x_location', 'y_location']]) # else: # nucleus_convex_hull = None - cell_summary.append({ - "cell": cell_id, - "cell_centroid_x": cell_data['x_location'].mean(), - "cell_centroid_y": cell_data['y_location'].mean(), - "cell_area": cell_area, - # "nucleus_centroid_x": nucleus_data['x_location'].mean() if len(nucleus_data) > 0 else cell_data['x_location'].mean(), - # "nucleus_centroid_y": nucleus_data['x_location'].mean() if len(nucleus_data) > 0 else cell_data['x_location'].mean(), - # "nucleus_area": nucleus_convex_hull.area if nucleus_convex_hull else 0, - # "percent_cytoplasmic": len(cell_data[cell_data['overlaps_nucleus'] != 1]) / len(cell_data) * 100, - # "has_nucleus": len(nucleus_data) > 0 - }) + cell_summary.append( + { + "cell": cell_id, + "cell_centroid_x": cell_data["x_location"].mean(), + "cell_centroid_y": cell_data["y_location"].mean(), + "cell_area": cell_area, + # "nucleus_centroid_x": nucleus_data['x_location'].mean() if len(nucleus_data) > 0 else cell_data['x_location'].mean(), + # "nucleus_centroid_y": nucleus_data['x_location'].mean() if len(nucleus_data) > 0 else cell_data['x_location'].mean(), + # "nucleus_area": nucleus_convex_hull.area if nucleus_convex_hull else 0, + # "percent_cytoplasmic": len(cell_data[cell_data['overlaps_nucleus'] != 1]) / len(cell_data) * 100, + # "has_nucleus": len(nucleus_data) > 0 + } + ) cell_summary = pd.DataFrame(cell_summary).set_index("cell") if panel_df is not None: - panel_df = panel_df.sort_values('gene') - genes = panel_df['gene'].values + panel_df = panel_df.sort_values("gene") + genes = panel_df["gene"].values for gene in genes: if gene not in pivot_df: pivot_df[gene] = 0 pivot_df = pivot_df[genes.tolist()] if panel_df is None: - var_df = pd.DataFrame([{ - "gene": i, - "feature_types": 'Gene 
Expression', - 'genome': 'Unknown' - } for i in np.unique(pivot_df.columns.values)]).set_index('gene') + var_df = pd.DataFrame( + [ + {"gene": i, "feature_types": "Gene Expression", "genome": "Unknown"} + for i in np.unique(pivot_df.columns.values) + ] + ).set_index("gene") else: - var_df = panel_df[['gene', 'ensembl']].rename(columns={'ensembl':'gene_ids'}) - var_df['feature_types'] = 'Gene Expression' - var_df['genome'] = 'Unknown' - var_df = var_df.set_index('gene') + var_df = panel_df[["gene", "ensembl"]].rename(columns={"ensembl": "gene_ids"}) + var_df["feature_types"] = "Gene Expression" + var_df["genome"] = "Unknown" + var_df = var_df.set_index("gene") # gene_metrics = metrics['gene_metrics'].set_index('feature_name') # var_df = var_df.join(gene_metrics, how='left').fillna(0) cells = list(set(pivot_df.index) & set(cell_summary.index)) - pivot_df = pivot_df.loc[cells,:] - cell_summary = cell_summary.loc[cells,:] + pivot_df = pivot_df.loc[cells, :] + cell_summary = cell_summary.loc[cells, :] adata = ad.AnnData(pivot_df.values) adata.var = var_df - adata.obs['transcripts'] = pivot_df.sum(axis=1).values - adata.obs['unique_transcripts'] = (pivot_df > 0).sum(axis=1).values + adata.obs["transcripts"] = pivot_df.sum(axis=1).values + adata.obs["unique_transcripts"] = (pivot_df > 0).sum(axis=1).values adata.obs_names = pivot_df.index.values.tolist() - adata.obs = pd.merge(adata.obs, cell_summary.loc[adata.obs_names,:], left_index=True, right_index=True) + adata.obs = pd.merge(adata.obs, cell_summary.loc[adata.obs_names, :], left_index=True, right_index=True) # adata.uns['metrics'] = { # 'percent_assigned': metrics['percent_assigned'], # 'percent_cytoplasmic': metrics['percent_cytoplasmic'], @@ -216,10 +220,9 @@ def create_anndata( # } return adata - def calculate_gene_celltype_abundance_embedding(adata: ad.AnnData, celltype_column: str) -> pd.DataFrame: - """Calculate the cell type abundance embedding for each gene based on the percentage of cells in each cell type + """Calculate the cell type abundance embedding for each gene based on the percentage of cells in each cell type that express the gene (non-zero expression). Parameters: @@ -227,9 +230,9 @@ def calculate_gene_celltype_abundance_embedding(adata: ad.AnnData, celltype_colu celltype_column (str): The column name in `adata.obs` that contains the cell type information. Returns: - pd.DataFrame: A DataFrame where rows are genes and columns are cell types, with each value representing + pd.DataFrame: A DataFrame where rows are genes and columns are cell types, with each value representing the percentage of cells in that cell type expressing the gene. - + Example: >>> adata = AnnData(...) 
# Load your scRNA-seq AnnData object >>> celltype_column = 'celltype_major' @@ -255,13 +258,21 @@ def calculate_gene_celltype_abundance_embedding(adata: ad.AnnData, celltype_colu abundance = gene_expression_df[cell_type_mask].mean(axis=0) * 100 cell_type_abundance_list.append(abundance) # Create a DataFrame for the cell type abundance with gene names as rows and cell types as columns - cell_type_abundance_df = pd.DataFrame(cell_type_abundance_list, - columns=adata.var_names, - index=encoder.categories_[0]).T + cell_type_abundance_df = pd.DataFrame( + cell_type_abundance_list, columns=adata.var_names, index=encoder.categories_[0] + ).T return cell_type_abundance_df -def get_edge_index(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: int = 10, method: str = 'kd_tree', - gpu: bool = False, workers: int = 1) -> torch.Tensor: + +def get_edge_index( + coords_1: np.ndarray, + coords_2: np.ndarray, + k: int = 5, + dist: int = 10, + method: str = "kd_tree", + gpu: bool = False, + workers: int = 1, +) -> torch.Tensor: """ Computes edge indices using various methods (KD-Tree, FAISS, RAPIDS::cuvs+cupy (cuda)). @@ -276,23 +287,21 @@ def get_edge_index(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: Returns: torch.Tensor: Edge indices. """ - if method == 'kd_tree': + if method == "kd_tree": return get_edge_index_kdtree(coords_1, coords_2, k=k, dist=dist, workers=workers) - elif method == 'faiss': + elif method == "faiss": return get_edge_index_faiss(coords_1, coords_2, k=k, dist=dist, gpu=gpu) - elif method == 'cuda': + elif method == "cuda": # pass return get_edge_index_cuda(coords_1, coords_2, k=k, dist=dist) else: - msg = ( - f"Unknown method {method}. Valid methods include: 'kd_tree', " - "'faiss', and 'cuda'." - ) + msg = f"Unknown method {method}. Valid methods include: 'kd_tree', " "'faiss', and 'cuda'." raise ValueError() - -def get_edge_index_kdtree(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: int = 10, workers: int = 1) -> torch.Tensor: +def get_edge_index_kdtree( + coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: int = 10, workers: int = 1 +) -> torch.Tensor: """ Computes edge indices using KDTree. @@ -313,15 +322,15 @@ def get_edge_index_kdtree(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5 for idx, valid in enumerate(valid_mask): valid_indices = idx_out[idx][valid] if valid_indices.size > 0: - edges.append( - np.vstack((np.full(valid_indices.shape, idx), valid_indices)).T - ) + edges.append(np.vstack((np.full(valid_indices.shape, idx), valid_indices)).T) edge_index = torch.tensor(np.vstack(edges), dtype=torch.long).contiguous() return edge_index -def get_edge_index_faiss(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: int = 10, gpu: bool = False) -> torch.Tensor: +def get_edge_index_faiss( + coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: int = 10, gpu: bool = False +) -> torch.Tensor: """ Computes edge indices using FAISS. 
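All three `get_edge_index_*` back ends return the same structure: a COO list of `(query, neighbor)` index pairs, keeping at most `k` neighbors per query point and only neighbors closer than `dist`. A minimal sketch of the KD-tree variant on toy coordinates (array sizes are placeholders; `k=5` and `dist=10` mirror the defaults above):

```python
import numpy as np
import torch
from scipy.spatial import cKDTree

coords_1 = np.random.rand(100, 2) * 50   # e.g. boundary centroids (toy data)
coords_2 = np.random.rand(500, 2) * 50   # e.g. transcript locations (toy data)

tree = cKDTree(coords_1)
dists, idx = tree.query(coords_2, k=5, distance_upper_bound=10)

edges = []
for i, (d_row, i_row) in enumerate(zip(dists, idx)):
    valid = i_row[d_row < 10]            # inf distances mark missing neighbors
    if valid.size > 0:
        edges.append(np.stack([np.full(valid.shape, i), valid], axis=1))

edge_index = torch.tensor(np.vstack(edges), dtype=torch.long)  # shape (E, 2)
```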
@@ -344,30 +353,28 @@ def get_edge_index_faiss(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, else: index = faiss.IndexFlatL2(d) - index.add(coords_1.astype('float32')) - D, I = index.search(coords_2.astype('float32'), k) + index.add(coords_1.astype("float32")) + D, I = index.search(coords_2.astype("float32"), k) - valid_mask = D < dist ** 2 + valid_mask = D < dist**2 edges = [] for idx, valid in enumerate(valid_mask): valid_indices = I[idx][valid] if valid_indices.size > 0: - edges.append( - np.vstack((np.full(valid_indices.shape, idx), valid_indices)).T - ) + edges.append(np.vstack((np.full(valid_indices.shape, idx), valid_indices)).T) edge_index = torch.tensor(np.vstack(edges), dtype=torch.long).contiguous() return edge_index def get_edge_index_cuda( - coords_1: torch.Tensor, - coords_2: torch.Tensor, - k: int = 10, + coords_1: torch.Tensor, + coords_2: torch.Tensor, + k: int = 10, dist: float = 10.0, metric: str = "sqeuclidean", - nn_descent_niter: int = 100 + nn_descent_niter: int = 100, ) -> torch.Tensor: """ Computes edge indices using RAPIDS cuVS with cagra for vector similarity search, @@ -382,11 +389,14 @@ def get_edge_index_cuda( Returns: torch.Tensor: Edge indices as a PyTorch tensor on CUDA. """ + def cupy_to_torch(cupy_array): return torch.from_dlpack((cupy_array.toDlpack())) + # gg def torch_to_cupy(tensor): return cp.fromDlpack(dlpack.to_dlpack(tensor)) + # Convert PyTorch tensors (CUDA) to CuPy arrays using DLPack cp_coords_1 = torch_to_cupy(coords_1).astype(cp.float32) cp_coords_2 = torch_to_cupy(coords_2).astype(cp.float32) @@ -394,14 +404,16 @@ def torch_to_cupy(tensor): cp_dist = cp.float32(dist) # IndexParams and SearchParams for cagra # compression_params = cagra.CompressionParams(pq_bits=pq_bits) - index_params = cagra.IndexParams(metric=metric,nn_descent_niter=nn_descent_niter) #, compression=compression_params) + index_params = cagra.IndexParams( + metric=metric, nn_descent_niter=nn_descent_niter + ) # , compression=compression_params) search_params = cagra.SearchParams() # Build index using CuPy coords index = cagra.build_index(index_params, cp_coords_1) # Perform search to get distances and indices (still in CuPy) D, I = cagra.search(search_params, index, cp_coords_2, k) # Boolean mask for filtering distances below the squared threshold (all in CuPy) - valid_mask = cp.asarray(D < cp_dist ** 2) + valid_mask = cp.asarray(D < cp_dist**2) # Vectorized operations for row and valid indices (all in CuPy) repeats = valid_mask.sum(axis=1).tolist() row_indices = cp.repeat(cp.arange(len(cp_coords_2)), repeats) @@ -412,6 +424,7 @@ def torch_to_cupy(tensor): edge_index = cupy_to_torch(edges).long().contiguous() return edge_index + class SpatialTranscriptomicsDataset(InMemoryDataset): """A dataset class for handling SpatialTranscriptomics spatial transcriptomics data. @@ -421,7 +434,10 @@ class SpatialTranscriptomicsDataset(InMemoryDataset): pre_transform (callable): A function/transform that takes in a Data object and returns a transformed version. pre_filter (callable): A function that takes in a Data object and returns a boolean indicating whether to keep it. """ - def __init__(self, root: str, transform: Callable = None, pre_transform: Callable = None, pre_filter: Callable = None): + + def __init__( + self, root: str, transform: Callable = None, pre_transform: Callable = None, pre_filter: Callable = None + ): """Initialize the SpatialTranscriptomicsDataset. 
Args: @@ -448,16 +464,14 @@ def processed_file_names(self) -> List[str]: Returns: List[str]: List of processed file names. """ - return [x for x in os.listdir(self.processed_dir) if 'tiles' in x] + return [x for x in os.listdir(self.processed_dir) if "tiles" in x] def download(self) -> None: - """Download the raw data. This method should be overridden if you need to download the data. - """ + """Download the raw data. This method should be overridden if you need to download the data.""" pass def process(self) -> None: - """Process the raw data and save it to the processed directory. This method should be overridden if you need to process the data. - """ + """Process the raw data and save it to the processed directory. This method should be overridden if you need to process the data.""" pass def len(self) -> int: @@ -478,7 +492,7 @@ def get(self, idx: int) -> Data: Data: The processed data object. """ data = torch.load(os.path.join(self.processed_dir, self.processed_file_names[idx])) - data['tx'].x = data['tx'].x.to_dense() + data["tx"].x = data["tx"].x.to_dense() return data @@ -531,8 +545,7 @@ def coo_to_dense_adj( # Check COO format if not edge_index.shape[0] == 2: msg = ( - "Edge index is not in COO format. First dimension should have " - f"size 2, but found {edge_index.shape[0]}." + "Edge index is not in COO format. First dimension should have " f"size 2, but found {edge_index.shape[0]}." ) raise ValueError(msg) @@ -547,39 +560,23 @@ def coo_to_dense_adj( # Fill matrix with neighbors nbr_idx = torch.full((num_nodes, num_nbrs), -1) for i, nbrs in zip(uniques, torch.split(edge_index[1], counts)): - nbr_idx[i, :len(nbrs)] = nbrs + nbr_idx[i, : len(nbrs)] = nbrs return nbr_idx - - - def format_time(elapsed: float) -> str: """ Format elapsed time to h:m:s. - + Parameters: ---------- elapsed : float Elapsed time in seconds. - + Returns: ------- str Formatted time in h:m:s. """ return str(timedelta(seconds=int(elapsed))) - - - - - - - - - - - - - diff --git a/src/segger/models/README.md b/src/segger/models/README.md index 1f872b3..033e545 100644 --- a/src/segger/models/README.md +++ b/src/segger/models/README.md @@ -1,4 +1,3 @@ - # segger: Graph Neural Network Model The `segger` model is a graph neural network designed to handle heterogeneous graphs with two primary node types: **transcripts** and **nuclei or cell boundaries**. It leverages attention-based convolutional layers to compute node embeddings and relationships in spatial transcriptomics data. The architecture includes an initial embedding layer for node feature transformation, multiple graph attention layers (GATv2Conv), and residual linear connections. 
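A minimal sketch of the layer pattern this paragraph describes, GATv2 attention plus a residual linear projection, with placeholder dimensions; it is an illustration of the idea, not the `Segger` implementation defined later in this patch:

```python
import torch
from torch_geometric.nn import GATv2Conv

class AttentionBlock(torch.nn.Module):
    """One attention layer with a residual linear connection (illustrative only)."""

    def __init__(self, in_dim: int, out_dim: int, heads: int = 3):
        super().__init__()
        self.conv = GATv2Conv(in_dim, out_dim, heads=heads, add_self_loops=False)
        self.lin = torch.nn.Linear(in_dim, out_dim * heads)  # residual path

    def forward(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        # GATv2 output is (N, heads * out_dim) with concatenated heads
        return torch.relu(self.conv(x, edge_index) + self.lin(x))
```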
@@ -32,7 +31,8 @@ The `segger` model is a graph neural network designed to handle heterogeneous gr $$ where: - - \( \alpha_{ij} \) is the attention coefficient between node \( i \) and node \( j \), computed as: + + - \( \alpha\_{ij} \) is the attention coefficient between node \( i \) and node \( j \), computed as: $$ \alpha_{ij} = \frac{\exp\left( \text{LeakyReLU}\left( \mathbf{a}^{\top} [\mathbf{W}^{(l)} \mathbf{h}_{i}^{(l)} || \mathbf{W}^{(l)} \mathbf{h}_{j}^{(l)}] \right)\right)}{\sum_{k \in \mathcal{N}(i)} \exp\left( \text{LeakyReLU}\left( \mathbf{a}^{\top} [\mathbf{W}^{(l)} \mathbf{h}_{i}^{(l)} || \mathbf{W}^{(l)} \mathbf{h}_{k}^{(l)}] \right)\right)} @@ -47,7 +47,7 @@ The `segger` model is a graph neural network designed to handle heterogeneous gr \mathbf{h}_{i}^{(l+1)} = \text{ReLU}\left( \mathbf{h}_{i}^{(l+1)} + \mathbf{W}_{res} \mathbf{h}_{i}^{(l)} \right) $$ - where \( \mathbf{W}_{res} \) is a residual weight matrix. + where \( \mathbf{W}\_{res} \) is a residual weight matrix. 4. **L2 Normalization**: Finally, the embeddings are normalized using L2 normalization: @@ -62,23 +62,21 @@ The `segger` model is a graph neural network designed to handle heterogeneous gr In the next step, the `segger` model is transformed into a **heterogeneous graph neural network** using PyTorch Geometric's `to_hetero` function. This transformation enables the model to handle distinct node and edge types (transcripts and nuclei or cell boundaries) with separate mechanisms for modeling their relationships. - ## Usage To instantiate and run the segger model: ```python model = segger( - num_tx_tokens=5000, # Number of unique 'tx' tokens - init_emb=32, # Initial embedding dimension - hidden_channels=64, # Number of hidden channels - num_mid_layers=2, # Number of middle layers - out_channels=128, # Number of output channels - heads=4 # Number of attention heads + num_tx_tokens=5000, # Number of unique 'tx' tokens + init_emb=32, # Initial embedding dimension + hidden_channels=64, # Number of hidden channels + num_mid_layers=2, # Number of middle layers + out_channels=128, # Number of output channels + heads=4, # Number of attention heads ) output = model(x, edge_index) ``` Once transformed to a heterogeneous model and trained using PyTorch Lightning, the model can efficiently learn relationships between transcripts and nuclei or cell boundaries. - diff --git a/src/segger/models/__init__.py b/src/segger/models/__init__.py index 1271af3..0a66407 100644 --- a/src/segger/models/__init__.py +++ b/src/segger/models/__init__.py @@ -4,8 +4,6 @@ Contains the implementation of the Segger model using Graph Neural Networks. """ -__all__ = [ - "Segger" - ] +__all__ = ["Segger"] from .segger_model import * diff --git a/src/segger/models/segger_model.py b/src/segger/models/segger_model.py index d2e13ad..6943dab 100644 --- a/src/segger/models/segger_model.py +++ b/src/segger/models/segger_model.py @@ -3,10 +3,20 @@ from torch.nn import Embedding from torch import Tensor from typing import Union -#from torch_sparse import SparseTensor + +# from torch_sparse import SparseTensor + class Segger(torch.nn.Module): - def __init__(self, num_tx_tokens: int, init_emb: int = 16, hidden_channels: int = 32, num_mid_layers: int = 3, out_channels: int = 32, heads: int = 3): + def __init__( + self, + num_tx_tokens: int, + init_emb: int = 16, + hidden_channels: int = 32, + num_mid_layers: int = 3, + out_channels: int = 32, + heads: int = 3, + ): """ Initializes the Segger model. 
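For context on the `to_hetero` conversion mentioned in the README above, a hedged sketch: the node and edge types follow the `belongs` (tx to bd) and `neighbors` (tx to tx) relations used in this repository, while the sizes passed to `Segger` here are arbitrary placeholders.

```python
from torch_geometric.nn import to_hetero

metadata = (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")])
model = Segger(num_tx_tokens=5000, init_emb=8, hidden_channels=32,
               num_mid_layers=2, out_channels=8, heads=2)
hetero_model = to_hetero(model, metadata=metadata, aggr="sum")
# hetero_model(x_dict, edge_index_dict) returns one embedding tensor per node type
```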
@@ -54,27 +64,26 @@ def forward(self, x: Tensor, edge_index: Tensor) -> Tensor: Returns: Tensor: Output node embeddings. """ - x = torch.nan_to_num(x, nan = 0) + x = torch.nan_to_num(x, nan=0) is_one_dim = (x.ndim == 1) * 1 - # x = x[:, None] - x = self.tx_embedding(((x.sum(1) * is_one_dim).int())) * is_one_dim + self.lin0(x.float()) * (1 - is_one_dim) + # x = x[:, None] + x = self.tx_embedding(((x.sum(1) * is_one_dim).int())) * is_one_dim + self.lin0(x.float()) * (1 - is_one_dim) # First layer x = x.relu() - x = self.conv_first(x, edge_index) # + self.lin_first(x) + x = self.conv_first(x, edge_index) # + self.lin_first(x) x = x.relu() # Middle layers if self.num_mid_layers > 0: - for conv_mid in self.conv_mid_layers: - x = conv_mid(x, edge_index) # + lin_mid(x) + for conv_mid in self.conv_mid_layers: + x = conv_mid(x, edge_index) # + lin_mid(x) x = x.relu() # Last layer - x = self.conv_last(x, edge_index) # + self.lin_last(x) + x = self.conv_last(x, edge_index) # + self.lin_last(x) return x - def decode(self, z: Tensor, edge_index: Union[Tensor]) -> Tensor: """ Decode the node embeddings to predict edge values. diff --git a/src/segger/prediction/__init__.py b/src/segger/prediction/__init__.py index abc96d9..f82a9cc 100644 --- a/src/segger/prediction/__init__.py +++ b/src/segger/prediction/__init__.py @@ -4,9 +4,6 @@ Contains prediction scripts and utilities for the Segger model. """ -__all__ = [ - "load_model", - "predict" - ] +__all__ = ["load_model", "predict"] from .predict import load_model, predict diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index cf73116..337a3c1 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -40,8 +40,8 @@ from cupyx.scipy.sparse import find # To find non-zero elements in sparse matrix from scipy.sparse.csgraph import connected_components as cc from scipy.sparse import coo_matrix as scipy_coo_matrix -# Setup Dask cluster with 3 workers +# Setup Dask cluster with 3 workers # CONFIG @@ -57,7 +57,7 @@ def load_model(checkpoint_path: str) -> LitSegger: Parameters ---------- checkpoint_path : str - Specific checkpoint file to load, or directory where the model checkpoints are stored. + Specific checkpoint file to load, or directory where the model checkpoints are stored. If directory, the latest checkpoint is loaded. 
Returns @@ -75,13 +75,15 @@ def load_model(checkpoint_path: str) -> LitSegger: # Get last checkpoint if directory is provided if os.path.isdir(checkpoint_path): - checkpoints = glob.glob(str(checkpoint_path / '*.ckpt')) + checkpoints = glob.glob(str(checkpoint_path / "*.ckpt")) if len(checkpoints) == 0: raise FileNotFoundError(msg) + # Sort checkpoints by epoch and step def sort_order(c): - match = re.match(r'.*epoch=(\d+)-step=(\d+).ckpt', c) + match = re.match(r".*epoch=(\d+)-step=(\d+).ckpt", c) return int(match[1]), int(match[2]) + checkpoint_path = Path(sorted(checkpoints, key=sort_order)[-1]) elif not checkpoint_path.exists(): raise FileExistsError(msg) @@ -94,16 +96,11 @@ def sort_order(c): return lit_segger - def get_similarity_scores( - model: torch.nn.Module, - batch: Batch, - from_type: str, - to_type: str, - receptive_field: dict + model: torch.nn.Module, batch: Batch, from_type: str, to_type: str, receptive_field: dict ) -> coo_matrix: """ - Compute similarity scores between embeddings for 'from_type' and 'to_type' nodes + Compute similarity scores between embeddings for 'from_type' and 'to_type' nodes using sparse matrix multiplication with CuPy and the 'sees' edge relation. Args: @@ -113,7 +110,7 @@ def get_similarity_scores( to_type (str): The type of node to which the similarity is computed. Returns: - coo_matrix: A sparse matrix containing the similarity scores between + coo_matrix: A sparse matrix containing the similarity scores between 'from_type' and 'to_type' nodes. """ # Step 1: Get embeddings from the model @@ -122,21 +119,21 @@ def get_similarity_scores( edge_index = get_edge_index( batch[to_type].pos[:, :2], # 'tx' positions batch[from_type].pos[:, :2], # 'bd' positions - k=receptive_field[f'k_{to_type}'], - dist=receptive_field[f'dist_{to_type}'], - method='cuda' + k=receptive_field[f"k_{to_type}"], + dist=receptive_field[f"dist_{to_type}"], + method="cuda", ) edge_index = coo_to_dense_adj( - edge_index.T, - num_nodes=shape[0], - num_nbrs=receptive_field[f'k_{to_type}'], + edge_index.T, + num_nodes=shape[0], + num_nbrs=receptive_field[f"k_{to_type}"], ) - + with torch.no_grad(): embeddings = model(batch.x_dict, batch.edge_index_dict) del batch - + # print(edge_index) # print(embeddings) @@ -144,19 +141,19 @@ def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: m = torch.nn.ZeroPad2d((0, 0, 0, 1)) # pad bottom with zeros similarity = torch.bmm( - m(embeddings[to_type])[edge_index], # 'to' x 'from' neighbors x embed - embeddings[from_type].unsqueeze(-1) # 'to' x embed x 1 - ) # -> 'to' x 'from' neighbors x 1 + m(embeddings[to_type])[edge_index], # 'to' x 'from' neighbors x embed + embeddings[from_type].unsqueeze(-1), # 'to' x embed x 1 + ) # -> 'to' x 'from' neighbors x 1 del embeddings # Sigmoid to get most similar 'to_type' neighbor similarity[similarity == 0] = -torch.inf # ensure zero stays zero similarity = F.sigmoid(similarity) # Neighbor-filtered similarity scores # shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] - indices = torch.argwhere(edge_index != -1).T + indices = torch.argwhere(edge_index != -1).T indices[1] = edge_index[edge_index != -1] - rows = cp.fromDlpack(to_dlpack(indices[0,:].to('cuda'))) - columns = cp.fromDlpack(to_dlpack(indices[1,:].to('cuda'))) + rows = cp.fromDlpack(to_dlpack(indices[0, :].to("cuda"))) + columns = cp.fromDlpack(to_dlpack(indices[1, :].to("cuda"))) # print(rows) del indices values = similarity[edge_index != -1].flatten() @@ -164,7 +161,6 @@ def sparse_multiply(embeddings, edge_index, shape) 
-> coo_matrix: return sparse_result # Free GPU memory after computation - # Call the sparse multiply function sparse_similarity = sparse_multiply(embeddings, edge_index, shape) gc.collect() @@ -175,38 +171,37 @@ def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: return sparse_similarity - - def predict_batch( lit_segger: torch.nn.Module, batch: Batch, score_cut: float, receptive_field: Dict[str, float], use_cc: bool = True, - knn_method: str = 'cuda' + knn_method: str = "cuda", ) -> pd.DataFrame: """ Predict cell assignments for a batch of transcript data using a segmentation model. - Adds a 'bound' column to indicate if the transcript is assigned to a cell (bound=1) + Adds a 'bound' column to indicate if the transcript is assigned to a cell (bound=1) or unassigned (bound=0). Args: lit_segger (torch.nn.Module): The lightning module wrapping the segmentation model. batch (Batch): A batch of transcript and cell data. score_cut (float): The threshold for assigning transcripts to cells based on similarity scores. - receptive_field (Dict[str, float]): Dictionary defining the receptive field for transcript-cell + receptive_field (Dict[str, float]): Dictionary defining the receptive field for transcript-cell and transcript-transcript relations. - use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. + use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. Defaults to True. knn_method (str, optional): The method to use for nearest neighbors. Defaults to 'cuda'. Returns: - pd.DataFrame: A DataFrame containing the transcript IDs, similarity scores, + pd.DataFrame: A DataFrame containing the transcript IDs, similarity scores, assigned cell IDs, and 'bound' column. 
""" + def _get_id(): """Generate a random Xenium-style ID.""" - return ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), 8)) + '-nx' + return "".join(np.random.choice(list("abcdefghijklmnopqrstuvwxyz"), 8)) + "-nx" # Use CuPy with GPU context with cp.cuda.Device(0): @@ -214,10 +209,10 @@ def _get_id(): batch = batch.to("cuda") # Extract transcript IDs and initialize assignments DataFrame - transcript_id = cp.asnumpy(batch['tx'].id) - assignments = pd.DataFrame({'transcript_id': transcript_id}) + transcript_id = cp.asnumpy(batch["tx"].id) + assignments = pd.DataFrame({"transcript_id": transcript_id}) - if len(batch['bd'].pos) >= 10: + if len(batch["bd"].pos) >= 10: # Compute similarity scores between 'tx' and 'bd' scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd", receptive_field) torch.cuda.empty_cache() @@ -227,48 +222,47 @@ def _get_id(): cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory # Get direct assignments from similarity matrix belongs = cp.max(dense_scores, axis=1) # Max score per transcript - assignments['score'] = cp.asnumpy(belongs) # Move back to CPU + assignments["score"] = cp.asnumpy(belongs) # Move back to CPU - mask = assignments['score'] > score_cut - all_ids = np.concatenate(batch['bd'].id) # Keep IDs as NumPy array - assignments['segger_cell_id'] = None # Initialize as None + mask = assignments["score"] > score_cut + all_ids = np.concatenate(batch["bd"].id) # Keep IDs as NumPy array + assignments["segger_cell_id"] = None # Initialize as None max_indices = cp.argmax(dense_scores, axis=1).get() - assignments['segger_cell_id'][mask] = all_ids[max_indices[mask]] # Assign IDs - + assignments["segger_cell_id"][mask] = all_ids[max_indices[mask]] # Assign IDs + del dense_scores # Remove from memory cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory torch.cuda.empty_cache() -# Move back to CPU - assignments['bound'] = 0 - assignments['bound'][mask] = 1 - - + # Move back to CPU + assignments["bound"] = 0 + assignments["bound"][mask] = 1 + if use_cc: # Compute similarity scores between 'tx' and 'tx' scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx", receptive_field) - # Convert to dense NumPy array - data_cpu = scores_tx.data.get() # Transfer data to CPU (NumPy) - row_cpu = scores_tx.row.get() # Transfer row indices to CPU (NumPy) - col_cpu = scores_tx.col.get() # Transfer column indices to CPU (NumPy) + # Convert to dense NumPy array + data_cpu = scores_tx.data.get() # Transfer data to CPU (NumPy) + row_cpu = scores_tx.row.get() # Transfer row indices to CPU (NumPy) + col_cpu = scores_tx.col.get() # Transfer column indices to CPU (NumPy) # dense_scores_tx = scores_tx.toarray().astype(cp.float16) # Rebuild the matrix on CPU using SciPy dense_scores_tx = scipy_coo_matrix((data_cpu, (row_cpu, col_cpu)), shape=scores_tx.shape).toarray() np.fill_diagonal(dense_scores_tx, 0) # Ignore self-similarity - + del scores_tx # Remove from memory cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory # Assign unassigned transcripts using connected components - no_id = assignments['segger_cell_id'].isna() + no_id = assignments["segger_cell_id"].isna() if np.any(no_id): # Only compute if there are unassigned transcripts no_id_scores = dense_scores_tx[no_id][:, no_id] del dense_scores_tx # Remove from memory no_id_scores[no_id_scores < score_cut] = 0 n, comps = cc(no_id_scores, connection="weak", directed=False) new_ids = np.array([_get_id() for _ in range(n)]) - assignments['segger_cell_id'][no_id] = 
new_ids[comps] + assignments["segger_cell_id"][no_id] = new_ids[comps] # Perform memory cleanup to avoid OOM issues cp.get_default_memory_pool().free_all_blocks() @@ -276,9 +270,6 @@ def _get_id(): return assignments - - - def predict( lit_segger: LitSegger, @@ -286,7 +277,7 @@ def predict( score_cut: float, receptive_field: dict, use_cc: bool = True, - knn_method: str = 'cuda' + knn_method: str = "cuda", ) -> pd.DataFrame: # Change return type to Dask DataFrame if applicable """ Optimized prediction for multiple batches of transcript data. @@ -296,7 +287,7 @@ def predict( for batch in data_loader: assignments = predict_batch(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) all_assignments.append(dd.from_pandas(assignments, npartitions=1)) - + cp.get_default_memory_pool().free_all_blocks() torch.cuda.empty_cache() @@ -304,26 +295,26 @@ def predict( final_assignments = dd.concat(all_assignments, ignore_index=True) # Sort the Dask DataFrame by 'transcript_id' before setting it as an index - final_assignments = final_assignments.sort_values(by='transcript_id') + final_assignments = final_assignments.sort_values(by="transcript_id") # Set a unique index for Dask DataFrame - final_assignments = final_assignments.set_index('transcript_id', sorted=True) + final_assignments = final_assignments.set_index("transcript_id", sorted=True) # Max score selection logic - max_bound_idx = final_assignments[final_assignments['bound'] == 1].groupby('transcript_id')['score'].idxmax() - max_unbound_idx = final_assignments[final_assignments['bound'] == 0].groupby('transcript_id')['score'].idxmax() + max_bound_idx = final_assignments[final_assignments["bound"] == 1].groupby("transcript_id")["score"].idxmax() + max_unbound_idx = final_assignments[final_assignments["bound"] == 0].groupby("transcript_id")["score"].idxmax() # Combine indices, prioritizing bound=1 scores final_idx = max_bound_idx.combine_first(max_unbound_idx).compute() # Ensure it's computed # Now use the computed final_idx for indexing - result = final_assignments.loc[final_idx].compute().reset_index(names=['transcript_id']) - + result = final_assignments.loc[final_idx].compute().reset_index(names=["transcript_id"]) + # result = results.reset_index() # Handle cases where there's only one entry per 'segger_cell_id' # single_entry_mask = result.groupby('segger_cell_id').size() == 1 -# Handle cases where there's only one entry per 'segger_cell_id' + # Handle cases where there's only one entry per 'segger_cell_id' # single_entry_counts = result['segger_cell_id'].value_counts() # Count occurrences of each ID # single_entry_mask = single_entry_counts[single_entry_counts == 1].index # Get IDs with a count of 1 @@ -331,27 +322,26 @@ def predict( # for segger_id in single_entry_mask: # result.loc[result['segger_cell_id'] == segger_id, 'segger_cell_id'] = 'floating' - return result def segment( - model: LitSegger, - dm: SeggerDataModule, - save_dir: Union[str, Path], - seg_tag: str, - transcript_file: Union[str, Path], - score_cut: float = .5, + model: LitSegger, + dm: SeggerDataModule, + save_dir: Union[str, Path], + seg_tag: str, + transcript_file: Union[str, Path], + score_cut: float = 0.5, use_cc: bool = True, - file_format: str = 'anndata', - receptive_field: dict = {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}, - knn_method: str = 'kd_tree', + file_format: str = "anndata", + receptive_field: dict = {"k_bd": 4, "dist_bd": 10, "k_tx": 5, "dist_tx": 3}, + knn_method: str = "kd_tree", verbose: bool = False, - **anndata_kwargs + 
**anndata_kwargs, ) -> None: """ Perform segmentation using the model, merge segmentation results with transcripts_df, and save in the specified format. - + Parameters: ---------- model : LitSegger @@ -388,22 +378,22 @@ def segment( # Step 1: Prediction step_start_time = time.time() - + train_dataloader = dm.train_dataloader() - test_dataloader = dm.test_dataloader() - val_dataloader = dm.val_dataloader() - + test_dataloader = dm.test_dataloader() + val_dataloader = dm.val_dataloader() + segmentation_train = predict(model, train_dataloader, score_cut, receptive_field, use_cc, knn_method) torch.cuda.empty_cache() cp.get_default_memory_pool().free_all_blocks() gc.collect() - - segmentation_val = predict(model, val_dataloader, score_cut, receptive_field, use_cc, knn_method) + + segmentation_val = predict(model, val_dataloader, score_cut, receptive_field, use_cc, knn_method) torch.cuda.empty_cache() cp.get_default_memory_pool().free_all_blocks() gc.collect() - - segmentation_test = predict(model, test_dataloader, score_cut, receptive_field, use_cc, knn_method) + + segmentation_test = predict(model, test_dataloader, score_cut, receptive_field, use_cc, knn_method) torch.cuda.empty_cache() cp.get_default_memory_pool().free_all_blocks() gc.collect() @@ -422,7 +412,7 @@ def segment( # print(seg_combined.columns) # print(transcripts_df.id) # Drop any unassigned rows - seg_final = seg_combined.dropna(subset=['segger_cell_id']).reset_index(drop=True) + seg_final = seg_combined.dropna(subset=["segger_cell_id"]).reset_index(drop=True) if verbose: elapsed_time = format_time(time.time() - step_start_time) @@ -440,7 +430,7 @@ def segment( seg_final_dd = dd.from_pandas(seg_final, npartitions=transcripts_df.npartitions) # Merge the segmentation results with the transcript data (still as Dask DataFrame) - transcripts_df_filtered = transcripts_df.merge(seg_final_dd, on='transcript_id', how='inner') + transcripts_df_filtered = transcripts_df.merge(seg_final_dd, on="transcript_id", how="inner") if verbose: elapsed_time = format_time(time.time() - step_start_time) @@ -448,18 +438,18 @@ def segment( # Step 4: Save the merged result step_start_time = time.time() - + if verbose: print(f"Saving results in {file_format} format...") - if file_format == 'csv': - save_path = save_dir / f'{seg_tag}_segmentation.csv' + if file_format == "csv": + save_path = save_dir / f"{seg_tag}_segmentation.csv" transcripts_df_filtered.compute().to_csv(save_path, index=False) # Use pandas after computing - elif file_format == 'parquet': - save_path = save_dir / f'{seg_tag}_segmentation.parquet' + elif file_format == "parquet": + save_path = save_dir / f"{seg_tag}_segmentation.parquet" transcripts_df_filtered.to_parquet(save_path, index=False) # Dask handles Parquet fine - elif file_format == 'anndata': - save_path = save_dir / f'{seg_tag}_segmentation.h5ad' + elif file_format == "anndata": + save_path = save_dir / f"{seg_tag}_segmentation.h5ad" segger_adata = create_anndata(transcripts_df_filtered.compute(), **anndata_kwargs) # Compute for AnnData segger_adata.write(save_path) else: @@ -479,9 +469,6 @@ def segment( torch.cuda.empty_cache() gc.collect() - - - # def predict( # lit_segger: LitSegger, @@ -493,7 +480,7 @@ def segment( # ) -> dd.DataFrame: # """ # Optimized prediction for multiple batches of transcript data using Dask and delayed processing with progress bar. - + # Args: # lit_segger (LitSegger): The lightning module wrapping the segmentation model. 
# data_loader (DataLoader): A data loader providing batches of transcript and cell data. @@ -539,7 +526,7 @@ def segment( # # Handle cases where there's only one entry per 'segger_cell_id' # single_entry_mask = result.groupby('segger_cell_id').size() == 1 # result.loc[single_entry_mask, 'segger_cell_id'] = 'floating' - + # return result # # Map the logic over each partition using Dask @@ -548,14 +535,11 @@ def segment( # # Trigger garbage collection and free GPU memory # torch.cuda.empty_cache() # gc.collect() - -# final_assignments = final_assignments.compute() - - -# return final_assignments +# final_assignments = final_assignments.compute() +# return final_assignments # # def predict( @@ -568,7 +552,7 @@ def segment( # # ) -> dd.DataFrame: # # """ # # Optimized prediction for multiple batches of transcript data using Dask and delayed processing with progress bar. - + # # Args: # # lit_segger (LitSegger): The lightning module wrapping the segmentation model. # # data_loader (DataLoader): A data loader providing batches of transcript and cell data. @@ -596,7 +580,7 @@ def segment( # # delayed(predict_batch)(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) # # for batch in data_loader # # ] - + # # # Build the Dask DataFrame from the delayed assignments # # assignments_dd = dd.from_delayed(delayed_assignments, meta=meta) @@ -612,7 +596,7 @@ def segment( # # # Handle cases where there's only one entry per 'segger_cell_id' # # single_entry_mask = result.groupby('segger_cell_id').size() == 1 # # result.loc[single_entry_mask, 'segger_cell_id'] = 'floating' - + # # return result # # # Map the logic over each partition using Dask @@ -627,22 +611,22 @@ def segment( # def segment( -# model: LitSegger, -# dm: SeggerDataModule, -# save_dir: Union[str, Path], -# seg_tag: str, -# transcript_file: Union[str, Path], +# model: LitSegger, +# dm: SeggerDataModule, +# save_dir: Union[str, Path], +# seg_tag: str, +# transcript_file: Union[str, Path], # score_cut: float = .25, # use_cc: bool = True, -# file_format: str = 'anndata', +# file_format: str = 'anndata', # receptive_field: dict = {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}, # knn_method: str = 'kd_tree', # verbose: bool = False, # **anndata_kwargs # ) -> None: # """ -# Perform segmentation using the model, merge segmentation results with transcripts_df, -# and save in the specified format. Memory is managed efficiently using Dask and GPU +# Perform segmentation using the model, merge segmentation results with transcripts_df, +# and save in the specified format. Memory is managed efficiently using Dask and GPU # memory optimizations. 
# Args: @@ -674,15 +658,15 @@ def segment( # # Step 1: Prediction # step_start_time = time.time() - + # train_dataloader = dm.train_dataloader() # test_dataloader = dm.test_dataloader() # val_dataloader = dm.val_dataloader() - + # # delayed_train = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) # # delayed_val = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) # delayed_test = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) - + # delayed_test = delayed_test.compute() # # Compute all predictions at once using Dask # # with ProgressBar(): @@ -726,7 +710,7 @@ def segment( # # Step 4: Save the merged result # step_start_time = time.time() - + # if verbose: # print(f"Saving results in {file_format} format...") diff --git a/src/segger/training/README.md b/src/segger/training/README.md index ff7e04d..958cd20 100644 --- a/src/segger/training/README.md +++ b/src/segger/training/README.md @@ -7,20 +7,24 @@ The training module makes use of **PyTorch Lightning** for efficient and scalabl ## Key Components ### 1. **SpatialTranscriptomicsDataset** + The `SpatialTranscriptomicsDataset` class is used to load and manage spatial transcriptomics data stored in the format of PyTorch Geometric `Data` objects. It inherits from `InMemoryDataset` to load preprocessed datasets, ensuring efficient in-memory data handling for training and validation phases. - **Root Path**: The root directory contains the dataset, which is expected to have separate folders for training, validation, and test sets. - **Raw and Processed Data**: The module expects datasets in the form of processed PyTorch files, and the dataset class is responsible for loading them efficiently. ### 2. **Segger Model** + The `Segger` model is a custom graph neural network designed to work with heterogeneous graph data. It takes both **transcript (tx)** and **boundary (bd)** nodes, utilizing attention mechanisms for better feature aggregation. Key parameters such as `num_tx_tokens`, `init_emb`, `hidden_channels`, `out_channels`, and `heads` allow the user to control the model's architecture and initial embedding sizes. - **Heterogeneous Graph Support**: The model is converted to handle different node types using `to_hetero` from PyTorch Geometric. The transformation allows the model to handle multiple relations like `belongs` (tx to bd) and `neighbors` (tx to tx). ### 3. **LitSegger** + `LitSegger` is the PyTorch Lightning wrapper around the Segger model, which handles training, validation, and optimization. This wrapper facilitates the integration with Lightning’s trainer, allowing easy multi-GPU and distributed training. ### 4. **Training Pipeline** + The module provides an easily configurable pipeline for training the Segger model: - **Datasets**: Training and validation datasets are loaded using `SpatialTranscriptomicsDataset` with paths provided via arguments. @@ -30,6 +34,7 @@ The module provides an easily configurable pipeline for training the Segger mode ## Usage and Configuration ### Command-Line Arguments + The module accepts various command-line arguments that allow for flexible configuration: - `--train_dir`: Path to the training data directory. This directory should include `processed` and `raw` subdirectories. The direcotry `processed` should include the `pyg` `HeteroData` objects. 
@@ -51,6 +56,7 @@ The module accepts various command-line arguments that allow for flexible config - `--default_root_dir`: Directory where logs, checkpoints, and models will be saved. ### Example Training Command + The module can be executed from the command line as follows: ```bash diff --git a/src/segger/training/segger_data_module.py b/src/segger/training/segger_data_module.py index c1be43d..3feadef 100644 --- a/src/segger/training/segger_data_module.py +++ b/src/segger/training/segger_data_module.py @@ -21,9 +21,9 @@ def __init__( # TODO: Add documentation def setup(self, stage=None): - self.train = STPyGDataset(root=self.data_dir / 'train_tiles') - self.test = STPyGDataset(root=self.data_dir / 'test_tiles') - self.val = STPyGDataset(root=self.data_dir / 'val_tiles') + self.train = STPyGDataset(root=self.data_dir / "train_tiles") + self.test = STPyGDataset(root=self.data_dir / "test_tiles") + self.val = STPyGDataset(root=self.data_dir / "val_tiles") self.loader_kwargs = dict( batch_size=self.batch_size, num_workers=self.num_workers, diff --git a/src/segger/training/train.py b/src/segger/training/train.py index a3cf471..68adbb3 100644 --- a/src/segger/training/train.py +++ b/src/segger/training/train.py @@ -60,7 +60,17 @@ def __init__(self, **kwargs): self.validation_step_outputs = [] self.criterion = torch.nn.BCEWithLogitsLoss() - def from_new(self, num_tx_tokens: int, init_emb: int, hidden_channels: int, out_channels: int, heads: int, num_mid_layers: int, aggr: str, metadata: Union[Tuple, Metadata]): + def from_new( + self, + num_tx_tokens: int, + init_emb: int, + hidden_channels: int, + out_channels: int, + heads: int, + num_mid_layers: int, + aggr: str, + metadata: Union[Tuple, Metadata], + ): """ Initializes the LitSegger module with new parameters. @@ -124,7 +134,7 @@ def forward(self, batch: SpatialTranscriptomicsDataset) -> torch.Tensor: The output of the model. 
""" z = self.model(batch.x_dict, batch.edge_index_dict) - output = torch.matmul(z['tx'], z['bd'].t()) # Example for bipartite graph + output = torch.matmul(z["tx"], z["bd"].t()) # Example for bipartite graph return output def training_step(self, batch: Any, batch_idx: int) -> torch.Tensor: @@ -145,16 +155,16 @@ def training_step(self, batch: Any, batch_idx: int) -> torch.Tensor: """ # Forward pass to get the logits z = self.model(batch.x_dict, batch.edge_index_dict) - output = torch.matmul(z['tx'], z['bd'].t()) + output = torch.matmul(z["tx"], z["bd"].t()) # Get edge labels and logits - edge_label_index = batch['tx', 'belongs', 'bd'].edge_label_index + edge_label_index = batch["tx", "belongs", "bd"].edge_label_index out_values = output[edge_label_index[0], edge_label_index[1]] - edge_label = batch['tx', 'belongs', 'bd'].edge_label - + edge_label = batch["tx", "belongs", "bd"].edge_label + # Compute binary cross-entropy loss with logits (no sigmoid here) loss = self.criterion(out_values, edge_label) - + # Log the training loss self.log("train_loss", loss, prog_bar=True, batch_size=batch.num_graphs) return loss @@ -177,31 +187,31 @@ def validation_step(self, batch: Any, batch_idx: int) -> torch.Tensor: """ # Forward pass to get the logits z = self.model(batch.x_dict, batch.edge_index_dict) - output = torch.matmul(z['tx'], z['bd'].t()) + output = torch.matmul(z["tx"], z["bd"].t()) # Get edge labels and logits - edge_label_index = batch['tx', 'belongs', 'bd'].edge_label_index + edge_label_index = batch["tx", "belongs", "bd"].edge_label_index out_values = output[edge_label_index[0], edge_label_index[1]] - edge_label = batch['tx', 'belongs', 'bd'].edge_label - + edge_label = batch["tx", "belongs", "bd"].edge_label + # Compute binary cross-entropy loss with logits (no sigmoid here) loss = self.criterion(out_values, edge_label) - + # Apply sigmoid to logits for AUROC and F1 metrics out_values_prob = torch.sigmoid(out_values) # Compute metrics auroc = torchmetrics.AUROC(task="binary") auroc_res = auroc(out_values_prob, edge_label) - + f1 = F1Score(task="binary").to(self.device) f1_res = f1(out_values_prob, edge_label) - + # Log validation metrics self.log("validation_loss", loss, batch_size=batch.num_graphs) self.log("validation_auroc", auroc_res, prog_bar=True, batch_size=batch.num_graphs) self.log("validation_f1", f1_res, prog_bar=True, batch_size=batch.num_graphs) - + return loss def configure_optimizers(self) -> torch.optim.Optimizer: diff --git a/src/segger/validation/__init__.py b/src/segger/validation/__init__.py index 220150b..bfc7689 100644 --- a/src/segger/validation/__init__.py +++ b/src/segger/validation/__init__.py @@ -1,3 +1,3 @@ from .utils import * -from .xenium_explorer import * \ No newline at end of file +from .xenium_explorer import * diff --git a/src/segger/validation/utils.py b/src/segger/validation/utils.py index b283b00..72a5438 100644 --- a/src/segger/validation/utils.py +++ b/src/segger/validation/utils.py @@ -11,22 +11,20 @@ from matplotlib.backends.backend_pdf import PdfPages import matplotlib.pyplot as plt import dask -dask.config.set({'dataframe.query-planning': False}) + +dask.config.set({"dataframe.query-planning": False}) import squidpy as sq from sklearn.metrics import calinski_harabasz_score, silhouette_score, f1_score from pathlib import Path import seaborn as sns - - - def find_markers( - adata: ad.AnnData, - cell_type_column: str, - pos_percentile: float = 5, - neg_percentile: float = 10, - percentage: float = 50 + adata: ad.AnnData, + cell_type_column: str, + 
pos_percentile: float = 5, + neg_percentile: float = 10, + percentage: float = 50, ) -> Dict[str, Dict[str, List[str]]]: """Identify positive and negative markers for each cell type based on gene expression and filter by expression percentage. @@ -62,17 +60,12 @@ def find_markers( valid_pos_indices = pos_indices[expr_frac >= (percentage / 100)] positive_markers = genes[valid_pos_indices] negative_markers = genes[neg_indices] - markers[cell_type] = { - 'positive': list(positive_markers), - 'negative': list(negative_markers) - } + markers[cell_type] = {"positive": list(positive_markers), "negative": list(negative_markers)} return markers def find_mutually_exclusive_genes( - adata: ad.AnnData, - markers: Dict[str, Dict[str, List[str]]], - cell_type_column: str + adata: ad.AnnData, markers: Dict[str, Dict[str, List[str]]], cell_type_column: str ) -> List[Tuple[str, str]]: """Identify mutually exclusive genes based on expression criteria. @@ -94,7 +87,7 @@ def find_mutually_exclusive_genes( all_exclusive = [] gene_expression = adata.to_df() for cell_type, marker_sets in markers.items(): - positive_markers = marker_sets['positive'] + positive_markers = marker_sets["positive"] exclusive_genes[cell_type] = [] for gene in positive_markers: gene_expr = adata[:, gene].X @@ -104,7 +97,9 @@ def find_mutually_exclusive_genes( exclusive_genes[cell_type].append(gene) all_exclusive.append(gene) unique_genes = list({gene for i in exclusive_genes.keys() for gene in exclusive_genes[i] if gene in all_exclusive}) - filtered_exclusive_genes = {i: [gene for gene in exclusive_genes[i] if gene in unique_genes] for i in exclusive_genes.keys()} + filtered_exclusive_genes = { + i: [gene for gene in exclusive_genes[i] if gene in unique_genes] for i in exclusive_genes.keys() + } mutually_exclusive_gene_pairs = [ (gene1, gene2) for key1, key2 in combinations(filtered_exclusive_genes.keys(), 2) @@ -114,10 +109,7 @@ def find_mutually_exclusive_genes( return mutually_exclusive_gene_pairs -def compute_MECR( - adata: ad.AnnData, - gene_pairs: List[Tuple[str, str]] -) -> Dict[Tuple[str, str], float]: +def compute_MECR(adata: ad.AnnData, gene_pairs: List[Tuple[str, str]]) -> Dict[Tuple[str, str], float]: """Compute the Mutually Exclusive Co-expression Rate (MECR) for each gene pair in an AnnData object. Args: @@ -143,9 +135,7 @@ def compute_MECR( def compute_quantized_mecr_area( - adata: sc.AnnData, - gene_pairs: List[Tuple[str, str]], - quantiles: int = 10 + adata: sc.AnnData, gene_pairs: List[Tuple[str, str]], quantiles: int = 10 ) -> pd.DataFrame: """Compute the average MECR, variance of MECR, and average cell area for quantiles of cell areas. @@ -161,28 +151,28 @@ def compute_quantized_mecr_area( - quantized_data: pd.DataFrame DataFrame containing quantile information, average MECR, variance of MECR, average area, and number of cells. 
""" - adata.obs['quantile'] = pd.qcut(adata.obs['cell_area'], quantiles, labels=False) + adata.obs["quantile"] = pd.qcut(adata.obs["cell_area"], quantiles, labels=False) quantized_data = [] for quantile in range(quantiles): - cells_in_quantile = adata.obs['quantile'] == quantile + cells_in_quantile = adata.obs["quantile"] == quantile mecr = compute_MECR(adata[cells_in_quantile, :], gene_pairs) average_mecr = np.mean([i for i in mecr.values()]) variance_mecr = np.var([i for i in mecr.values()]) - average_area = adata.obs.loc[cells_in_quantile, 'cell_area'].mean() - quantized_data.append({ - 'quantile': quantile / quantiles, - 'average_mecr': average_mecr, - 'variance_mecr': variance_mecr, - 'average_area': average_area, - 'num_cells': cells_in_quantile.sum() - }) + average_area = adata.obs.loc[cells_in_quantile, "cell_area"].mean() + quantized_data.append( + { + "quantile": quantile / quantiles, + "average_mecr": average_mecr, + "variance_mecr": variance_mecr, + "average_area": average_area, + "num_cells": cells_in_quantile.sum(), + } + ) return pd.DataFrame(quantized_data) def compute_quantized_mecr_counts( - adata: sc.AnnData, - gene_pairs: List[Tuple[str, str]], - quantiles: int = 10 + adata: sc.AnnData, gene_pairs: List[Tuple[str, str]], quantiles: int = 10 ) -> pd.DataFrame: """Compute the average MECR, variance of MECR, and average transcript counts for quantiles of transcript counts. @@ -198,28 +188,28 @@ def compute_quantized_mecr_counts( - quantized_data: pd.DataFrame DataFrame containing quantile information, average MECR, variance of MECR, average counts, and number of cells. """ - adata.obs['quantile'] = pd.qcut(adata.obs['transcripts'], quantiles, labels=False) + adata.obs["quantile"] = pd.qcut(adata.obs["transcripts"], quantiles, labels=False) quantized_data = [] for quantile in range(quantiles): - cells_in_quantile = adata.obs['quantile'] == quantile + cells_in_quantile = adata.obs["quantile"] == quantile mecr = compute_MECR(adata[cells_in_quantile, :], gene_pairs) average_mecr = np.mean([i for i in mecr.values()]) variance_mecr = np.var([i for i in mecr.values()]) - average_counts = adata.obs.loc[cells_in_quantile, 'transcripts'].mean() - quantized_data.append({ - 'quantile': quantile / quantiles, - 'average_mecr': average_mecr, - 'variance_mecr': variance_mecr, - 'average_counts': average_counts, - 'num_cells': cells_in_quantile.sum() - }) + average_counts = adata.obs.loc[cells_in_quantile, "transcripts"].mean() + quantized_data.append( + { + "quantile": quantile / quantiles, + "average_mecr": average_mecr, + "variance_mecr": variance_mecr, + "average_counts": average_counts, + "num_cells": cells_in_quantile.sum(), + } + ) return pd.DataFrame(quantized_data) def annotate_query_with_reference( - reference_adata: ad.AnnData, - query_adata: ad.AnnData, - transfer_column: str + reference_adata: ad.AnnData, query_adata: ad.AnnData, transfer_column: str ) -> ad.AnnData: """Annotate query AnnData object using a scRNA-seq reference atlas. 
@@ -238,25 +228,25 @@ def annotate_query_with_reference( common_genes = list(set(reference_adata.var_names) & set(query_adata.var_names)) reference_adata = reference_adata[:, common_genes] query_adata = query_adata[:, common_genes] - query_adata.layers['raw'] = query_adata.raw.X if query_adata.raw else query_adata.X - query_adata.var['raw_counts'] = query_adata.layers['raw'].sum(axis=0) + query_adata.layers["raw"] = query_adata.raw.X if query_adata.raw else query_adata.X + query_adata.var["raw_counts"] = query_adata.layers["raw"].sum(axis=0) sc.pp.normalize_total(query_adata, target_sum=1e4) sc.pp.log1p(query_adata) sc.pp.pca(reference_adata) sc.pp.neighbors(reference_adata) sc.tl.umap(reference_adata) sc.tl.ingest(query_adata, reference_adata, obs=transfer_column) - query_adata.obsm['X_umap'] = query_adata.obsm['X_umap'] + query_adata.obsm["X_umap"] = query_adata.obsm["X_umap"] return query_adata def calculate_contamination( - adata: ad.AnnData, - markers: Dict[str, Dict[str, List[str]]], - radius: float = 15, - n_neighs: int = 10, - celltype_column: str = 'celltype_major', - num_cells: int = 10000 + adata: ad.AnnData, + markers: Dict[str, Dict[str, List[str]]], + radius: float = 15, + n_neighs: int = 10, + celltype_column: str = "celltype_major", + num_cells: int = 10000, ) -> pd.DataFrame: """Calculate normalized contamination from neighboring cells of different cell types based on positive markers. @@ -282,11 +272,11 @@ def calculate_contamination( """ if celltype_column not in adata.obs: raise ValueError("Column celltype_column must be present in adata.obs.") - positive_markers = {ct: markers[ct]['positive'] for ct in markers} + positive_markers = {ct: markers[ct]["positive"] for ct in markers} adata.obsm["spatial"] = adata.obs[["cell_centroid_x", "cell_centroid_y"]].copy().to_numpy() - sq.gr.spatial_neighbors(adata, radius=radius, n_neighs=n_neighs, coord_type='generic') - neighbors = adata.obsp['spatial_connectivities'].tolil() - raw_counts = adata[:, adata.var_names].layers['raw'].toarray() + sq.gr.spatial_neighbors(adata, radius=radius, n_neighs=n_neighs, coord_type="generic") + neighbors = adata.obsp["spatial_connectivities"].tolil() + raw_counts = adata[:, adata.var_names].layers["raw"].toarray() cell_types = adata.obs[celltype_column] selected_cells = np.random.choice(adata.n_obs, size=min(num_cells, adata.n_obs), replace=False) contamination = {ct: {ct2: 0 for ct2 in positive_markers.keys()} for ct in positive_markers.keys()} @@ -309,19 +299,19 @@ def calculate_contamination( if marker in adata.var_names: marker_counts_in_neighbor = raw_counts[neighbor_idx, adata.var_names.get_loc(marker)] if total_counts_in_neighborhood > 0: - contamination[cell_type][neighbor_type] += marker_counts_in_neighbor / total_counts_in_neighborhood + contamination[cell_type][neighbor_type] += ( + marker_counts_in_neighbor / total_counts_in_neighborhood + ) negighborings[cell_type][neighbor_type] += 1 contamination_df = pd.DataFrame(contamination).T negighborings_df = pd.DataFrame(negighborings).T - contamination_df.index.name = 'Source Cell Type' - contamination_df.columns.name = 'Target Cell Type' + contamination_df.index.name = "Source Cell Type" + contamination_df.columns.name = "Target Cell Type" return contamination_df / (negighborings_df + 1) def calculate_sensitivity( - adata: ad.AnnData, - purified_markers: Dict[str, List[str]], - max_cells_per_type: int = 1000 + adata: ad.AnnData, purified_markers: Dict[str, List[str]], max_cells_per_type: int = 1000 ) -> Dict[str, List[float]]: 
"""Calculate the sensitivity of the purified markers for each cell type. @@ -339,8 +329,8 @@ def calculate_sensitivity( """ sensitivity_results = {cell_type: [] for cell_type in purified_markers.keys()} for cell_type, markers in purified_markers.items(): - markers = markers['positive'] - subset = adata[adata.obs['celltype_major'] == cell_type] + markers = markers["positive"] + subset = adata[adata.obs["celltype_major"] == cell_type] if subset.n_obs > max_cells_per_type: cell_indices = np.random.choice(subset.n_obs, max_cells_per_type, replace=False) subset = subset[cell_indices] @@ -352,9 +342,7 @@ def calculate_sensitivity( def compute_clustering_scores( - adata: ad.AnnData, - cell_type_column: str = 'celltype_major', - use_pca: bool = True + adata: ad.AnnData, cell_type_column: str = "celltype_major", use_pca: bool = True ) -> Tuple[float, float]: """Compute the Calinski-Harabasz and Silhouette scores for an AnnData object based on the assigned cell types. @@ -384,11 +372,11 @@ def compute_clustering_scores( def compute_neighborhood_metrics( - adata: ad.AnnData, - radius: float = 10, - celltype_column: str = 'celltype_major', + adata: ad.AnnData, + radius: float = 10, + celltype_column: str = "celltype_major", n_neighs: int = 20, - subset_size: int = 10000 + subset_size: int = 10000, ) -> None: """Compute neighborhood entropy and number of neighbors for each cell in the AnnData object. @@ -418,8 +406,8 @@ def compute_neighborhood_metrics( # Randomly select a subset of cells subset_indices = np.random.choice(adata.n_obs, subset_size, replace=False) # Compute spatial neighbors for the entire dataset - sq.gr.spatial_neighbors(adata, radius=radius, coord_type='generic', n_neighs=n_neighs) - neighbors = adata.obsp['spatial_distances'].tolil().rows + sq.gr.spatial_neighbors(adata, radius=radius, coord_type="generic", n_neighs=n_neighs) + neighbors = adata.obsp["spatial_distances"].tolil().rows entropies = [] num_neighbors = [] # Calculate entropy and number of neighbors only for the selected subset @@ -441,8 +429,8 @@ def compute_neighborhood_metrics( neighbors_full = np.full(adata.n_obs, np.nan) entropy_full[subset_indices] = entropies neighbors_full[subset_indices] = num_neighbors - adata.obs['neighborhood_entropy'] = entropy_full - adata.obs['number_of_neighbors'] = neighbors_full + adata.obs["neighborhood_entropy"] = entropy_full + adata.obs["number_of_neighbors"] = neighbors_full def compute_transcript_density(adata: ad.AnnData) -> None: @@ -453,15 +441,15 @@ def compute_transcript_density(adata: ad.AnnData) -> None: Annotated data object containing transcript and cell area information. 
""" try: - transcript_counts = adata.obs['transcript_counts'] + transcript_counts = adata.obs["transcript_counts"] except: - transcript_counts = adata.obs['transcripts'] - cell_areas = adata.obs['cell_area'] - adata.obs['transcript_density'] = transcript_counts / cell_areas + transcript_counts = adata.obs["transcripts"] + cell_areas = adata.obs["cell_area"] + adata.obs["transcript_density"] = transcript_counts / cell_areas # def compute_celltype_f1_purity( -# adata: ad.AnnData, +# adata: ad.AnnData, # marker_genes: Dict[str, Dict[str, List[str]]] # ) -> Dict[str, float]: # """ @@ -497,7 +485,7 @@ def compute_transcript_density(adata: ad.AnnData) -> None: # def average_log_normalized_expression( -# adata: ad.AnnData, +# adata: ad.AnnData, # celltype_column: str # ) -> pd.DataFrame: # """ @@ -516,18 +504,8 @@ def compute_transcript_density(adata: ad.AnnData) -> None: # return adata.to_df().groupby(adata.obs[celltype_column]).mean() - - - - def plot_metric_comparison( - ax: plt.Axes, - data: pd.DataFrame, - metric: str, - label: str, - method1: str, - method2: str, - output_path: Path + ax: plt.Axes, data: pd.DataFrame, metric: str, label: str, method1: str, method2: str, output_path: Path ) -> None: """Plot a comparison of a specific metric between two methods and save the comparison data. @@ -547,25 +525,22 @@ def plot_metric_comparison( - output_path: Path Path to save the merged DataFrame as a CSV. """ - subset1 = data[data['method'] == method1] - subset2 = data[data['method'] == method2] - merged_data = pd.merge(subset1, subset2, on='celltype_major', suffixes=(f'_{method1}', f'_{method2}')) - + subset1 = data[data["method"] == method1] + subset2 = data[data["method"] == method2] + merged_data = pd.merge(subset1, subset2, on="celltype_major", suffixes=(f"_{method1}", f"_{method2}")) + # Save the merged data used in the plot to CSV - merged_data.to_csv(output_path / f'metric_comparison_{metric}_{method1}_vs_{method2}.csv', index=False) - - for cell_type in merged_data['celltype_major'].unique(): - cell_data = merged_data[merged_data['celltype_major'] == cell_type] - ax.scatter(cell_data[f'{metric}_{method1}'], cell_data[f'{metric}_{method2}'], - label=cell_type) - - max_value = max(merged_data[f'{metric}_{method1}'].max(), merged_data[f'{metric}_{method2}'].max()) - ax.plot([0, max_value], [0, max_value], 'k--', alpha=0.5) - ax.set_xlabel(f'{label} ({method1})') - ax.set_ylabel(f'{label} ({method2})') - ax.set_title(f'{label}: {method1} vs {method2}') + merged_data.to_csv(output_path / f"metric_comparison_{metric}_{method1}_vs_{method2}.csv", index=False) + for cell_type in merged_data["celltype_major"].unique(): + cell_data = merged_data[merged_data["celltype_major"] == cell_type] + ax.scatter(cell_data[f"{metric}_{method1}"], cell_data[f"{metric}_{method2}"], label=cell_type) + max_value = max(merged_data[f"{metric}_{method1}"].max(), merged_data[f"{metric}_{method2}"].max()) + ax.plot([0, max_value], [0, max_value], "k--", alpha=0.5) + ax.set_xlabel(f"{label} ({method1})") + ax.set_ylabel(f"{label} ({method2})") + ax.set_title(f"{label}: {method1} vs {method2}") def load_segmentations(segmentation_paths: Dict[str, Path]) -> Dict[str, sc.AnnData]: @@ -581,16 +556,15 @@ def load_segmentations(segmentation_paths: Dict[str, Path]) -> Dict[str, sc.AnnD for method, path in segmentation_paths.items(): adata = sc.read(path) # Special handling for 'segger' to separate into 'segger_n0' and 'segger_n1' - if method == 'segger': - cells_n1 = [i for i in adata.obs_names if not i.endswith('-nx')] - 
cells_n0 = [i for i in adata.obs_names if i.endswith('-nx')] - segmentations_dict['segger_n1'] = adata[cells_n1, :] - segmentations_dict['segger_n0'] = adata[cells_n0, :] + if method == "segger": + cells_n1 = [i for i in adata.obs_names if not i.endswith("-nx")] + cells_n0 = [i for i in adata.obs_names if i.endswith("-nx")] + segmentations_dict["segger_n1"] = adata[cells_n1, :] + segmentations_dict["segger_n0"] = adata[cells_n0, :] segmentations_dict[method] = adata return segmentations_dict - def plot_cell_counts(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str]) -> None: """Plot the number of cells per segmentation method and save the cell count data as a CSV. @@ -600,33 +574,37 @@ def plot_cell_counts(segmentations_dict: Dict[str, sc.AnnData], output_path: Pat """ # Calculate the number of cells in each segmentation method cell_counts = {method: seg.n_obs for method, seg in segmentations_dict.items()} - + # Create a DataFrame for the bar plot - df = pd.DataFrame(cell_counts, index=['Number of Cells']).T - + df = pd.DataFrame(cell_counts, index=["Number of Cells"]).T + # Save the DataFrame to CSV - df.to_csv(output_path / 'cell_counts_data.csv', index=True) - + df.to_csv(output_path / "cell_counts_data.csv", index=True) + # Generate the bar plot - ax = df.plot(kind='bar', stacked=False, color=[palette.get(key, '#333333') for key in df.index], figsize=(3, 6), width=0.9) - + ax = df.plot( + kind="bar", stacked=False, color=[palette.get(key, "#333333") for key in df.index], figsize=(3, 6), width=0.9 + ) + # Add a dashed line for the 10X baseline - if '10X' in cell_counts: - baseline_height = cell_counts['10X'] - ax.axhline(y=baseline_height, color='gray', linestyle='--', linewidth=1.5, label='10X Baseline') - + if "10X" in cell_counts: + baseline_height = cell_counts["10X"] + ax.axhline(y=baseline_height, color="gray", linestyle="--", linewidth=1.5, label="10X Baseline") + # Set plot titles and labels - plt.title('Number of Cells per Segmentation Method') - plt.xlabel('Segmentation Method') - plt.ylabel('Number of Cells') - plt.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left') - + plt.title("Number of Cells per Segmentation Method") + plt.xlabel("Segmentation Method") + plt.ylabel("Number of Cells") + plt.legend(title="", bbox_to_anchor=(1.05, 1), loc="upper left") + # Save the figure as a PDF - plt.savefig(output_path / 'cell_counts_bar_plot.pdf', bbox_inches='tight') + plt.savefig(output_path / "cell_counts_bar_plot.pdf", bbox_inches="tight") plt.show() -def plot_percent_assigned(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str]) -> None: +def plot_percent_assigned( + segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str] +) -> None: """Plot the percentage of assigned transcripts (normalized) for each segmentation method. 
Args: @@ -646,43 +624,38 @@ def plot_percent_assigned(segmentations_dict: Dict[str, sc.AnnData], output_path percent_assigned_normalized = total_counts_per_gene.divide(max_counts_per_gene, axis=0) * 100 # Prepare the data for the violin plot - violin_data = pd.DataFrame({ - 'Segmentation Method': [], - 'Percent Assigned (Normalized)': [] - }) - - + violin_data = pd.DataFrame({"Segmentation Method": [], "Percent Assigned (Normalized)": []}) # Add normalized percent_assigned data for each method for method in segmentations_dict.keys(): method_data = percent_assigned_normalized[method].dropna() - method_df = pd.DataFrame({ - 'Segmentation Method': [method] * len(method_data), - 'Percent Assigned (Normalized)': method_data.values - }) + method_df = pd.DataFrame( + {"Segmentation Method": [method] * len(method_data), "Percent Assigned (Normalized)": method_data.values} + ) violin_data = pd.concat([violin_data, method_df], axis=0) - - violin_data.to_csv(output_path / 'percent_assigned_normalized.csv', index=True) + + violin_data.to_csv(output_path / "percent_assigned_normalized.csv", index=True) # Plot the violin plots plt.figure(figsize=(12, 8)) - ax = sns.violinplot(x='Segmentation Method', y='Percent Assigned (Normalized)', data=violin_data, palette=palette) + ax = sns.violinplot(x="Segmentation Method", y="Percent Assigned (Normalized)", data=violin_data, palette=palette) # Add a dashed line for the 10X baseline - if '10X' in segmentations_dict: - baseline_height = percent_assigned_normalized['10X'].mean() - ax.axhline(y=baseline_height, color='gray', linestyle='--', linewidth=1.5, label='10X Baseline') + if "10X" in segmentations_dict: + baseline_height = percent_assigned_normalized["10X"].mean() + ax.axhline(y=baseline_height, color="gray", linestyle="--", linewidth=1.5, label="10X Baseline") # Set plot titles and labels - plt.title('') - plt.xlabel('Segmentation Method') - plt.ylabel('Percent Assigned (Normalized)') - plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + plt.title("") + plt.xlabel("Segmentation Method") + plt.ylabel("Percent Assigned (Normalized)") + plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left") # Save the figure as a PDF - plt.savefig(output_path / 'percent_assigned_normalized_violin_plot.pdf', bbox_inches='tight') + plt.savefig(output_path / "percent_assigned_normalized_violin_plot.pdf", bbox_inches="tight") plt.show() + def plot_gene_counts(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str]) -> None: """Plot the normalized gene counts for each segmentation method. 
@@ -703,40 +676,37 @@ def plot_gene_counts(segmentations_dict: Dict[str, sc.AnnData], output_path: Pat normalized_counts_per_gene = total_counts_per_gene.divide(max_counts_per_gene, axis=0) # Prepare the data for the box plot - boxplot_data = pd.DataFrame({ - 'Segmentation Method': [], - 'Normalized Counts': [] - }) + boxplot_data = pd.DataFrame({"Segmentation Method": [], "Normalized Counts": []}) for method in segmentations_dict.keys(): method_counts = normalized_counts_per_gene[method] - method_df = pd.DataFrame({ - 'Segmentation Method': [method] * len(method_counts), - 'Normalized Counts': method_counts.values - }) + method_df = pd.DataFrame( + {"Segmentation Method": [method] * len(method_counts), "Normalized Counts": method_counts.values} + ) boxplot_data = pd.concat([boxplot_data, method_df], axis=0) - - boxplot_data.to_csv(output_path / 'gene_counts_normalized_data.csv', index=True) + + boxplot_data.to_csv(output_path / "gene_counts_normalized_data.csv", index=True) # Plot the box plots plt.figure(figsize=(3, 6)) - ax = sns.boxplot(x='Segmentation Method', y='Normalized Counts', data=boxplot_data, palette=palette, width=0.9) + ax = sns.boxplot(x="Segmentation Method", y="Normalized Counts", data=boxplot_data, palette=palette, width=0.9) # Add a dashed line for the 10X baseline - if '10X' in normalized_counts_per_gene: - baseline_height = normalized_counts_per_gene['10X'].mean() - plt.axhline(y=baseline_height, color='gray', linestyle='--', linewidth=1.5, label='10X Baseline') + if "10X" in normalized_counts_per_gene: + baseline_height = normalized_counts_per_gene["10X"].mean() + plt.axhline(y=baseline_height, color="gray", linestyle="--", linewidth=1.5, label="10X Baseline") # Set plot titles and labels - plt.title('') - plt.xlabel('Segmentation Method') - plt.ylabel('Normalized Counts') + plt.title("") + plt.xlabel("Segmentation Method") + plt.ylabel("Normalized Counts") plt.xticks(rotation=0) # Save the figure as a PDF - plt.savefig(output_path / 'gene_counts_normalized_boxplot_by_method.pdf', bbox_inches='tight') + plt.savefig(output_path / "gene_counts_normalized_boxplot_by_method.pdf", bbox_inches="tight") plt.show() + def plot_counts_per_cell(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str]) -> None: """Plot the counts per cell (log2) for each segmentation method. @@ -745,36 +715,33 @@ def plot_counts_per_cell(segmentations_dict: Dict[str, sc.AnnData], output_path: output_path (Path): Path to the directory where the plot will be saved. 
""" # Prepare the data for the violin plot - violin_data = pd.DataFrame({ - 'Segmentation Method': [], - 'Counts per Cell (log2)': [] - }) + violin_data = pd.DataFrame({"Segmentation Method": [], "Counts per Cell (log2)": []}) for method, adata in segmentations_dict.items(): - method_counts = adata.obs['transcripts'] + 1 - method_df = pd.DataFrame({ - 'Segmentation Method': [method] * len(method_counts), - 'Counts per Cell (log2)': method_counts.values - }) + method_counts = adata.obs["transcripts"] + 1 + method_df = pd.DataFrame( + {"Segmentation Method": [method] * len(method_counts), "Counts per Cell (log2)": method_counts.values} + ) violin_data = pd.concat([violin_data, method_df], axis=0) - - violin_data.to_csv(output_path / 'counts_per_cell_data.csv', index=True) + + violin_data.to_csv(output_path / "counts_per_cell_data.csv", index=True) # Plot the violin plots plt.figure(figsize=(4, 6)) - ax = sns.violinplot(x='Segmentation Method', y='Counts per Cell (log2)', data=violin_data, palette=palette) + ax = sns.violinplot(x="Segmentation Method", y="Counts per Cell (log2)", data=violin_data, palette=palette) ax.set(ylim=(5, 300)) # Add a dashed line for the 10X-nucleus median - if '10X-nucleus' in segmentations_dict: - median_10X_nucleus = np.median(segmentations_dict['10X-nucleus'].obs['transcripts'] + 1) - ax.axhline(y=median_10X_nucleus, color='gray', linestyle='--', linewidth=1.5, label='10X-nucleus Median') + if "10X-nucleus" in segmentations_dict: + median_10X_nucleus = np.median(segmentations_dict["10X-nucleus"].obs["transcripts"] + 1) + ax.axhline(y=median_10X_nucleus, color="gray", linestyle="--", linewidth=1.5, label="10X-nucleus Median") # Set plot titles and labels - plt.title('') - plt.xlabel('Segmentation Method') - plt.ylabel('Counts per Cell (log2)') + plt.title("") + plt.xlabel("Segmentation Method") + plt.ylabel("Counts per Cell (log2)") plt.xticks(rotation=0) # Save the figure as a PDF - plt.savefig(output_path / 'counts_per_cell_violin_plot.pdf', bbox_inches='tight') + plt.savefig(output_path / "counts_per_cell_violin_plot.pdf", bbox_inches="tight") plt.show() + def plot_cell_area(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str]) -> None: """Plot the cell area (log2) for each segmentation method. @@ -783,37 +750,36 @@ def plot_cell_area(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, output_path (Path): Path to the directory where the plot will be saved. 
""" # Prepare the data for the violin plot - violin_data = pd.DataFrame({ - 'Segmentation Method': [], - 'Cell Area (log2)': [] - }) + violin_data = pd.DataFrame({"Segmentation Method": [], "Cell Area (log2)": []}) for method in segmentations_dict.keys(): - if 'cell_area' in segmentations_dict[method].obs.columns: - method_area = segmentations_dict[method].obs['cell_area'] + 1 - method_df = pd.DataFrame({ - 'Segmentation Method': [method] * len(method_area), - 'Cell Area (log2)': method_area.values - }) + if "cell_area" in segmentations_dict[method].obs.columns: + method_area = segmentations_dict[method].obs["cell_area"] + 1 + method_df = pd.DataFrame( + {"Segmentation Method": [method] * len(method_area), "Cell Area (log2)": method_area.values} + ) violin_data = pd.concat([violin_data, method_df], axis=0) - violin_data.to_csv(output_path / 'cell_area_log2_data.csv', index=True) + violin_data.to_csv(output_path / "cell_area_log2_data.csv", index=True) # Plot the violin plots plt.figure(figsize=(4, 6)) - ax = sns.violinplot(x='Segmentation Method', y='Cell Area (log2)', data=violin_data, palette=palette) + ax = sns.violinplot(x="Segmentation Method", y="Cell Area (log2)", data=violin_data, palette=palette) ax.set(ylim=(5, 100)) # Add a dashed line for the 10X-nucleus median - if '10X-nucleus' in segmentations_dict: - median_10X_nucleus_area = np.median(segmentations_dict['10X-nucleus'].obs['cell_area'] + 1) - ax.axhline(y=median_10X_nucleus_area, color='gray', linestyle='--', linewidth=1.5, label='10X-nucleus Median') + if "10X-nucleus" in segmentations_dict: + median_10X_nucleus_area = np.median(segmentations_dict["10X-nucleus"].obs["cell_area"] + 1) + ax.axhline(y=median_10X_nucleus_area, color="gray", linestyle="--", linewidth=1.5, label="10X-nucleus Median") # Set plot titles and labels - plt.title('') - plt.xlabel('Segmentation Method') - plt.ylabel('Cell Area (log2)') + plt.title("") + plt.xlabel("Segmentation Method") + plt.ylabel("Cell Area (log2)") plt.xticks(rotation=0) # Save the figure as a PDF - plt.savefig(output_path / 'cell_area_log2_violin_plot.pdf', bbox_inches='tight') + plt.savefig(output_path / "cell_area_log2_violin_plot.pdf", bbox_inches="tight") plt.show() -def plot_transcript_density(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str]) -> None: + +def plot_transcript_density( + segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str] +) -> None: """Plot the transcript density (log2) for each segmentation method. Args: @@ -821,43 +787,53 @@ def plot_transcript_density(segmentations_dict: Dict[str, sc.AnnData], output_pa output_path (Path): Path to the directory where the plot will be saved. 
""" # Prepare the data for the violin plot - violin_data = pd.DataFrame({ - 'Segmentation Method': [], - 'Transcript Density (log2)': [] - }) + violin_data = pd.DataFrame({"Segmentation Method": [], "Transcript Density (log2)": []}) for method in segmentations_dict.keys(): - if 'cell_area' in segmentations_dict[method].obs.columns: - method_density = segmentations_dict[method].obs['transcripts'] / segmentations_dict[method].obs['cell_area'] + if "cell_area" in segmentations_dict[method].obs.columns: + method_density = segmentations_dict[method].obs["transcripts"] / segmentations_dict[method].obs["cell_area"] method_density_log2 = np.log2(method_density + 1) - method_df = pd.DataFrame({ - 'Segmentation Method': [method] * len(method_density_log2), - 'Transcript Density (log2)': method_density_log2.values - }) + method_df = pd.DataFrame( + { + "Segmentation Method": [method] * len(method_density_log2), + "Transcript Density (log2)": method_density_log2.values, + } + ) violin_data = pd.concat([violin_data, method_df], axis=0) - - violin_data.to_csv(output_path / 'transcript_density_log2_data.csv', index=True) + + violin_data.to_csv(output_path / "transcript_density_log2_data.csv", index=True) # Plot the violin plots plt.figure(figsize=(4, 6)) - ax = sns.violinplot(x='Segmentation Method', y='Transcript Density (log2)', data=violin_data, palette=palette) + ax = sns.violinplot(x="Segmentation Method", y="Transcript Density (log2)", data=violin_data, palette=palette) # Add a dashed line for the 10X-nucleus median - if '10X-nucleus' in segmentations_dict: - median_10X_nucleus_density_log2 = np.median(np.log2(segmentations_dict['10X-nucleus'].obs['transcripts'] / segmentations_dict['10X-nucleus'].obs['cell_area'] + 1)) - ax.axhline(y=median_10X_nucleus_density_log2, color='gray', linestyle='--', linewidth=1.5, label='10X-nucleus Median') + if "10X-nucleus" in segmentations_dict: + median_10X_nucleus_density_log2 = np.median( + np.log2( + segmentations_dict["10X-nucleus"].obs["transcripts"] + / segmentations_dict["10X-nucleus"].obs["cell_area"] + + 1 + ) + ) + ax.axhline( + y=median_10X_nucleus_density_log2, color="gray", linestyle="--", linewidth=1.5, label="10X-nucleus Median" + ) # Set plot titles and labels - plt.title('') - plt.xlabel('Segmentation Method') - plt.ylabel('Transcript Density (log2)') + plt.title("") + plt.xlabel("Segmentation Method") + plt.ylabel("Transcript Density (log2)") plt.xticks(rotation=0) # Save the figure as a PDF - plt.savefig(output_path / 'transcript_density_log2_violin_plot.pdf', bbox_inches='tight') + plt.savefig(output_path / "transcript_density_log2_violin_plot.pdf", bbox_inches="tight") plt.show() -def plot_general_statistics_plots(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str]) -> None: + +def plot_general_statistics_plots( + segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str] +) -> None: """Create a summary plot with all the general statistics subplots. 
Args: @@ -884,11 +860,13 @@ def plot_general_statistics_plots(segmentations_dict: Dict[str, sc.AnnData], out plot_transcript_density(segmentations_dict, output_path, palette=palette) plt.tight_layout() - plt.savefig(output_path / 'general_statistics_plots.pdf', bbox_inches='tight') + plt.savefig(output_path / "general_statistics_plots.pdf", bbox_inches="tight") plt.show() -def plot_mecr_results(mecr_results: Dict[str, Dict[Tuple[str, str], float]], output_path: Path, palette: Dict[str, str]) -> None: +def plot_mecr_results( + mecr_results: Dict[str, Dict[Tuple[str, str], float]], output_path: Path, palette: Dict[str, str] +) -> None: """Plot the MECR (Mutually Exclusive Co-expression Rate) results for each segmentation method. Args: @@ -900,26 +878,25 @@ def plot_mecr_results(mecr_results: Dict[str, Dict[Tuple[str, str], float]], out plot_data = [] for method, mecr_dict in mecr_results.items(): for gene_pair, mecr_value in mecr_dict.items(): - plot_data.append({ - 'Segmentation Method': method, - 'Gene Pair': f"{gene_pair[0]} - {gene_pair[1]}", - 'MECR': mecr_value - }) + plot_data.append( + {"Segmentation Method": method, "Gene Pair": f"{gene_pair[0]} - {gene_pair[1]}", "MECR": mecr_value} + ) df = pd.DataFrame(plot_data) - df.to_csv(output_path / 'mcer_box.csv', index=True) + df.to_csv(output_path / "mcer_box.csv", index=True) plt.figure(figsize=(3, 6)) - sns.boxplot(x='Segmentation Method', y='MECR', data=df, palette=palette) - plt.title('Mutually Exclusive Co-expression Rate (MECR)') - plt.xlabel('Segmentation Method') - plt.ylabel('MECR') - plt.xticks(rotation=45, ha='right') + sns.boxplot(x="Segmentation Method", y="MECR", data=df, palette=palette) + plt.title("Mutually Exclusive Co-expression Rate (MECR)") + plt.xlabel("Segmentation Method") + plt.ylabel("MECR") + plt.xticks(rotation=45, ha="right") plt.tight_layout() - plt.savefig(output_path / 'mecr_results_boxplot.pdf', bbox_inches='tight') + plt.savefig(output_path / "mecr_results_boxplot.pdf", bbox_inches="tight") plt.show() - -def plot_quantized_mecr_counts(quantized_mecr_counts: Dict[str, pd.DataFrame], output_path: Path, palette: Dict[str, str]) -> None: +def plot_quantized_mecr_counts( + quantized_mecr_counts: Dict[str, pd.DataFrame], output_path: Path, palette: Dict[str, str] +) -> None: """Plot the quantized MECR values against transcript counts for each segmentation method, with point size proportional to the variance of MECR. Args: @@ -927,38 +904,40 @@ def plot_quantized_mecr_counts(quantized_mecr_counts: Dict[str, pd.DataFrame], o output_path (Path): Path to the directory where the plot will be saved. palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. 
""" - quantized_mecr_counts.to_csv(output_path / 'quantized_mecr_counts.csv', index=True) + quantized_mecr_counts.to_csv(output_path / "quantized_mecr_counts.csv", index=True) plt.figure(figsize=(9, 6)) for method, df in quantized_mecr_counts.items(): plt.plot( - df['average_counts'], - df['average_mecr'], - marker='o', - linestyle='-', - color=palette.get(method, '#333333'), + df["average_counts"], + df["average_mecr"], + marker="o", + linestyle="-", + color=palette.get(method, "#333333"), label=method, - markersize=0 # No markers, only lines + markersize=0, # No markers, only lines ) plt.scatter( - df['average_counts'], - df['average_mecr'], - s=df['variance_mecr'] * 1e5, # Size of points based on the variance of MECR - color=palette.get(method, '#333333'), + df["average_counts"], + df["average_mecr"], + s=df["variance_mecr"] * 1e5, # Size of points based on the variance of MECR + color=palette.get(method, "#333333"), alpha=0.7, # Slight transparency for overlapping points - edgecolor='w', # White edge color for better visibility - linewidth=0.5 # Thin edge line + edgecolor="w", # White edge color for better visibility + linewidth=0.5, # Thin edge line ) - plt.title('Quantized MECR by Transcript Counts') - plt.xlabel('Average Transcript Counts') - plt.ylabel('Average MECR') + plt.title("Quantized MECR by Transcript Counts") + plt.xlabel("Average Transcript Counts") + plt.ylabel("Average MECR") # Place the legend outside the plot on the top right - plt.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left') + plt.legend(title="", bbox_to_anchor=(1.05, 1), loc="upper left") plt.tight_layout() - plt.savefig(output_path / 'quantized_mecr_counts_plot.pdf', bbox_inches='tight') + plt.savefig(output_path / "quantized_mecr_counts_plot.pdf", bbox_inches="tight") plt.show() - - -def plot_quantized_mecr_area(quantized_mecr_area: Dict[str, pd.DataFrame], output_path: Path, palette: Dict[str, str]) -> None: + + +def plot_quantized_mecr_area( + quantized_mecr_area: Dict[str, pd.DataFrame], output_path: Path, palette: Dict[str, str] +) -> None: """Plot the quantized MECR values against cell areas for each segmentation method, with point size proportional to the variance of MECR. Args: @@ -966,40 +945,41 @@ def plot_quantized_mecr_area(quantized_mecr_area: Dict[str, pd.DataFrame], outpu output_path (Path): Path to the directory where the plot will be saved. palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. 
""" - quantized_mecr_area.to_csv(output_path / 'quantized_mecr_area.csv', index=True) + quantized_mecr_area.to_csv(output_path / "quantized_mecr_area.csv", index=True) plt.figure(figsize=(6, 4)) for method, df in quantized_mecr_area.items(): plt.plot( - df['average_area'], - df['average_mecr'], - marker='o', + df["average_area"], + df["average_mecr"], + marker="o", # s=df['variance_mecr'] * 1e5, - linestyle='-', - color=palette.get(method, '#333333'), + linestyle="-", + color=palette.get(method, "#333333"), label=method, - markersize=0 + markersize=0, ) plt.scatter( - df['average_area'], - df['average_mecr'], - s=df['variance_mecr'] * 1e5, # Size of points based on the variance of MECR - color=palette.get(method, '#333333'), + df["average_area"], + df["average_mecr"], + s=df["variance_mecr"] * 1e5, # Size of points based on the variance of MECR + color=palette.get(method, "#333333"), alpha=0.7, # Slight transparency for overlapping points - edgecolor='w', # White edge color for better visibility - linewidth=0.5 # Thin edge line + edgecolor="w", # White edge color for better visibility + linewidth=0.5, # Thin edge line ) - plt.title('Quantized MECR by Cell Area') - plt.xlabel('Average Cell Area') - plt.ylabel('Average MECR') + plt.title("Quantized MECR by Cell Area") + plt.xlabel("Average Cell Area") + plt.ylabel("Average MECR") # Place the legend outside the plot on the top right - plt.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left') + plt.legend(title="", bbox_to_anchor=(1.05, 1), loc="upper left") plt.tight_layout() - plt.savefig(output_path / 'quantized_mecr_area_plot.pdf', bbox_inches='tight') + plt.savefig(output_path / "quantized_mecr_area_plot.pdf", bbox_inches="tight") plt.show() - -def plot_contamination_results(contamination_results: Dict[str, pd.DataFrame], output_path: Path, palette: Dict[str, str]) -> None: +def plot_contamination_results( + contamination_results: Dict[str, pd.DataFrame], output_path: Path, palette: Dict[str, str] +) -> None: """Plot contamination results for each segmentation method. Args: @@ -1007,18 +987,18 @@ def plot_contamination_results(contamination_results: Dict[str, pd.DataFrame], o output_path (Path): Path to the directory where the plot will be saved. palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. """ - contamination_results.to_csv(output_path / 'contamination_results.csv', index=True) + contamination_results.to_csv(output_path / "contamination_results.csv", index=True) for method, df in contamination_results.items(): plt.figure(figsize=(10, 6)) - sns.heatmap(df, annot=True, cmap='coolwarm', linewidths=0.5) - plt.title(f'Contamination Matrix for {method}') - plt.xlabel('Target Cell Type') - plt.ylabel('Source Cell Type') + sns.heatmap(df, annot=True, cmap="coolwarm", linewidths=0.5) + plt.title(f"Contamination Matrix for {method}") + plt.xlabel("Target Cell Type") + plt.ylabel("Source Cell Type") plt.tight_layout() - plt.savefig(output_path / f'{method}_contamination_matrix.pdf', bbox_inches='tight') + plt.savefig(output_path / f"{method}_contamination_matrix.pdf", bbox_inches="tight") plt.show() - - + + def plot_contamination_boxplots(boxplot_data: pd.DataFrame, output_path: Path, palette: Dict[str, str]) -> None: """Plot boxplots for contamination values across different segmentation methods. @@ -1027,31 +1007,25 @@ def plot_contamination_boxplots(boxplot_data: pd.DataFrame, output_path: Path, p output_path (Path): Path to the directory where the plot will be saved. 
palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. """ - boxplot_data.to_csv(output_path / 'contamination_box_results.csv', index=True) + boxplot_data.to_csv(output_path / "contamination_box_results.csv", index=True) plt.figure(figsize=(14, 8)) - sns.boxplot( - x='Source Cell Type', - y='Contamination', - hue='Segmentation Method', - data=boxplot_data, - palette=palette - ) - plt.title('Neighborhood Contamination') - plt.xlabel('Source Cell Type') - plt.ylabel('Contamination') - plt.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left') - plt.xticks(rotation=45, ha='right') - + sns.boxplot(x="Source Cell Type", y="Contamination", hue="Segmentation Method", data=boxplot_data, palette=palette) + plt.title("Neighborhood Contamination") + plt.xlabel("Source Cell Type") + plt.ylabel("Contamination") + plt.legend(title="", bbox_to_anchor=(1.05, 1), loc="upper left") + plt.xticks(rotation=45, ha="right") + plt.tight_layout() - plt.savefig(output_path / 'contamination_boxplots.pdf', bbox_inches='tight') + plt.savefig(output_path / "contamination_boxplots.pdf", bbox_inches="tight") plt.show() - - + + def plot_umaps_with_scores( - segmentations_dict: Dict[str, sc.AnnData], - clustering_scores: Dict[str, Tuple[float, float]], - output_path: Path, - palette: Dict[str, str] + segmentations_dict: Dict[str, sc.AnnData], + clustering_scores: Dict[str, Tuple[float, float]], + output_path: Path, + palette: Dict[str, str], ) -> None: """Plot UMAPs colored by cell type for each segmentation method and display clustering scores in the title. Args: @@ -1069,17 +1043,15 @@ def plot_umaps_with_scores( plt.figure(figsize=(8, 6)) sc.pp.neighbors(adata_copy, n_neighbors=5) sc.tl.umap(adata_copy, spread=5) - sc.pl.umap(adata_copy, color='celltype_major', palette=palette, show=False) + sc.pl.umap(adata_copy, color="celltype_major", palette=palette, show=False) # Add clustering scores to the title - ch_score, sh_score = compute_clustering_scores(adata_copy, cell_type_column='celltype_major') + ch_score, sh_score = compute_clustering_scores(adata_copy, cell_type_column="celltype_major") plt.title(f"{method} - UMAP\nCalinski-Harabasz: {ch_score:.2f}, Silhouette: {sh_score:.2f}") # Save the figure - plt.savefig(output_path / f'{method}_umap_with_scores.pdf', bbox_inches='tight') + plt.savefig(output_path / f"{method}_umap_with_scores.pdf", bbox_inches="tight") plt.show() - - def plot_entropy_boxplots(entropy_boxplot_data: pd.DataFrame, output_path: Path, palette: Dict[str, str]) -> None: """Plot boxplots for neighborhood entropy across different segmentation methods by cell type. 
@@ -1090,45 +1062,37 @@ def plot_entropy_boxplots(entropy_boxplot_data: pd.DataFrame, output_path: Path, """ plt.figure(figsize=(14, 8)) sns.boxplot( - x='Cell Type', - y='Neighborhood Entropy', - hue='Segmentation Method', - data=entropy_boxplot_data, - palette=palette + x="Cell Type", y="Neighborhood Entropy", hue="Segmentation Method", data=entropy_boxplot_data, palette=palette ) - plt.title('Neighborhood Entropy') - plt.xlabel('Cell Type') - plt.ylabel('Neighborhood Entropy') - plt.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left') - plt.xticks(rotation=45, ha='right') + plt.title("Neighborhood Entropy") + plt.xlabel("Cell Type") + plt.ylabel("Neighborhood Entropy") + plt.legend(title="", bbox_to_anchor=(1.05, 1), loc="upper left") + plt.xticks(rotation=45, ha="right") plt.tight_layout() - plt.savefig(output_path / 'neighborhood_entropy_boxplots.pdf', bbox_inches='tight') + plt.savefig(output_path / "neighborhood_entropy_boxplots.pdf", bbox_inches="tight") plt.show() - - -def plot_sensitivity_boxplots(sensitivity_boxplot_data: pd.DataFrame, output_path: Path, palette: Dict[str, str]) -> None: +def plot_sensitivity_boxplots( + sensitivity_boxplot_data: pd.DataFrame, output_path: Path, palette: Dict[str, str] +) -> None: """Plot boxplots for sensitivity across different segmentation methods by cell type. Args: sensitivity_boxplot_data (pd.DataFrame): DataFrame containing sensitivity data for all segmentation methods. output_path (Path): Path to the directory where the plot will be saved. palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. """ - sensitivity_boxplot_data.to_csv(output_path / 'sensitivity_results.csv', index=True) + sensitivity_boxplot_data.to_csv(output_path / "sensitivity_results.csv", index=True) plt.figure(figsize=(14, 8)) sns.boxplot( - x='Cell Type', - y='Sensitivity', - hue='Segmentation Method', - data=sensitivity_boxplot_data, - palette=palette + x="Cell Type", y="Sensitivity", hue="Segmentation Method", data=sensitivity_boxplot_data, palette=palette ) - plt.title('Sensitivity Score') - plt.xlabel('Cell Type') - plt.ylabel('Sensitivity') - plt.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left') - plt.xticks(rotation=45, ha='right') + plt.title("Sensitivity Score") + plt.xlabel("Cell Type") + plt.ylabel("Sensitivity") + plt.legend(title="", bbox_to_anchor=(1.05, 1), loc="upper left") + plt.xticks(rotation=45, ha="right") plt.tight_layout() - plt.savefig(output_path / 'sensitivity_boxplots.pdf', bbox_inches='tight') - plt.show() \ No newline at end of file + plt.savefig(output_path / "sensitivity_boxplots.pdf", bbox_inches="tight") + plt.show() diff --git a/src/segger/validation/xenium_explorer.py b/src/segger/validation/xenium_explorer.py index d4edcd9..6ad5fc9 100644 --- a/src/segger/validation/xenium_explorer.py +++ b/src/segger/validation/xenium_explorer.py @@ -10,7 +10,6 @@ from typing import Dict, Any, Optional, List, Tuple - def str_to_uint32(cell_id_str: str) -> Tuple[int, int]: """Convert a string cell ID back to uint32 format. @@ -20,18 +19,31 @@ def str_to_uint32(cell_id_str: str) -> Tuple[int, int]: Returns: Tuple[int, int]: The cell ID in uint32 format and the dataset suffix. 
""" - prefix, suffix = cell_id_str.split('-') + prefix, suffix = cell_id_str.split("-") str_to_hex_mapping = { - 'a': '0', 'b': '1', 'c': '2', 'd': '3', - 'e': '4', 'f': '5', 'g': '6', 'h': '7', - 'i': '8', 'j': '9', 'k': 'a', 'l': 'b', - 'm': 'c', 'n': 'd', 'o': 'e', 'p': 'f' + "a": "0", + "b": "1", + "c": "2", + "d": "3", + "e": "4", + "f": "5", + "g": "6", + "h": "7", + "i": "8", + "j": "9", + "k": "a", + "l": "b", + "m": "c", + "n": "d", + "o": "e", + "p": "f", } - hex_prefix = ''.join([str_to_hex_mapping[char] for char in prefix]) + hex_prefix = "".join([str_to_hex_mapping[char] for char in prefix]) cell_id_uint32 = int(hex_prefix, 16) dataset_suffix = int(suffix) return cell_id_uint32, dataset_suffix + def get_indices_indptr(input_array: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Get the indices and indptr arrays for sparse matrix representation. @@ -47,13 +59,14 @@ def get_indices_indptr(input_array: np.ndarray) -> Tuple[np.ndarray, np.ndarray] for cluster in clusters: cluster_indices = np.where(input_array == cluster)[0] - indptr[cluster-1] = len(indices) + indptr[cluster - 1] = len(indices) indices.extend(cluster_indices) indices.extend(-np.zeros(len(input_array[input_array == 0]))) indices = np.array(indices, dtype=np.int32).astype(np.uint32) return indices, indptr + def save_cell_clustering(merged: pd.DataFrame, zarr_path: str, columns: List[str]) -> None: """Save cell clustering information to a Zarr file. @@ -64,35 +77,38 @@ def save_cell_clustering(merged: pd.DataFrame, zarr_path: str, columns: List[str """ import zarr - new_zarr = zarr.open(zarr_path, mode='w') - new_zarr.create_group('/cell_groups') + new_zarr = zarr.open(zarr_path, mode="w") + new_zarr.create_group("/cell_groups") mappings = [] for index, column in enumerate(columns): - new_zarr['cell_groups'].create_group(index) + new_zarr["cell_groups"].create_group(index) classes = list(np.unique(merged[column].astype(str))) - mapping_dict = {key: i for i, key in zip(range(1, len(classes)), [k for k in classes if k != 'nan'])} - mapping_dict['nan'] = 0 + mapping_dict = {key: i for i, key in zip(range(1, len(classes)), [k for k in classes if k != "nan"])} + mapping_dict["nan"] = 0 clusters = merged[column].astype(str).replace(mapping_dict).values.astype(int) indices, indptr = get_indices_indptr(clusters) - new_zarr['cell_groups'][index].create_dataset('indices', data=indices) - new_zarr['cell_groups'][index].create_dataset('indptr', data=indptr) + new_zarr["cell_groups"][index].create_dataset("indices", data=indices) + new_zarr["cell_groups"][index].create_dataset("indptr", data=indptr) mappings.append(mapping_dict) - new_zarr['cell_groups'].attrs.update({ - "major_version": 1, - "minor_version": 0, - "number_groupings": len(columns), - "grouping_names": columns, - "group_names": [ - [k for k, v in sorted(mapping_dict.items(), key=lambda item: item[1])][1:] for mapping_dict in mappings - ] - }) + new_zarr["cell_groups"].attrs.update( + { + "major_version": 1, + "minor_version": 0, + "number_groupings": len(columns), + "grouping_names": columns, + "group_names": [ + [k for k, v in sorted(mapping_dict.items(), key=lambda item: item[1])][1:] for mapping_dict in mappings + ], + } + ) new_zarr.store.close() -def draw_umap(adata, column: str = 'leiden') -> None: + +def draw_umap(adata, column: str = "leiden") -> None: """Draw UMAP plots for the given AnnData object. 
Args: @@ -102,12 +118,13 @@ def draw_umap(adata, column: str = 'leiden') -> None: sc.pl.umap(adata, color=[column]) plt.show() - sc.pl.umap(adata, color=['KRT5', 'KRT7'], vmax='p95') + sc.pl.umap(adata, color=["KRT5", "KRT7"], vmax="p95") plt.show() - sc.pl.umap(adata, color=['ACTA2', 'PTPRC'], vmax='p95') + sc.pl.umap(adata, color=["ACTA2", "PTPRC"], vmax="p95") plt.show() + def get_leiden_umap(adata, draw: bool = False): """Perform Leiden clustering and UMAP visualization on the given AnnData object. @@ -123,12 +140,9 @@ def get_leiden_umap(adata, draw: bool = False): gene_names = adata.var_names mean_expression_values = adata.X.mean(axis=0) - gene_mean_expression_df = pd.DataFrame({ - 'gene_name': gene_names, - 'mean_expression': mean_expression_values - }) - top_genes = gene_mean_expression_df.sort_values(by='mean_expression', ascending=False).head(30) - top_gene_names = top_genes['gene_name'].tolist() + gene_mean_expression_df = pd.DataFrame({"gene_name": gene_names, "mean_expression": mean_expression_values}) + top_genes = gene_mean_expression_df.sort_values(by="mean_expression", ascending=False).head(30) + top_gene_names = top_genes["gene_name"].tolist() sc.pp.normalize_total(adata) sc.pp.log1p(adata) @@ -137,11 +151,12 @@ def get_leiden_umap(adata, draw: bool = False): sc.tl.leiden(adata) if draw: - draw_umap(adata, 'leiden') + draw_umap(adata, "leiden") return adata -def get_median_expression_table(adata, column: str = 'leiden') -> pd.DataFrame: + +def get_median_expression_table(adata, column: str = "leiden") -> pd.DataFrame: """Get the median expression table for the given AnnData object. Args: @@ -151,7 +166,23 @@ def get_median_expression_table(adata, column: str = 'leiden') -> pd.DataFrame: Returns: pd.DataFrame: The median expression table. 
""" - top_genes = ['GATA3', 'ACTA2', 'KRT7', 'KRT8', 'KRT5', 'AQP1', 'SERPINA3', 'PTGDS', 'CXCR4', 'SFRP1', 'ENAH', 'MYH11', 'SVIL', 'KRT14', 'CD4'] + top_genes = [ + "GATA3", + "ACTA2", + "KRT7", + "KRT8", + "KRT5", + "AQP1", + "SERPINA3", + "PTGDS", + "CXCR4", + "SFRP1", + "ENAH", + "MYH11", + "SVIL", + "KRT14", + "CD4", + ] top_gene_indices = [adata.var_names.get_loc(gene) for gene in top_genes] clusters = adata.obs[column] @@ -160,26 +191,29 @@ def get_median_expression_table(adata, column: str = 'leiden') -> pd.DataFrame: for cluster in clusters.unique(): cluster_cells = adata[clusters == cluster].X cluster_expression = cluster_cells[:, top_gene_indices] - gene_medians = [pd.Series(cluster_expression[:, gene_idx]).median() for gene_idx in range(len(top_gene_indices))] - cluster_data[f'Cluster_{cluster}'] = gene_medians + gene_medians = [ + pd.Series(cluster_expression[:, gene_idx]).median() for gene_idx in range(len(top_gene_indices)) + ] + cluster_data[f"Cluster_{cluster}"] = gene_medians cluster_expression_df = pd.DataFrame(cluster_data, index=top_genes) sorted_columns = sorted(cluster_expression_df.columns.values, key=lambda x: int(x.split("_")[-1])) cluster_expression_df = cluster_expression_df[sorted_columns] - return cluster_expression_df.T.style.background_gradient(cmap='Greens') + return cluster_expression_df.T.style.background_gradient(cmap="Greens") + def seg2explorer( - seg_df: pd.DataFrame, - source_path: str, - output_dir: str, - cells_filename: str = 'seg_cells', - analysis_filename: str = "seg_analysis", + seg_df: pd.DataFrame, + source_path: str, + output_dir: str, + cells_filename: str = "seg_cells", + analysis_filename: str = "seg_analysis", xenium_filename: str = "seg_experiment.xenium", - analysis_df: Optional[pd.DataFrame] = None, - draw: bool = False, - cell_id_columns: str = 'seg_cell_id', + analysis_df: Optional[pd.DataFrame] = None, + draw: bool = False, + cell_id_columns: str = "seg_cell_id", area_low: float = 10, - area_high: float = 100 + area_high: float = 100, ) -> None: """Convert seg output to a format compatible with Xenium explorer. 
@@ -214,8 +248,8 @@ def seg2explorer( for cell_incremental_id, (seg_cell_id, seg_cell) in tqdm(enumerate(grouped_by), total=len(grouped_by)): if len(seg_cell) < 5: continue - - cell_convex_hull = ConvexHull(seg_cell[['x_location', 'y_location']]) + + cell_convex_hull = ConvexHull(seg_cell[["x_location", "y_location"]]) if cell_convex_hull.area > area_high: continue if cell_convex_hull.area < area_low: @@ -224,25 +258,31 @@ def seg2explorer( uint_cell_id = cell_incremental_id + 1 cell_id2old_id[uint_cell_id] = seg_cell_id - seg_nucleous = seg_cell[seg_cell['overlaps_nucleus'] == 1] + seg_nucleous = seg_cell[seg_cell["overlaps_nucleus"] == 1] if len(seg_nucleous) >= 3: - nucleus_convex_hull = ConvexHull(seg_nucleous[['x_location', 'y_location']]) + nucleus_convex_hull = ConvexHull(seg_nucleous[["x_location", "y_location"]]) cell_id.append(uint_cell_id) - cell_summary.append({ - "cell_centroid_x": seg_cell['x_location'].mean(), - "cell_centroid_y": seg_cell['y_location'].mean(), - "cell_area": cell_convex_hull.area, - "nucleus_centroid_x": seg_cell['x_location'].mean(), - "nucleus_centroid_y": seg_cell['y_location'].mean(), - "nucleus_area": cell_convex_hull.area, - "z_level": (seg_cell.z_location.mean() // 3).round(0) * 3 - }) - - polygon_num_vertices[0].append(len(cell_convex_hull.vertices)) + cell_summary.append( + { + "cell_centroid_x": seg_cell["x_location"].mean(), + "cell_centroid_y": seg_cell["y_location"].mean(), + "cell_area": cell_convex_hull.area, + "nucleus_centroid_x": seg_cell["x_location"].mean(), + "nucleus_centroid_y": seg_cell["y_location"].mean(), + "nucleus_area": cell_convex_hull.area, + "z_level": (seg_cell.z_location.mean() // 3).round(0) * 3, + } + ) + + polygon_num_vertices[0].append(len(cell_convex_hull.vertices)) polygon_num_vertices[1].append(len(nucleus_convex_hull.vertices) if len(seg_nucleous) >= 3 else 0) - polygon_vertices[0].append(seg_cell[['x_location', 'y_location']].values[cell_convex_hull.vertices]) - polygon_vertices[1].append(seg_nucleous[['x_location', 'y_location']].values[nucleus_convex_hull.vertices] if len(seg_nucleous) >= 3 else np.array([[], []]).T) + polygon_vertices[0].append(seg_cell[["x_location", "y_location"]].values[cell_convex_hull.vertices]) + polygon_vertices[1].append( + seg_nucleous[["x_location", "y_location"]].values[nucleus_convex_hull.vertices] + if len(seg_nucleous) >= 3 + else np.array([[], []]).T + ) seg_mask_value.append(cell_incremental_id + 1) cell_polygon_vertices = get_flatten_version(polygon_vertices[0], max_value=21) @@ -251,66 +291,80 @@ def seg2explorer( cells = { "cell_id": np.array([np.array(cell_id), np.ones(len(cell_id))], dtype=np.uint32).T, "cell_summary": pd.DataFrame(cell_summary).values.astype(np.float64), - "polygon_num_vertices": np.array([ - [min(x+1, x+1) for x in polygon_num_vertices[1]], - [min(x+1, x+1) for x in polygon_num_vertices[0]] - ], dtype=np.int32), + "polygon_num_vertices": np.array( + [ + [min(x + 1, x + 1) for x in polygon_num_vertices[1]], + [min(x + 1, x + 1) for x in polygon_num_vertices[0]], + ], + dtype=np.int32, + ), "polygon_vertices": np.array([nucl_polygon_vertices, cell_polygon_vertices]).astype(np.float32), - "seg_mask_value": np.array(seg_mask_value, dtype=np.int32) + "seg_mask_value": np.array(seg_mask_value, dtype=np.int32), } - - existing_store = zarr.open(source_path / 'cells.zarr.zip', mode='r') - new_store = zarr.open(storage / f'{cells_filename}.zarr.zip', mode='w') - - new_store['cell_id'] = cells['cell_id'] - new_store['polygon_num_vertices'] = 
cells['polygon_num_vertices'] - new_store['polygon_vertices'] = cells['polygon_vertices'] - new_store['seg_mask_value'] = cells['seg_mask_value'] - + + existing_store = zarr.open(source_path / "cells.zarr.zip", mode="r") + new_store = zarr.open(storage / f"{cells_filename}.zarr.zip", mode="w") + + new_store["cell_id"] = cells["cell_id"] + new_store["polygon_num_vertices"] = cells["polygon_num_vertices"] + new_store["polygon_vertices"] = cells["polygon_vertices"] + new_store["seg_mask_value"] = cells["seg_mask_value"] + new_store.attrs.update(existing_store.attrs) - new_store.attrs['number_cells'] = len(cells['cell_id']) + new_store.attrs["number_cells"] = len(cells["cell_id"]) new_store.store.close() - + if analysis_df is None: analysis_df = pd.DataFrame([cell_id2old_id[i] for i in cell_id], columns=[cell_id_columns]) - analysis_df['default'] = 'seg' - + analysis_df["default"] = "seg" + zarr_df = pd.DataFrame([cell_id2old_id[i] for i in cell_id], columns=[cell_id_columns]) - clustering_df = pd.merge(zarr_df, analysis_df, how='left', on=cell_id_columns) + clustering_df = pd.merge(zarr_df, analysis_df, how="left", on=cell_id_columns) clusters_names = [i for i in analysis_df.columns if i != cell_id_columns] - clusters_dict = {cluster: {j: i for i, j in zip(range(1, len(sorted(np.unique(clustering_df[cluster].dropna()))) + 1), sorted(np.unique(clustering_df[cluster].dropna())))} for cluster in clusters_names} + clusters_dict = { + cluster: { + j: i + for i, j in zip( + range(1, len(sorted(np.unique(clustering_df[cluster].dropna()))) + 1), + sorted(np.unique(clustering_df[cluster].dropna())), + ) + } + for cluster in clusters_names + } - new_zarr = zarr.open(storage / (analysis_filename + ".zarr.zip"), mode='w') - new_zarr.create_group('/cell_groups') + new_zarr = zarr.open(storage / (analysis_filename + ".zarr.zip"), mode="w") + new_zarr.create_group("/cell_groups") clusters = [[clusters_dict[cluster].get(x, 0) for x in list(clustering_df[cluster])] for cluster in clusters_names] for i in range(len(clusters)): - new_zarr['cell_groups'].create_group(i) + new_zarr["cell_groups"].create_group(i) indices, indptr = get_indices_indptr(np.array(clusters[i])) - new_zarr['cell_groups'][i].create_dataset('indices', data=indices) - new_zarr['cell_groups'][i].create_dataset('indptr', data=indptr) - - new_zarr['cell_groups'].attrs.update({ - "major_version": 1, - "minor_version": 0, - "number_groupings": len(clusters_names), - "grouping_names": clusters_names, - "group_names": [ - [x[0] for x in sorted(clusters_dict[cluster].items(), key=lambda x: x[1])] - for cluster in clusters_names - ] - }) + new_zarr["cell_groups"][i].create_dataset("indices", data=indices) + new_zarr["cell_groups"][i].create_dataset("indptr", data=indptr) + + new_zarr["cell_groups"].attrs.update( + { + "major_version": 1, + "minor_version": 0, + "number_groupings": len(clusters_names), + "grouping_names": clusters_names, + "group_names": [ + [x[0] for x in sorted(clusters_dict[cluster].items(), key=lambda x: x[1])] for cluster in clusters_names + ], + } + ) new_zarr.store.close() generate_experiment_file( - template_path=source_path / 'experiment.xenium', + template_path=source_path / "experiment.xenium", output_path=storage / xenium_filename, cells_name=cells_filename, - analysis_name=analysis_filename + analysis_name=analysis_filename, ) + def get_flatten_version(polygons: List[np.ndarray], max_value: int = 21) -> np.ndarray: """Get the flattened version of polygon vertices. 
@@ -326,23 +380,21 @@ def get_flatten_version(polygons: List[np.ndarray], max_value: int = 21) -> np.n for i, polygon in tqdm(enumerate(polygons), total=len(polygons)): num_points = len(polygon) if num_points == 0: - result[i] = np.zeros(n*2) + result[i] = np.zeros(n * 2) continue elif num_points < max_value: repeated_points = np.tile(polygon[0], (n - num_points, 1)) padded_polygon = np.concatenate((polygon, repeated_points), axis=0) else: padded_polygon = np.zeros((n, 2)) - padded_polygon[:min(num_points, n)] = polygon[:min(num_points, n)] + padded_polygon[: min(num_points, n)] = polygon[: min(num_points, n)] padded_polygon[-1] = polygon[0] result[i] = padded_polygon.flatten() return result + def generate_experiment_file( - template_path: str, - output_path: str, - cells_name: str = "seg_cells", - analysis_name: str = 'seg_analysis' + template_path: str, output_path: str, cells_name: str = "seg_cells", analysis_name: str = "seg_analysis" ) -> None: """Generate the experiment file for Xenium. @@ -357,12 +409,12 @@ def generate_experiment_file( with open(template_path) as f: experiment = json.load(f) - experiment['images'].pop('morphology_filepath') - experiment['images'].pop('morphology_focus_filepath') + experiment["images"].pop("morphology_filepath") + experiment["images"].pop("morphology_focus_filepath") - experiment['xenium_explorer_files']['cells_zarr_filepath'] = f"{cells_name}.zarr.zip" - experiment['xenium_explorer_files'].pop('cell_features_zarr_filepath') - experiment['xenium_explorer_files']['analysis_zarr_filepath'] = f"{analysis_name}.zarr.zip" + experiment["xenium_explorer_files"]["cells_zarr_filepath"] = f"{cells_name}.zarr.zip" + experiment["xenium_explorer_files"].pop("cell_features_zarr_filepath") + experiment["xenium_explorer_files"]["analysis_zarr_filepath"] = f"{analysis_name}.zarr.zip" - with open(output_path, 'w') as f: + with open(output_path, "w") as f: json.dump(experiment, f, indent=2) diff --git a/tests/test_data.py b/tests/test_data.py index 3f3c5b8..540b7b7 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -4,49 +4,48 @@ import torch from torch_geometric.data import Data from segger.data.utils import * +from segger.data import XeniumSample import unittest import pandas as pd + class TestDataUtils(unittest.TestCase): def test_filter_transcripts(self): - data = { - 'qv': [30, 10, 25], - 'feature_name': ['gene1', 'NegControlProbe_gene2', 'gene3'] - } + data = {"qv": [30, 10, 25], "feature_name": ["gene1", "NegControlProbe_gene2", "gene3"]} df = pd.DataFrame(data) filtered_df = filter_transcripts(df, min_qv=20) self.assertEqual(len(filtered_df), 2) - self.assertTrue('gene1' in filtered_df['feature_name'].values) - self.assertTrue('gene3' in filtered_df['feature_name'].values) + self.assertTrue("gene1" in filtered_df["feature_name"].values) + self.assertTrue("gene3" in filtered_df["feature_name"].values) def test_compute_transcript_metrics(self): data = { - 'qv': [40, 40, 25, 25], - 'feature_name': ['gene1', 'gene2', 'gene1', 'gene2'], - 'cell_id': [1, 1, -1, 2], - 'overlaps_nucleus': [1, 0, 0, 1] + "qv": [40, 40, 25, 25], + "feature_name": ["gene1", "gene2", "gene1", "gene2"], + "cell_id": [1, 1, -1, 2], + "overlaps_nucleus": [1, 0, 0, 1], } df = pd.DataFrame(data) metrics = compute_transcript_metrics(df, qv_threshold=30) - self.assertAlmostEqual(metrics['percent_assigned'], 50.0) - self.assertAlmostEqual(metrics['percent_cytoplasmic'], 50.0) - self.assertAlmostEqual(metrics['percent_nucleus'], 50.0) - 
self.assertAlmostEqual(metrics['percent_non_assigned_cytoplasmic'], 100.0) - self.assertEqual(len(metrics['gene_metrics']), 2) - self.assertTrue('gene1' in metrics['gene_metrics']['feature_name'].values) - self.assertTrue('gene2' in metrics['gene_metrics']['feature_name'].values) - + self.assertAlmostEqual(metrics["percent_assigned"], 50.0) + self.assertAlmostEqual(metrics["percent_cytoplasmic"], 50.0) + self.assertAlmostEqual(metrics["percent_nucleus"], 50.0) + self.assertAlmostEqual(metrics["percent_non_assigned_cytoplasmic"], 100.0) + self.assertEqual(len(metrics["gene_metrics"]), 2) + self.assertTrue("gene1" in metrics["gene_metrics"]["feature_name"].values) + self.assertTrue("gene2" in metrics["gene_metrics"]["feature_name"].values) + def setUp(self): data = { - 'x_location': [100, 200, 300], - 'y_location': [100, 200, 300], - 'z_location': [0, 0, 0], - 'qv': [40, 40, 25], - 'feature_name': ['gene1', 'gene2', 'gene3'], - 'transcript_id': [1, 2, 3], - 'overlaps_nucleus': [1, 0, 1], - 'cell_id': [1, -1, 2] + "x_location": [100, 200, 300], + "y_location": [100, 200, 300], + "z_location": [0, 0, 0], + "qv": [40, 40, 25], + "feature_name": ["gene1", "gene2", "gene3"], + "transcript_id": [1, 2, 3], + "overlaps_nucleus": [1, 0, 1], + "cell_id": [1, -1, 2], } self.df = pd.DataFrame(data) self.sample = XeniumSample(self.df) @@ -54,21 +53,18 @@ def setUp(self): def test_crop_transcripts(self): cropped_sample = self.sample.crop_transcripts(50, 50, 200, 200) self.assertEqual(len(cropped_sample.transcripts_df), 1) - self.assertEqual(cropped_sample.transcripts_df.iloc[0]['feature_name'], 'gene1') + self.assertEqual(cropped_sample.transcripts_df.iloc[0]["feature_name"], "gene1") def test_filter_transcripts(self): filtered_df = XeniumSample.filter_transcripts(self.df, min_qv=30) self.assertEqual(len(filtered_df), 2) - self.assertTrue('gene1' in filtered_df['feature_name'].values) - self.assertTrue('gene2' in filtered_df['feature_name'].values) + self.assertTrue("gene1" in filtered_df["feature_name"].values) + self.assertTrue("gene2" in filtered_df["feature_name"].values) def test_unassign_all_except_nucleus(self): unassigned_df = XeniumSample.unassign_all_except_nucleus(self.df) - self.assertEqual(unassigned_df.loc[unassigned_df['overlaps_nucleus'] == 0, 'cell_id'].values[0], 'UNASSIGNED') + self.assertEqual(unassigned_df.loc[unassigned_df["overlaps_nucleus"] == 0, "cell_id"].values[0], "UNASSIGNED") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() - - - diff --git a/tests/test_model.py b/tests/test_model.py index 802c17b..b6dfdf0 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -4,21 +4,18 @@ from torch_geometric.nn import to_hetero from torch_geometric.data import HeteroData + class TestSeggerModel(unittest.TestCase): def setUp(self): - model = Segger( - init_emb=16, hidden_channels=32, out_channels=32, heads=3 - ) - metadata = ( - ["tx", "nc"], [("tx", "belongs", "nc"), ("tx", "neighbors", "tx")] - ) - self.model = to_hetero(model, metadata=metadata, aggr='sum') + model = Segger(init_emb=16, hidden_channels=32, out_channels=32, heads=3) + metadata = (["tx", "nc"], [("tx", "belongs", "nc"), ("tx", "neighbors", "tx")]) + self.model = to_hetero(model, metadata=metadata, aggr="sum") self.data = HeteroData() - self.data['tx'].x = torch.randn(10, 16) - self.data['nc'].x = torch.randn(5, 16) - self.data['tx', 'belongs', 'nc'].edge_index = torch.tensor([[0, 1, 2], [0, 1, 2]], dtype=torch.long) - self.data['tx', 'neighbors', 'tx'].edge_index = torch.tensor([[0, 1], [1, 
2]], dtype=torch.long) + self.data["tx"].x = torch.randn(10, 16) + self.data["nc"].x = torch.randn(5, 16) + self.data["tx", "belongs", "nc"].edge_index = torch.tensor([[0, 1, 2], [0, 1, 2]], dtype=torch.long) + self.data["tx", "neighbors", "tx"].edge_index = torch.tensor([[0, 1], [1, 2]], dtype=torch.long) def test_forward(self): out = self.model(self.data.x_dict, self.data.edge_index_dict) @@ -26,13 +23,15 @@ def test_forward(self): self.assertTrue("nc" in out) self.assertEqual(out["tx"].shape[1], 32 * 3) self.assertEqual(out["nc"].shape[1], 32 * 3) - ''' + + """ def test_decode(self): z = {'tx': torch.randn(10, 16), 'nc': torch.randn(5, 16)} edge_label_index = torch.tensor([[0, 1, 2], [0, 1, 2]], dtype=torch.long) out = self.model.decode(z, edge_label_index) self.assertEqual(out.shape[0], 3) - ''' + """ + if __name__ == "__main__": unittest.main() diff --git a/tests/test_prediction.py b/tests/test_prediction.py index 9d90316..fd77227 100644 --- a/tests/test_prediction.py +++ b/tests/test_prediction.py @@ -4,21 +4,23 @@ from segger.models.segger_model import Segger from torch_geometric.data import HeteroData + class TestPrediction(unittest.TestCase): def setUp(self): self.model = Segger(init_emb=16, hidden_channels=32, out_channels=32, heads=3) - self.lit_model = load_model("path/to/checkpoint", 16, 32, 32, 3, 'sum') + self.lit_model = load_model("path/to/checkpoint", 16, 32, 32, 3, "sum") self.data = HeteroData() - self.data['tx'].x = torch.randn(10, 16) - self.data['nc'].x = torch.randn(5, 16) - self.data['tx', 'belongs', 'nc'].edge_label_index = torch.tensor([[0, 1, 2], [0, 1, 2]], dtype=torch.long) - self.data['tx', 'neighbors', 'tx'].edge_index = torch.tensor([[0, 1], [1, 2]], dtype=torch.long) + self.data["tx"].x = torch.randn(10, 16) + self.data["nc"].x = torch.randn(5, 16) + self.data["tx", "belongs", "nc"].edge_label_index = torch.tensor([[0, 1, 2], [0, 1, 2]], dtype=torch.long) + self.data["tx", "neighbors", "tx"].edge_index = torch.tensor([[0, 1], [1, 2]], dtype=torch.long) def test_predict(self): output_path = "path/to/output.csv.gz" predict(self.lit_model, "path/to/dataset", output_path, 0.5, 4, 20, 5, 10) self.assertTrue(os.path.exists(output_path)) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/tests/test_training.py b/tests/test_training.py index 5154fef..11615f8 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -4,40 +4,32 @@ from torch_geometric.data import HeteroData import torch + class TestTraining(unittest.TestCase): def setUp(self): # Setup model and data - metadata = ( - ["tx", "nc"], [("tx", "belongs", "nc"), ("tx", "neighbors", "tx")] - ) + metadata = (["tx", "nc"], [("tx", "belongs", "nc"), ("tx", "neighbors", "tx")]) self.lit_segger = LitSegger( init_emb=16, hidden_channels=32, out_channels=32, heads=3, metadata=metadata, - aggr='sum', + aggr="sum", ) self.data = HeteroData() self.data["tx"].x = torch.randn(10, 16) self.data["nc"].x = torch.randn(5, 16) - self.data["tx", "belongs", "nc"].edge_label_index = torch.tensor( - [[0, 1, 2], [0, 1, 2]], dtype=torch.long - ) - self.data["tx", "belongs", "nc"].edge_label = torch.tensor( - [1.0, 0.0, 1.0], dtype=torch.float - ) - self.data["tx", "neighbors", "tx"].edge_index = torch.tensor( - [[0, 1], [1, 2]], dtype=torch.long - ) - + self.data["tx", "belongs", "nc"].edge_label_index = torch.tensor([[0, 1, 2], [0, 1, 2]], dtype=torch.long) + self.data["tx", "belongs", "nc"].edge_label = torch.tensor([1.0, 0.0, 1.0], dtype=torch.float) + self.data["tx", "neighbors", 
"tx"].edge_index = torch.tensor([[0, 1], [1, 2]], dtype=torch.long) + # Move model and data to GPU self.lit_segger.cuda() self.data.to("cuda") - def test_training_step(self): optimizer = self.lit_segger.configure_optimizers() self.lit_segger.train() @@ -47,5 +39,6 @@ def test_training_step(self): optimizer.step() self.assertGreater(loss.item(), 0) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() From 97118c07b3f62b8bd4487f7bdecf7468517861e9 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 13 Oct 2024 18:54:31 +0200 Subject: [PATCH 083/156] fix failed imports due to missing cupy in macos --- src/segger/__init__.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/segger/__init__.py b/src/segger/__init__.py index 59dde2b..fe1b2ec 100644 --- a/src/segger/__init__.py +++ b/src/segger/__init__.py @@ -1,9 +1,14 @@ -__all__ = ["data", "models", "prediction", "training"] +import importlib.util + +cupy_available = importlib.util.find_spec("cupy") is not None +from segger.data import * +from segger.models import * +from segger.training import * +from segger.validation import * -from .data import * +# segger.prediction requires cupy, which is not available in macOS +if cupy_available: + from segger.prediction import * -# from .models import * -# from .prediction import * -# from .training import * -# from .validation import * +__all__ = ["data", "models", "prediction", "training"] From 909f092c95739971632102d6b6c501d3f0c8231a Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 13 Oct 2024 20:17:05 +0200 Subject: [PATCH 084/156] running on xenium spatialdata zarr --- src/segger/cli/train_model.py | 1 - src/segger/data/constants.py | 28 ++++++++ src/segger/data/io.py | 128 ++++++++++++++++++++++++++++++---- 3 files changed, 143 insertions(+), 14 deletions(-) diff --git a/src/segger/cli/train_model.py b/src/segger/cli/train_model.py index a3a23a7..0fe1439 100644 --- a/src/segger/cli/train_model.py +++ b/src/segger/cli/train_model.py @@ -44,7 +44,6 @@ def train_model(args: Namespace): # Import packages logging.info("Importing packages...") - from segger.data.io import XeniumSample from segger.training.train import LitSegger from segger.training.segger_data_module import SeggerDataModule from lightning.pytorch.loggers import CSVLogger diff --git a/src/segger/data/constants.py b/src/segger/data/constants.py index b48350f..14f9287 100644 --- a/src/segger/data/constants.py +++ b/src/segger/data/constants.py @@ -85,3 +85,31 @@ class MerscopeKeys(Enum): COUNTS_CELL_KEY = "cell" CELL_X = "center_x" CELL_Y = "center_y" + + +class SpatialDataKeys(Enum): + """Keys for *MERSCOPE* data (Vizgen platform).""" + + # File mappings + TRANSCRIPTS_FILE = "detected_transcripts.csv" + BOUNDARIES_FILE = "cell_boundaries.parquet" + CELL_METADATA_FILE = "cell_metadata.csv" + + # Cell identifiers + CELL_ID = "cell_id" + TRANSCRIPTS_ID = "transcript_id" + + # Coordinates and locations + TRANSCRIPTS_X = "x" + TRANSCRIPTS_Y = "y" + BOUNDARIES_VERTEX_X = "center_x" + BOUNDARIES_VERTEX_Y = "center_y" + + # Metadata + FEATURE_NAME = "" + QUALITY_VALUE = "qv" + OVERLAPS_BOUNDARY = "overlaps_nucleus" + METADATA_CELL_KEY = None + COUNTS_CELL_KEY = None + CELL_X = None + CELL_Y = None diff --git a/src/segger/data/io.py b/src/segger/data/io.py index 1441227..3bb52e2 100644 --- a/src/segger/data/io.py +++ b/src/segger/data/io.py @@ -167,6 +167,7 @@ def load_transcripts( ] # Load the dataset lazily with filters applied for the bounding box + columns = 
diff --git a/src/segger/data/io.py b/src/segger/data/io.py
index 1441227..3bb52e2 100644
--- a/src/segger/data/io.py
+++ b/src/segger/data/io.py
@@ -167,6 +167,7 @@ def load_transcripts(
         ]

         # Load the dataset lazily with filters applied for the bounding box
+        columns = set(dd.read_parquet(file_path).columns)
         transcripts_df = dd.read_parquet(file_path, columns=columns_to_read, filters=filters).compute()

         # Convert transcript and cell IDs to strings lazily
@@ -253,12 +254,48 @@ def load_boundaries(
         ]

         # Load the dataset lazily with filters applied for the bounding box
-        boundaries_df = dd.read_parquet(path, columns=columns_to_read, filters=filters)
+        columns = set(dd.read_parquet(path).columns)
+        if "geometry" in columns:
+            bbox = (x_min, y_min, x_max, y_max)
+            # TODO: check that SpatialData objects write the "bbox covering metadata" to the parquet file
+            gdf = dgpd.read_parquet(path, bbox=bbox)
+            id_col, x_col, y_col = (
+                self.keys.CELL_ID.value,
+                self.keys.BOUNDARIES_VERTEX_X.value,
+                self.keys.BOUNDARIES_VERTEX_Y.value,
+            )

-        # Convert the cell IDs to strings lazily
-        boundaries_df[self.keys.CELL_ID.value] = boundaries_df[self.keys.CELL_ID.value].apply(
-            lambda x: str(x) if pd.notnull(x) else None, meta=("cell_id", "object")
-        )
+            # Function to expand each polygon into a list of vertices
+            def expand_polygon(row):
+                expanded_data = []
+                polygon = row["geometry"]
+                if polygon.geom_type == "Polygon":
+                    exterior_coords = polygon.exterior.coords
+                    for x, y in exterior_coords:
+                        expanded_data.append({id_col: row.name, x_col: x, y_col: y})
+                else:
+                    # Instead of expanding the gdf and then having code later to recreate it (when computing the pyg graph)
+                    # we could directly have this function returning a Dask GeoDataFrame. This means that we don't need
+                    # to implement this else block
+                    raise ValueError(f"Unsupported geometry type: {polygon.geom_type}")
+                return expanded_data
+
+            # Apply the function to each partition and collect results
+            def process_partition(df):
+                expanded_data = [expand_polygon(row) for _, row in df.iterrows()]
+                # Flatten the list of lists
+                flattened_data = [item for sublist in expanded_data for item in sublist]
+                return pd.DataFrame(flattened_data)
+
+            # Use map_partitions to apply the function and convert it into a Dask DataFrame
+            boundaries_df = gdf.map_partitions(process_partition, meta={id_col: str, x_col: float, y_col: float})
+        else:
+            boundaries_df = dd.read_parquet(path, columns=columns_to_read, filters=filters)
+
+            # Convert the cell IDs to strings lazily
+            boundaries_df[self.keys.CELL_ID.value] = boundaries_df[self.keys.CELL_ID.value].apply(
+                lambda x: str(x) if pd.notnull(x) else None, meta=("cell_id", "object")
+            )

         if self.verbose:
             print(f"Loaded boundaries from '{path}' within bounding box ({x_min}, {x_max}, {y_min}, {y_max}).")
@@ -272,11 +309,6 @@ def set_metadata(self) -> None:
         instead of one-hot encodings and store the lookup table for later mapping.
         """
         # Load the Parquet file metadata
-        ##
-        # luca's experiment
-        import time
-
-        # old method
         parquet_file = pq.read_table(self.transcripts_path)

         # Get the column names for X, Y, and feature names from the class's keys
@@ -300,7 +332,7 @@ def set_metadata(self) -> None:
             "UnassignedCodeword_",
         )

-        row_group_size = 1_000_000
+        row_group_size = 4_000_000
         start = 0
         n = len(parquet_file)
         while start < n:
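The `expand_polygon`/`process_partition` pair above flattens each boundary polygon into one row per exterior vertex so that boundaries read from a SpatialData parquet file can be handled as an ordinary Dask DataFrame. A minimal, self-contained sketch of the same idea on plain pandas/shapely follows; the function and column names are illustrative assumptions, not the patch's API.

    import pandas as pd
    from shapely.geometry import Polygon

    def polygon_to_vertex_rows(cell_id: str, polygon: Polygon) -> pd.DataFrame:
        # One row per exterior vertex, mirroring expand_polygon() in the hunk above.
        xs, ys = zip(*polygon.exterior.coords)
        return pd.DataFrame({"cell_id": cell_id, "vertex_x": xs, "vertex_y": ys})

    square = Polygon([(0, 0), (0, 1), (1, 1), (1, 0)])
    vertex_rows = polygon_to_vertex_rows("cell_1", square)  # 5 rows; the first vertex repeats at the end
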
@@ -486,7 +518,7 @@ def compute_transcript_overlap_with_boundaries(
             # if self.verbose:
             print(f"No precomputed polygons provided. Computing polygons from boundaries with a scale factor of {scale_factor}.")
             polygons_gdf = self.generate_and_scale_polygons(boundaries_df, scale_factor)
-            if polygons_gdf.empty():
+            if polygons_gdf.empty:
                 raise ValueError("No valid polygons were generated from the boundaries.")
         else:
             if self.verbose:
@@ -513,6 +545,12 @@ def check_overlap(transcript, polygons_gdf):
         # Apply the check_overlap function in parallel to each row using Dask's map_partitions
         if self.verbose:
             print(f"Starting overlap computation for transcripts with the boundary polygons.")
+        if isinstance(transcripts_df, pd.DataFrame):
+            # luca: I found this bug here
+            warnings.warn("BUG! This function expects Dask DataFrames, not Pandas DataFrames.")
+            # if we want to really have the below working in parallel, we need to add n_partitions>1 here
+            transcripts_df = dd.from_pandas(transcripts_df, npartitions=1)
+        transcripts_df.compute().columns
         transcripts_df = transcripts_df.map_partitions(
             lambda df: df.assign(
                 **{
@@ -986,7 +1024,9 @@ def build_pyg_data_from_tile(
         if self.keys.OVERLAPS_BOUNDARY.value not in transcripts_df.columns:
             if self.verbose:
                 print(f"Computing overlaps for transcripts...")
-            transcripts_df = self.compute_transcript_overlap_with_boundaries(transcripts_df, bd_gdf, scale_factor=1.0)
+            transcripts_df = self.compute_transcript_overlap_with_boundaries(
+                transcripts_df, polygons_gdf=bd_gdf, scale_factor=1.0
+            )

         # Connect transcripts with their corresponding boundaries (e.g., nuclei, cells)
         if self.verbose:
@@ -1099,3 +1139,65 @@ def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0)
         # Add custom Merscope-specific filtering logic if needed
         # For now, apply only the quality value filter
         return transcripts_df[transcripts_df[self.keys.QUALITY_VALUE.value] >= min_qv]
+
+
+class SpatialDataSample(SpatialTranscriptomicsSample):
+    def __init__(
+        self,
+        transcripts_df: dd.DataFrame = None,
+        transcripts_radius: int = 10,
+        boundaries_graph: bool = False,
+        embedding_df: pd.DataFrame = None,
+        feature_name: str | None = None,
+        verbose: bool = True,
+    ):
+        if feature_name is not None:
+            # luca: just a quick hack for now, I propose to use dataclasses instead of enums to address this
+            SpatialDataKeys.FEATURE_NAME._value_ = feature_name
+        else:
+            raise ValueError(
+                "the automatic determination of a feature_name from a SpatialData object is not enabled yet"
+            )
+
+        super().__init__(
+            transcripts_df, transcripts_radius, boundaries_graph, embedding_df, SpatialDataKeys, verbose=verbose
+        )
+
+    def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0) -> dd.DataFrame:
+        """
+        Filters transcripts based on quality value and removes unwanted transcripts for Xenium using Dask.
+
+        Parameters:
+            transcripts_df (dd.DataFrame): The Dask DataFrame containing transcript data.
+            min_qv (float, optional): The minimum quality value threshold for filtering transcripts.
+
+        Returns:
+            dd.DataFrame: The filtered Dask DataFrame.
+ """ + filter_codewords = ( + "NegControlProbe_", + "antisense_", + "NegControlCodeword_", + "BLANK_", + "DeprecatedCodeword_", + "UnassignedCodeword_", + ) + + # Ensure FEATURE_NAME is a string type for proper filtering (compatible with Dask) + # Handle potential bytes to string conversion for Dask DataFrame + if pd.api.types.is_object_dtype(transcripts_df[self.keys.FEATURE_NAME.value]): + transcripts_df[self.keys.FEATURE_NAME.value] = transcripts_df[self.keys.FEATURE_NAME.value].apply( + lambda x: x.decode("utf-8") if isinstance(x, bytes) else x + ) + + # Apply the quality value filter using Dask + mask_quality = transcripts_df[self.keys.QUALITY_VALUE.value] >= min_qv + + # Apply the filter for unwanted codewords using Dask string functions + mask_codewords = ~transcripts_df[self.keys.FEATURE_NAME.value].str.startswith(filter_codewords) + + # Combine the filters and return the filtered Dask DataFrame + mask = mask_quality & mask_codewords + + # Return the filtered DataFrame lazily + return transcripts_df[mask] From a857464aadaecc635e4b769c2a45315645660fd6 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 13 Oct 2024 20:17:44 +0200 Subject: [PATCH 085/156] run-precommit From 0903fe88056da45f31f7d38ff84999287e782778 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 13 Oct 2024 18:18:42 +0000 Subject: [PATCH 086/156] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../\342\236\225-performance-improvement.md" | 5 +- .../\360\237\220\233-bug-report.md" | 6 +- .../\360\237\232\200-feature-request.md" | 7 +- .github/workflows/python-publish.yml | 33 +- .github/workflows/static.yml | 2 +- .pre-commit-config.yaml | 32 +- .scripts/create_dataset.py | 44 +- .scripts/predict.py | 24 +- .scripts/train_model.py | 36 +- docs/notebooks/benchmark_bc.py | 114 ++- docs/source/conf.py | 15 +- docs/user_guide/data_creation.md | 41 +- docs/user_guide/training.md | 12 +- scripts/create_data_sample.py | 23 +- scripts/predict_model_sample.py | 33 +- scripts/train_model.py | 62 +- scripts/train_model_sample.py | 26 +- src/segger/__init__.py | 2 +- src/segger/cli/cli.py | 4 +- src/segger/cli/configs/train/default.yaml | 6 +- src/segger/cli/create_dataset.py | 75 +- src/segger/cli/create_dataset_fast.py | 52 +- src/segger/cli/predict.py | 86 ++- src/segger/cli/train_model.py | 56 +- src/segger/cli/utils.py | 34 +- src/segger/data/README.md | 89 +-- src/segger/data/__init__.py | 33 +- src/segger/data/constants.py | 7 +- src/segger/data/io.py | 549 +++++++------- src/segger/data/parquet/_experimental.py | 36 +- src/segger/data/parquet/_ndtree.py | 37 +- src/segger/data/parquet/_settings/xenium.yaml | 10 +- src/segger/data/parquet/_utils.py | 87 ++- src/segger/data/parquet/pyg_dataset.py | 24 +- src/segger/data/parquet/sample.py | 250 +++---- .../data/parquet/transcript_embedding.py | 24 +- src/segger/data/utils.py | 259 ++++--- src/segger/models/README.md | 20 +- src/segger/models/__init__.py | 4 +- src/segger/models/segger_model.py | 29 +- src/segger/prediction/__init__.py | 5 +- src/segger/prediction/predict.py | 228 +++--- src/segger/training/README.md | 6 + src/segger/training/segger_data_module.py | 6 +- src/segger/training/train.py | 40 +- src/segger/validation/__init__.py | 2 +- src/segger/validation/utils.py | 678 +++++++++--------- src/segger/validation/xenium_explorer.py | 282 +++++--- tests/test_data.py | 63 +- tests/test_model.py | 25 +- tests/test_prediction.py | 14 +- 
tests/test_training.py | 25 +- 52 files changed, 1855 insertions(+), 1807 deletions(-) diff --git "a/.github/ISSUE_TEMPLATE/\342\236\225-performance-improvement.md" "b/.github/ISSUE_TEMPLATE/\342\236\225-performance-improvement.md" index b281b2f..9b5cb1f 100644 --- "a/.github/ISSUE_TEMPLATE/\342\236\225-performance-improvement.md" +++ "b/.github/ISSUE_TEMPLATE/\342\236\225-performance-improvement.md" @@ -1,10 +1,9 @@ --- name: "➕ Performance Improvement" about: Suggest an improvement in the performance -title: '' -labels: '' +title: "" +labels: "" assignees: andrewmoorman, EliHei2 - --- **Describe the issue with the current implementation** diff --git "a/.github/ISSUE_TEMPLATE/\360\237\220\233-bug-report.md" "b/.github/ISSUE_TEMPLATE/\360\237\220\233-bug-report.md" index b899e5a..5809219 100644 --- "a/.github/ISSUE_TEMPLATE/\360\237\220\233-bug-report.md" +++ "b/.github/ISSUE_TEMPLATE/\360\237\220\233-bug-report.md" @@ -2,12 +2,12 @@ name: "\U0001F41B Bug Report" about: Create a report to help us improve title: "[BUG]" -labels: '' +labels: "" assignees: andrewmoorman, EliHei2 - --- --- + name: Bug Report about: Report a bug or unexpected behavior title: "[BUG] " @@ -21,6 +21,7 @@ A clear and concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior: + 1. Go to '...' 2. Click on '....' 3. Scroll down to '....' @@ -33,6 +34,7 @@ A clear and concise description of what you expected to happen. If applicable, add screenshots or logs to help explain your problem. **Environment (please complete the following information):** + - OS: [e.g. macOS, Windows, Linux] - Python version: [e.g. 3.9] - Package version: [e.g. 1.2.3] diff --git "a/.github/ISSUE_TEMPLATE/\360\237\232\200-feature-request.md" "b/.github/ISSUE_TEMPLATE/\360\237\232\200-feature-request.md" index 08679f6..67644f2 100644 --- "a/.github/ISSUE_TEMPLATE/\360\237\232\200-feature-request.md" +++ "b/.github/ISSUE_TEMPLATE/\360\237\232\200-feature-request.md" @@ -1,10 +1,9 @@ --- name: "\U0001F680 Feature Request" about: Suggest an idea for this project -title: '' -labels: '' -assignees: '' - +title: "" +labels: "" +assignees: "" --- **Is your feature request related to a problem? 
Please describe.** diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index b7a704b..c16ebea 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -17,23 +17,22 @@ permissions: jobs: deploy: - runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v3 - with: - python-version: '3.x' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build - - name: Build package - run: python -m build - - name: Publish package - uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 - with: - user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: "3.x" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index b6a7e3a..146ad51 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -25,7 +25,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: "3.10" - name: Install package and documentation dependencies run: | diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 781c996..a1e1760 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,22 +1,22 @@ fail_fast: false default_language_version: - python: python3 + python: python3 default_stages: - - commit - - push + - commit + - push minimum_pre_commit_version: 2.16.0 ci: - skip: [] + skip: [] repos: - - repo: https://github.com/psf/black - rev: 24.8.0 - hooks: - - id: black - - repo: https://github.com/pre-commit/mirrors-prettier - rev: v4.0.0-alpha.8 - hooks: - - id: prettier - - repo: https://github.com/asottile/blacken-docs - rev: 1.18.0 - hooks: - - id: blacken-docs + - repo: https://github.com/psf/black + rev: 24.8.0 + hooks: + - id: black + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v4.0.0-alpha.8 + hooks: + - id: prettier + - repo: https://github.com/asottile/blacken-docs + rev: 1.18.0 + hooks: + - id: blacken-docs diff --git a/.scripts/create_dataset.py b/.scripts/create_dataset.py index 27de8af..91ca6db 100644 --- a/.scripts/create_dataset.py +++ b/.scripts/create_dataset.py @@ -30,9 +30,7 @@ def main(args): download_file(transcripts_url, transcripts_path) download_file(nuclei_url, nuclei_path) - xs = XeniumSample().load_transcripts( - path=transcripts_path, min_qv=args.min_qv - ) + xs = XeniumSample().load_transcripts(path=transcripts_path, min_qv=args.min_qv) xs.load_nuclei(path=nuclei_path) if args.parallel: @@ -83,9 +81,7 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Create dataset from Xenium Human Pancreatic data." - ) + parser = argparse.ArgumentParser(description="Create dataset from Xenium Human Pancreatic data.") parser.add_argument( "--raw_data_dir", type=str, @@ -104,9 +100,7 @@ def main(args): required=True, help="URL for transcripts data.", ) - parser.add_argument( - "--nuclei_url", type=str, required=True, help="URL for nuclei data." 
- ) + parser.add_argument("--nuclei_url", type=str, required=True, help="URL for nuclei data.") parser.add_argument( "--min_qv", type=int, @@ -125,21 +119,11 @@ def main(args): default=180, help="Step size in y direction for tiles.", ) - parser.add_argument( - "--x_size", type=int, default=200, help="Width of each tile." - ) - parser.add_argument( - "--y_size", type=int, default=200, help="Height of each tile." - ) - parser.add_argument( - "--margin_x", type=int, default=None, help="Margin in x direction." - ) - parser.add_argument( - "--margin_y", type=int, default=None, help="Margin in y direction." - ) - parser.add_argument( - "--r_tx", type=int, default=3, help="Radius for building the graph." - ) + parser.add_argument("--x_size", type=int, default=200, help="Width of each tile.") + parser.add_argument("--y_size", type=int, default=200, help="Height of each tile.") + parser.add_argument("--margin_x", type=int, default=None, help="Margin in x direction.") + parser.add_argument("--margin_y", type=int, default=None, help="Margin in y direction.") + parser.add_argument("--r_tx", type=int, default=3, help="Radius for building the graph.") parser.add_argument( "--val_prob", type=float, @@ -158,9 +142,7 @@ def main(args): default=3, help="Number of nearest neighbors for nuclei.", ) - parser.add_argument( - "--dist_nc", type=int, default=10, help="Distance threshold for nuclei." - ) + parser.add_argument("--dist_nc", type=int, default=10, help="Distance threshold for nuclei.") parser.add_argument( "--k_tx", type=int, @@ -179,12 +161,8 @@ def main(args): default=True, help="Whether to compute edge labels.", ) - parser.add_argument( - "--sampling_rate", type=float, default=1, help="Rate of sampling tiles." - ) - parser.add_argument( - "--parallel", action="store_true", help="Use parallel processing." 
- ) + parser.add_argument("--sampling_rate", type=float, default=1, help="Rate of sampling tiles.") + parser.add_argument("--parallel", action="store_true", help="Use parallel processing.") parser.add_argument( "--num_workers", type=int, diff --git a/.scripts/predict.py b/.scripts/predict.py index f812822..9a095f4 100644 --- a/.scripts/predict.py +++ b/.scripts/predict.py @@ -30,9 +30,7 @@ def main(args: argparse.Namespace) -> None: if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Predict using the Segger model" - ) + parser = argparse.ArgumentParser(description="Predict using the Segger model") parser.add_argument( "--dataset_path", type=str, @@ -51,24 +49,16 @@ def main(args: argparse.Namespace) -> None: required=True, help="Path to the model checkpoint", ) - parser.add_argument( - "--init_emb", type=int, default=8, help="Initial embedding size" - ) + parser.add_argument("--init_emb", type=int, default=8, help="Initial embedding size") parser.add_argument( "--hidden_channels", type=int, default=64, help="Number of hidden channels", ) - parser.add_argument( - "--out_channels", type=int, default=16, help="Number of output channels" - ) - parser.add_argument( - "--heads", type=int, default=4, help="Number of attention heads" - ) - parser.add_argument( - "--aggr", type=str, default="sum", help="Aggregation method" - ) + parser.add_argument("--out_channels", type=int, default=16, help="Number of output channels") + parser.add_argument("--heads", type=int, default=4, help="Number of attention heads") + parser.add_argument("--aggr", type=str, default="sum", help="Aggregation method") parser.add_argument( "--score_cut", type=float, @@ -81,9 +71,7 @@ def main(args: argparse.Namespace) -> None: default=4, help="Number of nearest neighbors for nuclei", ) - parser.add_argument( - "--dist_nc", type=int, default=20, help="Distance threshold for nuclei" - ) + parser.add_argument("--dist_nc", type=int, default=20, help="Distance threshold for nuclei") parser.add_argument( "--k_tx", type=int, diff --git a/.scripts/train_model.py b/.scripts/train_model.py index 8a6ee85..2515a71 100644 --- a/.scripts/train_model.py +++ b/.scripts/train_model.py @@ -95,39 +95,21 @@ def main(args): default=4, help="Batch size for validation", ) - parser.add_argument( - "--init_emb", type=int, default=8, help="Initial embedding size" - ) + parser.add_argument("--init_emb", type=int, default=8, help="Initial embedding size") parser.add_argument( "--hidden_channels", type=int, default=64, help="Number of hidden channels", ) - parser.add_argument( - "--out_channels", type=int, default=16, help="Number of output channels" - ) - parser.add_argument( - "--heads", type=int, default=4, help="Number of attention heads" - ) - parser.add_argument( - "--aggr", type=str, default="sum", help="Aggregation method" - ) - parser.add_argument( - "--accelerator", type=str, default="cuda", help="Type of accelerator" - ) - parser.add_argument( - "--strategy", type=str, default="auto", help="Training strategy" - ) - parser.add_argument( - "--precision", type=str, default="16-mixed", help="Precision mode" - ) - parser.add_argument( - "--devices", type=int, default=4, help="Number of devices" - ) - parser.add_argument( - "--epochs", type=int, default=100, help="Number of epochs" - ) + parser.add_argument("--out_channels", type=int, default=16, help="Number of output channels") + parser.add_argument("--heads", type=int, default=4, help="Number of attention heads") + parser.add_argument("--aggr", type=str, default="sum", 
help="Aggregation method") + parser.add_argument("--accelerator", type=str, default="cuda", help="Type of accelerator") + parser.add_argument("--strategy", type=str, default="auto", help="Training strategy") + parser.add_argument("--precision", type=str, default="16-mixed", help="Precision mode") + parser.add_argument("--devices", type=int, default=4, help="Number of devices") + parser.add_argument("--epochs", type=int, default=100, help="Number of epochs") parser.add_argument( "--default_root_dir", type=str, diff --git a/docs/notebooks/benchmark_bc.py b/docs/notebooks/benchmark_bc.py index 31ac0bd..8b9a3fc 100644 --- a/docs/notebooks/benchmark_bc.py +++ b/docs/notebooks/benchmark_bc.py @@ -8,54 +8,54 @@ from segger.validation.utils import * # Define paths and output directories -benchmarks_path = Path('/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc') -output_path = benchmarks_path / 'results+' -figures_path = output_path / 'figures' +benchmarks_path = Path("/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc") +output_path = benchmarks_path / "results+" +figures_path = output_path / "figures" figures_path.mkdir(parents=True, exist_ok=True) # Ensure the figures directory exists # Define colors for segmentation methods method_colors = { - 'segger': '#D55E00', - 'segger_n0': '#E69F00', - 'segger_n1': '#F0E442', - 'Baysor': '#0072B2', - '10X': '#009E73', - '10X-nucleus': '#CC79A7', - 'BIDCell': '#8B008B' + "segger": "#D55E00", + "segger_n0": "#E69F00", + "segger_n1": "#F0E442", + "Baysor": "#0072B2", + "10X": "#009E73", + "10X-nucleus": "#CC79A7", + "BIDCell": "#8B008B", } # Define colors for cell types major_colors = { - 'B-cells': '#d8f55e', - 'CAFs': '#532C8A', - 'Cancer Epithelial': '#C72228', - 'Endothelial': '#9e6762', - 'Myeloid': '#ffe012', - 'T-cells': '#3cb44b', - 'Normal Epithelial': '#0F4A9C', - 'PVL': '#c09d9a', - 'Plasmablasts': '#000075' + "B-cells": "#d8f55e", + "CAFs": "#532C8A", + "Cancer Epithelial": "#C72228", + "Endothelial": "#9e6762", + "Myeloid": "#ffe012", + "T-cells": "#3cb44b", + "Normal Epithelial": "#0F4A9C", + "PVL": "#c09d9a", + "Plasmablasts": "#000075", } # Define segmentation file paths segmentation_paths = { - 'segger': benchmarks_path / 'adata_segger.h5ad', - 'Baysor': benchmarks_path / 'adata_baysor.h5ad', - '10X': benchmarks_path / 'adata_10X.h5ad', - '10X-nucleus': benchmarks_path / 'adata_10X_nuc.h5ad', - 'BIDCell': benchmarks_path / 'adata_BIDCell.h5ad' + "segger": benchmarks_path / "adata_segger.h5ad", + "Baysor": benchmarks_path / "adata_baysor.h5ad", + "10X": benchmarks_path / "adata_10X.h5ad", + "10X-nucleus": benchmarks_path / "adata_10X_nuc.h5ad", + "BIDCell": benchmarks_path / "adata_BIDCell.h5ad", } # Load the segmentations and the scRNAseq data segmentations_dict = load_segmentations(segmentation_paths) segmentations_dict = {k: segmentations_dict[k] for k in method_colors.keys() if k in segmentations_dict} -scRNAseq_adata = sc.read(benchmarks_path / 'scRNAseq.h5ad') +scRNAseq_adata = sc.read(benchmarks_path / "scRNAseq.h5ad") # Generate general statistics plots plot_general_statistics_plots(segmentations_dict, figures_path, method_colors) # Find markers for scRNAseq data -markers = find_markers(scRNAseq_adata, cell_type_column='celltype_major', pos_percentile=30, neg_percentile=5) +markers = find_markers(scRNAseq_adata, cell_type_column="celltype_major", pos_percentile=30, neg_percentile=5) # Annotate spatial segmentations with scRNAseq reference data for method in 
segmentation_paths.keys(): @@ -68,9 +68,7 @@ # Find mutually exclusive genes based on scRNAseq data exclusive_gene_pairs = find_mutually_exclusive_genes( - adata=scRNAseq_adata, - markers=markers, - cell_type_column='celltype_major' + adata=scRNAseq_adata, markers=markers, cell_type_column="celltype_major" ) # Compute MECR for each segmentation method @@ -83,14 +81,12 @@ quantized_mecr_counts = {} for method in segmentations_dict.keys(): - if 'cell_area' in segmentations_dict[method].obs.columns: + if "cell_area" in segmentations_dict[method].obs.columns: quantized_mecr_area[method] = compute_quantized_mecr_area( - adata=segmentations_dict[method], - gene_pairs=exclusive_gene_pairs + adata=segmentations_dict[method], gene_pairs=exclusive_gene_pairs ) quantized_mecr_counts[method] = compute_quantized_mecr_counts( - adata=segmentations_dict[method], - gene_pairs=exclusive_gene_pairs + adata=segmentations_dict[method], gene_pairs=exclusive_gene_pairs ) # Plot MECR results @@ -99,26 +95,30 @@ plot_quantized_mecr_counts(quantized_mecr_counts, output_path=figures_path, palette=method_colors) # Filter segmentation methods for contamination analysis -new_segmentations_dict = {k: v for k, v in segmentations_dict.items() if k in ['segger', 'Baysor', '10X', '10X-nucleus', 'BIDCell']} +new_segmentations_dict = { + k: v for k, v in segmentations_dict.items() if k in ["segger", "Baysor", "10X", "10X-nucleus", "BIDCell"] +} # Compute contamination results contamination_results = {} for method, adata in new_segmentations_dict.items(): - if 'cell_centroid_x' in adata.obs.columns and 'cell_centroid_y' in adata.obs.columns: + if "cell_centroid_x" in adata.obs.columns and "cell_centroid_y" in adata.obs.columns: contamination_results[method] = calculate_contamination( adata=adata, markers=markers, # Assuming you have a dictionary of markers for cell types radius=15, n_neighs=20, - celltype_column='celltype_major', - num_cells=10000 + celltype_column="celltype_major", + num_cells=10000, ) # Prepare contamination data for boxplots boxplot_data = [] for method, df in contamination_results.items(): - melted_df = df.reset_index().melt(id_vars=['Source Cell Type'], var_name='Target Cell Type', value_name='Contamination') - melted_df['Segmentation Method'] = method + melted_df = df.reset_index().melt( + id_vars=["Source Cell Type"], var_name="Target Cell Type", value_name="Contamination" + ) + melted_df["Segmentation Method"] = method boxplot_data.append(melted_df) # Concatenate all contamination dataframes into one @@ -129,13 +129,13 @@ plot_contamination_boxplots(boxplot_data, output_path=figures_path, palette=method_colors) # Separate Segger into nucleus-positive and nucleus-negative cells -segmentations_dict['segger_n1'] = segmentations_dict['segger'][segmentations_dict['segger'].obs.has_nucleus] -segmentations_dict['segger_n0'] = segmentations_dict['segger'][~segmentations_dict['segger'].obs.has_nucleus] +segmentations_dict["segger_n1"] = segmentations_dict["segger"][segmentations_dict["segger"].obs.has_nucleus] +segmentations_dict["segger_n0"] = segmentations_dict["segger"][~segmentations_dict["segger"].obs.has_nucleus] # Compute clustering scores for all segmentation methods clustering_scores = {} for method, adata in segmentations_dict.items(): - ch_score, sh_score = compute_clustering_scores(adata, cell_type_column='celltype_major') + ch_score, sh_score = compute_clustering_scores(adata, cell_type_column="celltype_major") clustering_scores[method] = (ch_score, sh_score) # Plot UMAPs with clustering scores in 
the title @@ -143,20 +143,22 @@ # Compute neighborhood metrics for methods with spatial data for method, adata in segmentations_dict.items(): - if 'spatial' in list(adata.obsm.keys()): - compute_neighborhood_metrics(adata, radius=15, celltype_column='celltype_major') + if "spatial" in list(adata.obsm.keys()): + compute_neighborhood_metrics(adata, radius=15, celltype_column="celltype_major") # Prepare neighborhood entropy data for boxplots entropy_boxplot_data = [] for method, adata in segmentations_dict.items(): - if 'neighborhood_entropy' in adata.obs.columns: - entropy_df = pd.DataFrame({ - 'Cell Type': adata.obs['celltype_major'], - 'Neighborhood Entropy': adata.obs['neighborhood_entropy'], - 'Segmentation Method': method - }) + if "neighborhood_entropy" in adata.obs.columns: + entropy_df = pd.DataFrame( + { + "Cell Type": adata.obs["celltype_major"], + "Neighborhood Entropy": adata.obs["neighborhood_entropy"], + "Segmentation Method": method, + } + ) # Filter out NaN values, keeping only the subsetted cells - entropy_df = entropy_df.dropna(subset=['Neighborhood Entropy']) + entropy_df = entropy_df.dropna(subset=["Neighborhood Entropy"]) entropy_boxplot_data.append(entropy_df) # Concatenate all entropy dataframes into one @@ -166,7 +168,7 @@ plot_entropy_boxplots(entropy_boxplot_data, figures_path, palette=method_colors) # Find markers for sensitivity calculation -purified_markers = find_markers(scRNAseq_adata, 'celltype_major', pos_percentile=20, percentage=75) +purified_markers = find_markers(scRNAseq_adata, "celltype_major", pos_percentile=20, percentage=75) # Calculate sensitivity for each segmentation method sensitivity_results_per_method = {} @@ -178,11 +180,7 @@ sensitivity_boxplot_data = [] for method, sensitivity_results in sensitivity_results_per_method.items(): for cell_type, sensitivities in sensitivity_results.items(): - method_df = pd.DataFrame({ - 'Cell Type': cell_type, - 'Sensitivity': sensitivities, - 'Segmentation Method': method - }) + method_df = pd.DataFrame({"Cell Type": cell_type, "Sensitivity": sensitivities, "Segmentation Method": method}) sensitivity_boxplot_data.append(method_df) # Concatenate all sensitivity dataframes into one diff --git a/docs/source/conf.py b/docs/source/conf.py index 0cc5f81..17e7f88 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -6,23 +6,22 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -project = 'segger' -copyright = '2024, Elyas Heidari' -author = 'Elyas Heidari' -release = '0.01' +project = "segger" +copyright = "2024, Elyas Heidari" +author = "Elyas Heidari" +release = "0.01" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = [] -templates_path = ['_templates'] +templates_path = ["_templates"] exclude_patterns = [] - # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = 'alabaster' -html_static_path = ['_static'] +html_theme = "alabaster" +html_static_path = ["_static"] diff --git a/docs/user_guide/data_creation.md b/docs/user_guide/data_creation.md index 571f9ef..8d27140 100644 --- a/docs/user_guide/data_creation.md +++ b/docs/user_guide/data_creation.md @@ -140,19 +140,22 @@ from pathlib import Path import scanpy as sc # Set up the file paths 
-raw_data_dir = Path('/path/to/xenium_output') -processed_data_dir = Path('path/to/processed_files') +raw_data_dir = Path("/path/to/xenium_output") +processed_data_dir = Path("path/to/processed_files") sample_tag = "sample/tag" # Load scRNA-seq data using Scanpy and subsample for efficiency -scRNAseq_path = 'path/to/scRNAseq.h5ad' +scRNAseq_path = "path/to/scRNAseq.h5ad" scRNAseq = sc.read(scRNAseq_path) sc.pp.subsample(scRNAseq, fraction=0.1) # Calculate gene cell type abundance embedding from scRNA-seq data from segger.utils import calculate_gene_celltype_abundance_embedding -celltype_column = 'celltype_column' -gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(scRNAseq, celltype_column) + +celltype_column = "celltype_column" +gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding( + scRNAseq, celltype_column +) # Create a XeniumSample instance for spatial transcriptomics processing xenium_sample = XeniumSample() @@ -161,9 +164,9 @@ xenium_sample = XeniumSample() xenium_sample.load_transcripts( base_path=raw_data_dir, sample=sample_tag, - transcripts_filename='transcripts.parquet', + transcripts_filename="transcripts.parquet", file_format="parquet", - additional_embeddings={"cell_type_abundance": gene_celltype_abundance_embedding} + additional_embeddings={"cell_type_abundance": gene_celltype_abundance_embedding}, ) # Set the embedding to "cell_type_abundance" to use it in further processing @@ -171,7 +174,7 @@ xenium_sample.set_embedding("cell_type_abundance") # Load nuclei data to define boundaries nuclei_path = raw_data_dir / sample_tag / "nucleus_boundaries.parquet" -xenium_sample.load_boundaries(path=nuclei_path, file_format='parquet') +xenium_sample.load_boundaries(path=nuclei_path, file_format="parquet") # Build PyTorch Geometric (PyG) data from a tile of the dataset tile_pyg_data = xenium_sample.build_pyg_data_from_tile( @@ -180,7 +183,7 @@ tile_pyg_data = xenium_sample.build_pyg_data_from_tile( r_tx=20, k_tx=20, use_precomputed=False, - workers=1 + workers=1, ) # Save dataset in processed format for segmentation @@ -199,7 +202,7 @@ xenium_sample.save_dataset_for_segger( test_prob=0.2, neg_sampling_ratio_approx=5, sampling_rate=1, - num_workers=1 + num_workers=1, ) ``` @@ -210,8 +213,8 @@ from segger.data import MerscopeSample from pathlib import Path # Set up the file paths -raw_data_dir = Path('path/to/merscope_outputs') -processed_data_dir = Path('path/to/processed_files') +raw_data_dir = Path("path/to/merscope_outputs") +processed_data_dir = Path("path/to/processed_files") sample_tag = "sample_tag" # Create a MerscopeSample instance for spatial transcriptomics processing @@ -221,16 +224,18 @@ merscope_sample = MerscopeSample() merscope_sample.load_transcripts( base_path=raw_data_dir, sample=sample_tag, - transcripts_filename='transcripts.csv', - file_format='csv' + transcripts_filename="transcripts.csv", + file_format="csv", ) # Optionally load cell boundaries cell_boundaries_path = raw_data_dir / sample_tag / "cell_boundaries.parquet" -merscope_sample.load_boundaries(path=cell_boundaries_path, file_format='parquet') +merscope_sample.load_boundaries(path=cell_boundaries_path, file_format="parquet") # Filter transcripts based on specific criteria -filtered_transcripts = merscope_sample.filter_transcripts(merscope_sample.transcripts_df) +filtered_transcripts = merscope_sample.filter_transcripts( + merscope_sample.transcripts_df +) # Build PyTorch Geometric (PyG) data from a tile of the dataset tile_pyg_data = 
merscope_sample.build_pyg_data_from_tile( @@ -239,7 +244,7 @@ tile_pyg_data = merscope_sample.build_pyg_data_from_tile( r_tx=15, k_tx=15, use_precomputed=True, - workers=2 + workers=2, ) # Save dataset in processed format for segmentation @@ -258,6 +263,6 @@ merscope_sample.save_dataset_for_segger( test_prob=0.2, neg_sampling_ratio_approx=3, sampling_rate=1, - num_workers=2 + num_workers=2, ) ``` diff --git a/docs/user_guide/training.md b/docs/user_guide/training.md index 151fc66..8b78f0c 100644 --- a/docs/user_guide/training.md +++ b/docs/user_guide/training.md @@ -69,12 +69,12 @@ To instantiate and run the `segger` model: ```python model = segger( - num_tx_tokens=5000, # Number of unique 'tx' tokens - init_emb=32, # Initial embedding dimension - hidden_channels=64, # Number of hidden channels - num_mid_layers=2, # Number of middle layers - out_channels=128, # Number of output channels - heads=4 # Number of attention heads + num_tx_tokens=5000, # Number of unique 'tx' tokens + init_emb=32, # Initial embedding dimension + hidden_channels=64, # Number of hidden channels + num_mid_layers=2, # Number of middle layers + out_channels=128, # Number of output channels + heads=4, # Number of attention heads ) output = model(x, edge_index) diff --git a/scripts/create_data_sample.py b/scripts/create_data_sample.py index 8cdb137..3e37d23 100644 --- a/scripts/create_data_sample.py +++ b/scripts/create_data_sample.py @@ -8,16 +8,20 @@ from lightning.pytorch.plugins.environments import LightningEnvironment from matplotlib import pyplot as plt import seaborn as sns + # import pandas as pd from segger.data.utils import calculate_gene_celltype_abundance_embedding import scanpy as sc import os + # import Dask.DataFrame as dd -os.environ['DASK_DAEMON'] = 'False' +os.environ["DASK_DAEMON"] = "False" -xenium_data_dir = Path('/omics/odcf/analysis/OE0606_projects/oncolgy_data_exchange/20230831-pan-cns-TMA-Xenium/output-XETG00078__0010722__TMA_AKSI__20230831__151713/') -segger_data_dir = Path('./data_tidy/pyg_datasets/pan_cns_AKSI') +xenium_data_dir = Path( + "/omics/odcf/analysis/OE0606_projects/oncolgy_data_exchange/20230831-pan-cns-TMA-Xenium/output-XETG00078__0010722__TMA_AKSI__20230831__151713/" +) +segger_data_dir = Path("./data_tidy/pyg_datasets/pan_cns_AKSI") # models_dir = Path('./models/bc_embedding_1001') # scRNAseq_path = '/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad' @@ -31,13 +35,11 @@ # gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(scRNAseq, celltype_column) - - # Setup Xenium sample to create dataset -xs = XeniumSample(verbose=False) # , embedding_df=gene_celltype_abundance_embedding) +xs = XeniumSample(verbose=False) # , embedding_df=gene_celltype_abundance_embedding) xs.set_file_paths( - transcripts_path=xenium_data_dir / 'transcripts.parquet', - boundaries_path=xenium_data_dir / 'nucleus_boundaries.parquet', + transcripts_path=xenium_data_dir / "transcripts.parquet", + boundaries_path=xenium_data_dir / "nucleus_boundaries.parquet", ) # dd.read_parquet(transcripts_path[0]) @@ -59,8 +61,7 @@ k_tx=5, val_prob=0.3, test_prob=0.1, - num_workers=6 + num_workers=6, ) except AssertionError as err: - print(f'Dataset already exists at {segger_data_dir}') - + print(f"Dataset already exists at {segger_data_dir}") diff --git a/scripts/predict_model_sample.py b/scripts/predict_model_sample.py index 11c5e89..ef013ae 100644 --- a/scripts/predict_model_sample.py +++ b/scripts/predict_model_sample.py @@ -8,21 +8,22 @@ import dask.dataframe as dd 
import pandas as pd from pathlib import Path -os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' + +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" import cupy as cp from dask.distributed import Client, LocalCluster from dask_cuda import LocalCUDACluster import dask.dataframe as dd -segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_1001') -models_dir = Path('./models/bc_embedding_1001_small') -benchmarks_dir = Path('/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc') -transcripts_file = 'data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1/transcripts.parquet' +segger_data_dir = Path("./data_tidy/pyg_datasets/bc_embedding_1001") +models_dir = Path("./models/bc_embedding_1001_small") +benchmarks_dir = Path("/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc") +transcripts_file = "data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1/transcripts.parquet" # Initialize the Lightning data module dm = SeggerDataModule( data_dir=segger_data_dir, - batch_size=1, - num_workers=1, + batch_size=1, + num_workers=1, ) dm.setup() @@ -31,22 +32,22 @@ model_version = 0 # Load in latest checkpoint -model_path = models_dir / 'lightning_logs' / f'version_{model_version}' -model = load_model(model_path / 'checkpoints') +model_path = models_dir / "lightning_logs" / f"version_{model_version}" +model = load_model(model_path / "checkpoints") -receptive_field = {'k_bd': 4, 'dist_bd': 12,'k_tx': 5, 'dist_tx': 5} +receptive_field = {"k_bd": 4, "dist_bd": 12, "k_tx": 5, "dist_tx": 5} segment( model, dm, save_dir=benchmarks_dir, - seg_tag='segger_embedding_1001_0.5_cc', + seg_tag="segger_embedding_1001_0.5_cc", transcript_file=transcripts_file, - file_format='anndata', - receptive_field = receptive_field, + file_format="anndata", + receptive_field=receptive_field, min_transcripts=5, # max_transcripts=1500, - cell_id_col='segger_cell_id', + cell_id_col="segger_cell_id", use_cc=True, - knn_method='cuda' -) \ No newline at end of file + knn_method="cuda", +) diff --git a/scripts/train_model.py b/scripts/train_model.py index 7c25bc3..d94eda3 100644 --- a/scripts/train_model.py +++ b/scripts/train_model.py @@ -15,18 +15,20 @@ os.environ["PYTORCH_USE_CUDA_DSA"] = "1" os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + def check_and_create_raw_folder(directory): - raw_dir = directory / 'raw' + raw_dir = directory / "raw" if not raw_dir.exists(): raw_dir.mkdir(parents=True, exist_ok=True) - warnings.warn(f"'{raw_dir}' does not exist. Creating this dummy folder because SpatialTranscriptomicsDataset requires it.") + warnings.warn( + f"'{raw_dir}' does not exist. Creating this dummy folder because SpatialTranscriptomicsDataset requires it." 
+ ) + def main(args): # CONFIG - - - - sys.path.insert(0, os.path.abspath('../..')) + + sys.path.insert(0, os.path.abspath("../..")) # Paths TRAIN_DIR = Path(args.train_dir) @@ -47,9 +49,9 @@ def main(args): hidden_channels=args.hidden_channels, out_channels=args.out_channels, heads=args.heads, - num_mid_layers=args.mid_layers # mid_layers is now included + num_mid_layers=args.mid_layers, # mid_layers is now included ) - model = to_hetero(model, (['tx', 'bd'], [('tx', 'belongs', 'bd'), ('tx', 'neighbors', 'tx')]), aggr=args.aggr) + model = to_hetero(model, (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]), aggr=args.aggr) batch = train_ds[0] model.forward(batch.x_dict, batch.edge_index_dict) @@ -73,25 +75,35 @@ def main(args): # Train the model trainer.fit(litsegger, train_loader, val_loader) + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Train the Segger model") - parser.add_argument('--train_dir', type=str, required=True, help='Path to the training data directory') - parser.add_argument('--val_dir', type=str, required=True, help='Path to the validation data directory') - parser.add_argument('--batch_size_train', type=int, default=4, help='Batch size for training') - parser.add_argument('--batch_size_val', type=int, default=4, help='Batch size for validation') - parser.add_argument('--num_tx_tokens', type=int, default=500, help='Number of unique tx tokens for embedding') # num_tx_tokens default 500 - parser.add_argument('--init_emb', type=int, default=8, help='Initial embedding size') - parser.add_argument('--hidden_channels', type=int, default=64, help='Number of hidden channels') - parser.add_argument('--out_channels', type=int, default=16, help='Number of output channels') - parser.add_argument('--heads', type=int, default=4, help='Number of attention heads') - parser.add_argument('--mid_layers', type=int, default=1, help='Number of middle layers in the model') # mid_layers default 1 - parser.add_argument('--aggr', type=str, default='sum', help='Aggregation method') - parser.add_argument('--accelerator', type=str, default='cuda', help='Type of accelerator') - parser.add_argument('--strategy', type=str, default='auto', help='Training strategy') - parser.add_argument('--precision', type=str, default='16-mixed', help='Precision mode') - parser.add_argument('--devices', type=int, default=4, help='Number of devices') - parser.add_argument('--epochs', type=int, default=100, help='Number of epochs') - parser.add_argument('--default_root_dir', type=str, default='./models/pancreas', help='Default root directory for logs and checkpoints') + parser.add_argument("--train_dir", type=str, required=True, help="Path to the training data directory") + parser.add_argument("--val_dir", type=str, required=True, help="Path to the validation data directory") + parser.add_argument("--batch_size_train", type=int, default=4, help="Batch size for training") + parser.add_argument("--batch_size_val", type=int, default=4, help="Batch size for validation") + parser.add_argument( + "--num_tx_tokens", type=int, default=500, help="Number of unique tx tokens for embedding" + ) # num_tx_tokens default 500 + parser.add_argument("--init_emb", type=int, default=8, help="Initial embedding size") + parser.add_argument("--hidden_channels", type=int, default=64, help="Number of hidden channels") + parser.add_argument("--out_channels", type=int, default=16, help="Number of output channels") + parser.add_argument("--heads", type=int, default=4, help="Number of attention heads") + 
parser.add_argument( + "--mid_layers", type=int, default=1, help="Number of middle layers in the model" + ) # mid_layers default 1 + parser.add_argument("--aggr", type=str, default="sum", help="Aggregation method") + parser.add_argument("--accelerator", type=str, default="cuda", help="Type of accelerator") + parser.add_argument("--strategy", type=str, default="auto", help="Training strategy") + parser.add_argument("--precision", type=str, default="16-mixed", help="Precision mode") + parser.add_argument("--devices", type=int, default=4, help="Number of devices") + parser.add_argument("--epochs", type=int, default=100, help="Number of epochs") + parser.add_argument( + "--default_root_dir", + type=str, + default="./models/pancreas", + help="Default root directory for logs and checkpoints", + ) args = parser.parse_args() main(args) diff --git a/scripts/train_model_sample.py b/scripts/train_model_sample.py index ec3611a..8b834cc 100644 --- a/scripts/train_model_sample.py +++ b/scripts/train_model_sample.py @@ -8,19 +8,20 @@ from lightning.pytorch.plugins.environments import LightningEnvironment from matplotlib import pyplot as plt import seaborn as sns + # import pandas as pd from segger.data.utils import calculate_gene_celltype_abundance_embedding import scanpy as sc import os -segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_1001') -models_dir = Path('./models/bc_embedding_1001_small') +segger_data_dir = Path("./data_tidy/pyg_datasets/bc_embedding_1001") +models_dir = Path("./models/bc_embedding_1001_small") dm = SeggerDataModule( data_dir=segger_data_dir, - batch_size=4, - num_workers=2, + batch_size=4, + num_workers=2, ) dm.setup() @@ -33,17 +34,17 @@ out_channels=8, heads=2, num_mid_layers=2, - aggr='sum', + aggr="sum", metadata=metadata, ) # Initialize the Lightning trainer trainer = Trainer( - accelerator='cuda', - strategy='auto', - precision='16-mixed', - devices=4, - max_epochs=200, + accelerator="cuda", + strategy="auto", + precision="16-mixed", + devices=4, + max_epochs=200, default_root_dir=models_dir, logger=CSVLogger(models_dir), ) @@ -52,7 +53,4 @@ ls.forward(batch) -trainer.fit( - model=ls, - datamodule=dm -) \ No newline at end of file +trainer.fit(model=ls, datamodule=dm) diff --git a/src/segger/__init__.py b/src/segger/__init__.py index b186ac0..e7f4c44 100644 --- a/src/segger/__init__.py +++ b/src/segger/__init__.py @@ -5,4 +5,4 @@ from .models import * from .prediction import * from .training import * -from .validation import * \ No newline at end of file +from .validation import * diff --git a/src/segger/cli/cli.py b/src/segger/cli/cli.py index 9332d08..18715ee 100644 --- a/src/segger/cli/cli.py +++ b/src/segger/cli/cli.py @@ -3,12 +3,14 @@ from segger.cli.predict import predict import click + # Setup main CLI command @click.group(help="Command line interface for the Segger segmentation package") def segger(): pass + # Add sub-commands to main CLI commands segger.add_command(create_dataset) segger.add_command(train) -segger.add_command(predict) \ No newline at end of file +segger.add_command(predict) diff --git a/src/segger/cli/configs/train/default.yaml b/src/segger/cli/configs/train/default.yaml index b685eac..cf27fc1 100644 --- a/src/segger/cli/configs/train/default.yaml +++ b/src/segger/cli/configs/train/default.yaml @@ -44,7 +44,7 @@ num_workers: help: Number of workers for data loading. accelerator: type: str - default: 'cuda' + default: "cuda" help: Device type to use for training (e.g., "cuda", "cpu"). 
max_epochs: type: int @@ -56,9 +56,9 @@ devices: help: Number of devices (GPUs) to use. strategy: type: str - default: 'auto' + default: "auto" help: Training strategy for the trainer. precision: type: str - default: '16-mixed' + default: "16-mixed" help: Precision for training. diff --git a/src/segger/cli/create_dataset.py b/src/segger/cli/create_dataset.py index b22e1d7..f82e85b 100644 --- a/src/segger/cli/create_dataset.py +++ b/src/segger/cli/create_dataset.py @@ -8,37 +8,56 @@ import time # Path to default YAML configuration file -data_yml = Path(__file__).parent / 'configs' / 'create_dataset' / 'default.yaml' +data_yml = Path(__file__).parent / "configs" / "create_dataset" / "default.yaml" # CLI command to create a Segger dataset help_msg = "Create Segger dataset from spatial transcriptomics data (Xenium or MERSCOPE)" + + @click.command(name="create_dataset", help=help_msg) @add_options(config_path=data_yml) -@click.option('--dataset_dir', type=Path, required=True, help='Directory containing the raw dataset.') -@click.option('--data_dir', type=Path, required=True, help='Directory to save the processed Segger dataset.') -@click.option('--sample_tag', type=str, required=True, help='Sample tag for the dataset.') -@click.option('--transcripts_file', type=str, required=True, help='Name of the transcripts file.') -@click.option('--boundaries_file', type=str, required=True, help='Name of the boundaries file.') -@click.option('--x_size', type=int, default=300, help='Size of each tile in x-direction.') -@click.option('--y_size', type=int, default=300, help='Size of each tile in y-direction.') -@click.option('--d_x', type=int, default=280, help='Tile overlap in x-direction.') -@click.option('--d_y', type=int, default=280, help='Tile overlap in y-direction.') -@click.option('--margin_x', type=int, default=10, help='Margin in x-direction.') -@click.option('--margin_y', type=int, default=10, help='Margin in y-direction.') -@click.option('--r_tx', type=int, default=5, help='Radius for computing neighborhood graph.') -@click.option('--k_tx', type=int, default=5, help='Number of nearest neighbors for the neighborhood graph.') -@click.option('--val_prob', type=float, default=0.1, help='Validation data split proportion.') -@click.option('--test_prob', type=float, default=0.2, help='Test data split proportion.') -@click.option('--neg_sampling_ratio', type=float, default=5, help='Ratio for negative sampling.') -@click.option('--sampling_rate', type=float, default=1, help='Sampling rate for the dataset.') -@click.option('--workers', type=int, default=1, help='Number of workers for parallel processing.') -@click.option('--gpu', is_flag=True, default=False, help='Use GPU if available.') -def create_dataset(args: Namespace, dataset_dir: Path, data_dir: Path, sample_tag: str, - transcripts_file: str, boundaries_file: str, x_size: int, y_size: int, - d_x: int, d_y: int, margin_x: int, margin_y: int, r_tx: int, k_tx: int, - val_prob: float, test_prob: float, neg_sampling_ratio: float, - sampling_rate: float, workers: int, gpu: bool): - +@click.option("--dataset_dir", type=Path, required=True, help="Directory containing the raw dataset.") +@click.option("--data_dir", type=Path, required=True, help="Directory to save the processed Segger dataset.") +@click.option("--sample_tag", type=str, required=True, help="Sample tag for the dataset.") +@click.option("--transcripts_file", type=str, required=True, help="Name of the transcripts file.") +@click.option("--boundaries_file", type=str, required=True, help="Name of 
the boundaries file.") +@click.option("--x_size", type=int, default=300, help="Size of each tile in x-direction.") +@click.option("--y_size", type=int, default=300, help="Size of each tile in y-direction.") +@click.option("--d_x", type=int, default=280, help="Tile overlap in x-direction.") +@click.option("--d_y", type=int, default=280, help="Tile overlap in y-direction.") +@click.option("--margin_x", type=int, default=10, help="Margin in x-direction.") +@click.option("--margin_y", type=int, default=10, help="Margin in y-direction.") +@click.option("--r_tx", type=int, default=5, help="Radius for computing neighborhood graph.") +@click.option("--k_tx", type=int, default=5, help="Number of nearest neighbors for the neighborhood graph.") +@click.option("--val_prob", type=float, default=0.1, help="Validation data split proportion.") +@click.option("--test_prob", type=float, default=0.2, help="Test data split proportion.") +@click.option("--neg_sampling_ratio", type=float, default=5, help="Ratio for negative sampling.") +@click.option("--sampling_rate", type=float, default=1, help="Sampling rate for the dataset.") +@click.option("--workers", type=int, default=1, help="Number of workers for parallel processing.") +@click.option("--gpu", is_flag=True, default=False, help="Use GPU if available.") +def create_dataset( + args: Namespace, + dataset_dir: Path, + data_dir: Path, + sample_tag: str, + transcripts_file: str, + boundaries_file: str, + x_size: int, + y_size: int, + d_x: int, + d_y: int, + margin_x: int, + margin_y: int, + r_tx: int, + k_tx: int, + val_prob: float, + test_prob: float, + neg_sampling_ratio: float, + sampling_rate: float, + workers: int, + gpu: bool, +): + # Setup logging ch = logging.StreamHandler() ch.setLevel(logging.INFO) @@ -47,9 +66,9 @@ def create_dataset(args: Namespace, dataset_dir: Path, data_dir: Path, sample_ta # Initialize the appropriate sample class based on dataset type logging.info("Initializing sample...") - if args.dataset_type == 'xenium': + if args.dataset_type == "xenium": sample = XeniumSample() - elif args.dataset_type == 'merscope': + elif args.dataset_type == "merscope": sample = MerscopeSample() else: raise ValueError("Unsupported dataset type. 
Please choose 'xenium' or 'merscope'.") diff --git a/src/segger/cli/create_dataset_fast.py b/src/segger/cli/create_dataset_fast.py index 8e6e9ee..33a3a63 100644 --- a/src/segger/cli/create_dataset_fast.py +++ b/src/segger/cli/create_dataset_fast.py @@ -9,29 +9,42 @@ import time # Path to default YAML configuration file -data_yml = Path(__file__).parent / 'configs' / 'create_dataset' / 'default_fast.yaml' +data_yml = Path(__file__).parent / "configs" / "create_dataset" / "default_fast.yaml" # CLI command to create a Segger dataset help_msg = "Create Segger dataset from spatial transcriptomics data (Xenium or MERSCOPE)" + + @click.command(name="create_dataset", help=help_msg) @add_options(config_path=data_yml) -@click.option('--base_dir', type=Path, required=True, help='Directory containing the raw dataset.') -@click.option('--data_dir', type=Path, required=True, help='Directory to save the processed Segger dataset.') -@click.option('--sample_type', type=str, default=None, help='The sample type of the raw data, e.g., "xenium" or "merscope".') -@click.option('--k_bd', type=int, default=3, help='Number of nearest neighbors for boundary nodes.') -@click.option('--dist_bd', type=float, default=15., help='Maximum distance for boundary neighbors.') -@click.option('--k_tx', type=int, default=3, help='Number of nearest neighbors for transcript nodes.') -@click.option('--dist_tx', type=float, default=5., help='Maximum distance for transcript neighbors.') -@click.option('--tile_size', type=int, default=None, help='If provided, specifies the size of the tile. Overrides `tile_width` and `tile_height`.') -@click.option('--tile_width', type=int, default=None, help='Width of the tiles in pixels. Ignored if `tile_size` is provided.') -@click.option('--tile_height', type=int, default=None, help='Height of the tiles in pixels. Ignored if `tile_size` is provided.') -@click.option('--neg_sampling_ratio', type=float, default=5., help='Ratio of negative samples.') -@click.option('--frac', type=float, default=1., help='Fraction of the dataset to process.') -@click.option('--val_prob', type=float, default=0.1, help='Proportion of data for use for validation split.') -@click.option('--test_prob', type=float, default=0.2, help='Proportion of data for use for test split.') -@click.option('--n_workers', type=int, default=1, help='Number of workers for parallel processing.') +@click.option("--base_dir", type=Path, required=True, help="Directory containing the raw dataset.") +@click.option("--data_dir", type=Path, required=True, help="Directory to save the processed Segger dataset.") +@click.option( + "--sample_type", type=str, default=None, help='The sample type of the raw data, e.g., "xenium" or "merscope".' +) +@click.option("--k_bd", type=int, default=3, help="Number of nearest neighbors for boundary nodes.") +@click.option("--dist_bd", type=float, default=15.0, help="Maximum distance for boundary neighbors.") +@click.option("--k_tx", type=int, default=3, help="Number of nearest neighbors for transcript nodes.") +@click.option("--dist_tx", type=float, default=5.0, help="Maximum distance for transcript neighbors.") +@click.option( + "--tile_size", + type=int, + default=None, + help="If provided, specifies the size of the tile. Overrides `tile_width` and `tile_height`.", +) +@click.option( + "--tile_width", type=int, default=None, help="Width of the tiles in pixels. Ignored if `tile_size` is provided." +) +@click.option( + "--tile_height", type=int, default=None, help="Height of the tiles in pixels. 
Ignored if `tile_size` is provided." +) +@click.option("--neg_sampling_ratio", type=float, default=5.0, help="Ratio of negative samples.") +@click.option("--frac", type=float, default=1.0, help="Fraction of the dataset to process.") +@click.option("--val_prob", type=float, default=0.1, help="Proportion of data for use for validation split.") +@click.option("--test_prob", type=float, default=0.2, help="Proportion of data for use for test split.") +@click.option("--n_workers", type=int, default=1, help="Number of workers for parallel processing.") def create_dataset(args: Namespace): - + # Setup logging ch = logging.StreamHandler() ch.setLevel(logging.INFO) @@ -67,5 +80,6 @@ def create_dataset(args: Namespace): logging.info(f"Time to save dataset: {end_time - start_time} seconds") logging.info("Dataset saved successfully.") -if __name__ == '__main__': - create_dataset() \ No newline at end of file + +if __name__ == "__main__": + create_dataset() diff --git a/src/segger/cli/predict.py b/src/segger/cli/predict.py index eca5a4b..2cfe83e 100644 --- a/src/segger/cli/predict.py +++ b/src/segger/cli/predict.py @@ -5,34 +5,47 @@ import logging import os -os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" + @click.command(name="run_segmentation", help="Run the Segger segmentation model.") -@click.option('--segger_data_dir', type=Path, required=True, help='Directory containing the processed Segger dataset.') -@click.option('--models_dir', type=Path, required=True, help='Directory containing the trained models.') -@click.option('--benchmarks_dir', type=Path, required=True, help='Directory to save the segmentation results.') -@click.option('--transcripts_file', type=str, required=True, help='Path to the transcripts file.') -@click.option('--batch_size', type=int, default=1, help='Batch size for processing.') -@click.option('--num_workers', type=int, default=1, help='Number of workers for data loading.') -@click.option('--model_version', type=int, default=0, help='Model version to load.') -@click.option('--save_tag', type=str, default='segger_embedding_1001_0.5', help='Tag for saving segmentation results.') -@click.option('--min_transcripts', type=int, default=5, help='Minimum number of transcripts for segmentation.') -@click.option('--cell_id_col', type=str, default='segger_cell_id', help='Column name for cell IDs.') -@click.option('--use_cc', is_flag=True, default=False, help='Use connected components if specified.') -@click.option('--knn_method', type=str, default='cuda', help='Method for KNN computation.') -@click.option('--file_format', type=str, default='anndata', help='File format for output data.') -@click.option('--k_bd', type=int, default=4, help='K value for boundary computation.') -@click.option('--dist_bd', type=int, default=12, help='Distance for boundary computation.') -@click.option('--k_tx', type=int, default=5, help='K value for transcript computation.') -@click.option('--dist_tx', type=int, default=5, help='Distance for transcript computation.') -def run_segmentation(segger_data_dir: Path, models_dir: Path, benchmarks_dir: Path, - transcripts_file: str, batch_size: int = 1, num_workers: int = 1, - model_version: int = 0, save_tag: str = 'segger_embedding_1001_0.5', - min_transcripts: int = 5, cell_id_col: str = 'segger_cell_id', - use_cc: bool = False, knn_method: str = 'cuda', - file_format: str = 'anndata', k_bd: int = 4, dist_bd: int = 12, - k_tx: int = 5, dist_tx: int = 5): - 
+@click.option("--segger_data_dir", type=Path, required=True, help="Directory containing the processed Segger dataset.") +@click.option("--models_dir", type=Path, required=True, help="Directory containing the trained models.") +@click.option("--benchmarks_dir", type=Path, required=True, help="Directory to save the segmentation results.") +@click.option("--transcripts_file", type=str, required=True, help="Path to the transcripts file.") +@click.option("--batch_size", type=int, default=1, help="Batch size for processing.") +@click.option("--num_workers", type=int, default=1, help="Number of workers for data loading.") +@click.option("--model_version", type=int, default=0, help="Model version to load.") +@click.option("--save_tag", type=str, default="segger_embedding_1001_0.5", help="Tag for saving segmentation results.") +@click.option("--min_transcripts", type=int, default=5, help="Minimum number of transcripts for segmentation.") +@click.option("--cell_id_col", type=str, default="segger_cell_id", help="Column name for cell IDs.") +@click.option("--use_cc", is_flag=True, default=False, help="Use connected components if specified.") +@click.option("--knn_method", type=str, default="cuda", help="Method for KNN computation.") +@click.option("--file_format", type=str, default="anndata", help="File format for output data.") +@click.option("--k_bd", type=int, default=4, help="K value for boundary computation.") +@click.option("--dist_bd", type=int, default=12, help="Distance for boundary computation.") +@click.option("--k_tx", type=int, default=5, help="K value for transcript computation.") +@click.option("--dist_tx", type=int, default=5, help="Distance for transcript computation.") +def run_segmentation( + segger_data_dir: Path, + models_dir: Path, + benchmarks_dir: Path, + transcripts_file: str, + batch_size: int = 1, + num_workers: int = 1, + model_version: int = 0, + save_tag: str = "segger_embedding_1001_0.5", + min_transcripts: int = 5, + cell_id_col: str = "segger_cell_id", + use_cc: bool = False, + knn_method: str = "cuda", + file_format: str = "anndata", + k_bd: int = 4, + dist_bd: int = 12, + k_tx: int = 5, + dist_tx: int = 5, +): + # Setup logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -41,16 +54,16 @@ def run_segmentation(segger_data_dir: Path, models_dir: Path, benchmarks_dir: Pa # Initialize the Lightning data module dm = SeggerDataModule( data_dir=segger_data_dir, - batch_size=batch_size, - num_workers=num_workers, + batch_size=batch_size, + num_workers=num_workers, ) - + dm.setup() - + logger.info("Loading the model...") # Load in the latest checkpoint - model_path = models_dir / 'lightning_logs' / f'version_{model_version}' - model = load_model(model_path / 'checkpoints') + model_path = models_dir / "lightning_logs" / f"version_{model_version}" + model = load_model(model_path / "checkpoints") logger.info("Running segmentation...") segment( @@ -59,15 +72,16 @@ def run_segmentation(segger_data_dir: Path, models_dir: Path, benchmarks_dir: Pa save_dir=benchmarks_dir, seg_tag=save_tag, transcript_file=transcripts_file, - file_format=file_format, - receptive_field={'k_bd': k_bd, 'dist_bd': dist_bd, 'k_tx': k_tx, 'dist_tx': dist_tx}, + file_format=file_format, + receptive_field={"k_bd": k_bd, "dist_bd": dist_bd, "k_tx": k_tx, "dist_tx": dist_tx}, min_transcripts=min_transcripts, cell_id_col=cell_id_col, use_cc=use_cc, knn_method=knn_method, ) - + logger.info("Segmentation completed.") -if __name__ == '__main__': + +if __name__ == "__main__": 
run_segmentation() diff --git a/src/segger/cli/train_model.py b/src/segger/cli/train_model.py index 78bd5d7..a3a23a7 100644 --- a/src/segger/cli/train_model.py +++ b/src/segger/cli/train_model.py @@ -7,27 +7,33 @@ from argparse import Namespace # Path to default YAML configuration file -train_yml = Path(__file__).parent / 'configs' / 'train' / 'default.yaml' +train_yml = Path(__file__).parent / "configs" / "train" / "default.yaml" help_msg = "Train the Segger segmentation model." + + @click.command(name="train_model", help=help_msg) @add_options(config_path=train_yml) -@click.option('--dataset_dir', type=Path, required=True, help='Directory containing the processed Segger dataset.') -@click.option('--models_dir', type=Path, required=True, help='Directory to save the trained model and the training logs.') -@click.option('--sample_tag', type=str, required=True, help='Sample tag for the dataset.') -@click.option('--init_emb', type=int, default=8, help='Size of the embedding layer.') -@click.option('--hidden_channels', type=int, default=32, help='Size of hidden channels in the model.') -@click.option('--num_tx_tokens', type=int, default=500, help='Number of transcript tokens.') -@click.option('--out_channels', type=int, default=8, help='Number of output channels.') -@click.option('--heads', type=int, default=2, help='Number of attention heads.') -@click.option('--num_mid_layers', type=int, default=2, help='Number of mid layers in the model.') -@click.option('--batch_size', type=int, default=4, help='Batch size for training.') -@click.option('--num_workers', type=int, default=2, help='Number of workers for data loading.') -@click.option('--accelerator', type=str, default='cuda', help='Device type to use for training (e.g., "cuda", "cpu").') # Ask for accelerator -@click.option('--max_epochs', type=int, default=200, help='Number of epochs for training.') -@click.option('--devices', type=int, default=4, help='Number of devices (GPUs) to use.') -@click.option('--strategy', type=str, default='auto', help='Training strategy for the trainer.') -@click.option('--precision', type=str, default='16-mixed', help='Precision for training.') +@click.option("--dataset_dir", type=Path, required=True, help="Directory containing the processed Segger dataset.") +@click.option( + "--models_dir", type=Path, required=True, help="Directory to save the trained model and the training logs." +) +@click.option("--sample_tag", type=str, required=True, help="Sample tag for the dataset.") +@click.option("--init_emb", type=int, default=8, help="Size of the embedding layer.") +@click.option("--hidden_channels", type=int, default=32, help="Size of hidden channels in the model.") +@click.option("--num_tx_tokens", type=int, default=500, help="Number of transcript tokens.") +@click.option("--out_channels", type=int, default=8, help="Number of output channels.") +@click.option("--heads", type=int, default=2, help="Number of attention heads.") +@click.option("--num_mid_layers", type=int, default=2, help="Number of mid layers in the model.") +@click.option("--batch_size", type=int, default=4, help="Batch size for training.") +@click.option("--num_workers", type=int, default=2, help="Number of workers for data loading.") +@click.option( + "--accelerator", type=str, default="cuda", help='Device type to use for training (e.g., "cuda", "cpu").' 
+) # Ask for accelerator +@click.option("--max_epochs", type=int, default=200, help="Number of epochs for training.") +@click.option("--devices", type=int, default=4, help="Number of devices (GPUs) to use.") +@click.option("--strategy", type=str, default="auto", help="Training strategy for the trainer.") +@click.option("--precision", type=str, default="16-mixed", help="Precision for training.") def train_model(args: Namespace): # Setup logging @@ -43,6 +49,7 @@ def train_model(args: Namespace): from segger.training.segger_data_module import SeggerDataModule from lightning.pytorch.loggers import CSVLogger from pytorch_lightning import Trainer + logging.info("Done.") # Load datasets @@ -66,7 +73,7 @@ def train_model(args: Namespace): out_channels=args.out_channels, # Hard-coded value heads=args.heads, # Hard-coded value num_mid_layers=args.num_mid_layers, # Hard-coded value - aggr='sum', # Hard-coded value + aggr="sum", # Hard-coded value metadata=metadata, ) @@ -80,15 +87,12 @@ def train_model(args: Namespace): default_root_dir=args.models_dir, logger=CSVLogger(args.models_dir), ) - + logging.info("Done.") # Train model logging.info("Training model...") - trainer.fit( - model=ls, - datamodule=dm - ) + trainer.fit(model=ls, datamodule=dm) logging.info("Done.") @@ -97,11 +101,13 @@ def train_model(args: Namespace): def train_slurm(args): train_model(args) + @click.group(help="Train the Segger model") def train(): pass + train.add_command(train_slurm) -if __name__ == '__main__': - train_model() \ No newline at end of file +if __name__ == "__main__": + train_model() diff --git a/src/segger/cli/utils.py b/src/segger/cli/utils.py index 2a38610..df6e816 100644 --- a/src/segger/cli/utils.py +++ b/src/segger/cli/utils.py @@ -12,11 +12,11 @@ def add_options( show_default: bool = True, ): """ - A decorator to add command-line options to a Click command from a YAML + A decorator to add command-line options to a Click command from a YAML configuration file. Parameters: - config_path (os.PathLike): The path to the YAML configuration file + config_path (os.PathLike): The path to the YAML configuration file containing the options. show_default (bool): Whether to show default values in help. @@ -26,7 +26,7 @@ def add_options( The YAML configuration file should have the following format: ``` option_name: - type: "type_name" # Optional, the type of the option + type: "type_name" # Optional, the type of the option (e.g., "str", "int") help: "description" # Optional, the help text for the option default: value # Optional, the default value for the option @@ -52,24 +52,23 @@ def greet(args): click.echo(f"Hello, {args.name}! 
You are {args.age} years old.") ``` """ - def decorator( - function: typing.Callable - ): + + def decorator(function: typing.Callable): # Wrap the original function to convert kwargs to a Namespace object def wrapper(**kwargs): args_namespace = Namespace(**kwargs) return function(args_namespace) - + # Load the YAML configuration file - with open(config_path, 'r') as file: + with open(config_path, "r") as file: config = yaml.safe_load(file.read()) # Decorate function with all options for name, kwargs in reversed(config.items()): - kwargs['show_default'] = show_default - if 'type' in kwargs: - kwargs['type'] = locate(kwargs['type']) - wrapper = click.option(f'--{name}', **kwargs)(wrapper) + kwargs["show_default"] = show_default + if "type" in kwargs: + kwargs["type"] = locate(kwargs["type"]) + wrapper = click.option(f"--{name}", **kwargs)(wrapper) return wrapper @@ -87,31 +86,32 @@ class CustomFormatter(logging.Formatter): bold_red (str): ANSI escape code for bold red color. reset (str): ANSI escape code to reset color. format (str): The format string for log messages. - FORMATS (dict): A dictionary mapping log levels to their respective + FORMATS (dict): A dictionary mapping log levels to their respective color-coded format strings. Methods: format(record): - Format the specified record as text, applying color codes based on the + Format the specified record as text, applying color codes based on the log level. """ + grey = "\x1b[38;20m" green = "\x1b[32;20m" yellow = "\x1b[33;20m" red = "\x1b[31;20m" bold_red = "\x1b[31;1m" reset = "\x1b[0m" - format='%(asctime)s %(levelname)s: %(message)s' + format = "%(asctime)s %(levelname)s: %(message)s" FORMATS = { logging.DEBUG: grey + format + reset, logging.INFO: green + format + reset, logging.WARNING: yellow + format + reset, logging.ERROR: red + format + reset, - logging.CRITICAL: bold_red + format + reset + logging.CRITICAL: bold_red + format + reset, } def format(self, record): log_fmt = self.FORMATS.get(record.levelno) formatter = logging.Formatter(log_fmt) - return formatter.format(record) \ No newline at end of file + return formatter.format(record) diff --git a/src/segger/data/README.md b/src/segger/data/README.md index df6e979..28d7df0 100644 --- a/src/segger/data/README.md +++ b/src/segger/data/README.md @@ -1,6 +1,6 @@ # segger - Data Preparation for Cell Segmentation -The `segger` package provides a comprehensive data preparation module for handling and processing spatial transcriptomics data, specifically designed to support **Xenium** and **Merscope** datasets. This module facilitates the creation of datasets for cell segmentation and subsequent graph-based deep learning tasks by leveraging scalable and efficient processing tools. +The `segger` package provides a comprehensive data preparation module for handling and processing spatial transcriptomics data, specifically designed to support **Xenium** and **Merscope** datasets. This module facilitates the creation of datasets for cell segmentation and subsequent graph-based deep learning tasks by leveraging scalable and efficient processing tools. ## Module Overview @@ -48,7 +48,6 @@ These classes inherit from `SpatialTranscriptomicsSample` and implement dataset- - **`XeniumSample`**: Tailored for **Xenium** datasets, it includes specific filtering rules to exclude unwanted transcripts based on naming patterns (e.g., `NegControlProbe_`, `BLANK_`). - **`MerscopeSample`**: Designed for **Merscope** datasets, allowing for custom filtering and processing logic as needed. 
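As a rough illustration of the kind of filtering these sample classes apply (a minimal sketch, not the package's implementation; the column names `feature_name` and `qv` and the default threshold are assumptions based on the Xenium transcript table):

```python
import pandas as pd

def filter_xenium_transcripts(df: pd.DataFrame, min_qv: float = 20.0) -> pd.DataFrame:
    """Drop control/blank codewords and low-quality calls, similar in spirit
    to XeniumSample.filter_transcripts."""
    control_prefixes = (
        "NegControlProbe_",
        "antisense_",
        "NegControlCodeword_",
        "BLANK_",
        "DeprecatedCodeword_",
        "UnassignedCodeword_",
    )
    keep = ~df["feature_name"].str.startswith(control_prefixes) & (df["qv"] >= min_qv)
    return df[keep]
```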
- ## Workflow The dataset creation and processing workflow involves several key steps, each ensuring that the spatial transcriptomics data is appropriately prepared for downstream machine learning tasks. @@ -61,39 +60,42 @@ The dataset creation and processing workflow involves several key steps, each en ### Step 2: Tiling - **Spatial Segmentation**: The dataset is divided into smaller, manageable tiles of size $$x_{\text{size}} \times y_{\text{size}}$$, defined by their top-left corner coordinates $$(x_i, y_j)$$. - + $$ n_x = \left\lfloor \frac{x_{\text{max}} - x_{\text{min}}}{d_x} \right\rfloor, \quad n_y = \left\lfloor \frac{y_{\text{max}} - y_{\text{min}}}{d_y} \right\rfloor $$ - - Where: - - $$x_{\text{min}}, y_{\text{min}}$$: Minimum spatial coordinates. - - $$x_{\text{max}}, y_{\text{max}}$$: Maximum spatial coordinates. - - $$d_x, d_y$$: Step sizes along the $$x$$- and $$y$$-axes, respectively. + +Where: + +- $$x_{\text{min}}, y_{\text{min}}$$: Minimum spatial coordinates. +- $$x_{\text{max}}, y_{\text{max}}$$: Maximum spatial coordinates. +- $$d_x, d_y$$: Step sizes along the $$x$$- and $$y$$-axes, respectively. - **Transcript and Boundary Inclusion**: For each tile, transcripts and boundaries within the spatial bounds (with optional margins) are included: - -$$ -x_i - \text{margin}_x \leq x_t < x_i + x_{\text{size}} + \text{margin}_x, \quad y_j - \text{margin}_y \leq y_t < y_j + y_{\text{size}} + \text{margin}_y + +$$ +x_i - \text{margin}_x \leq x_t < x_i + x_{\text{size}} + \text{margin}_x, \quad y_j - \text{margin}_y \leq y_t < y_j + y_{\text{size}} + \text{margin}_y $$ - - Where: - - $$x_t, y_t$$: Transcript coordinates. - - $$\text{margin}_x, \text{margin}_y$$: Optional margins to include contextual data. + +Where: + +- $$x_t, y_t$$: Transcript coordinates. +- $$\text{margin}_x, \text{margin}_y$$: Optional margins to include contextual data. ### Step 3: Graph Construction For each tile, a graph $$G$$ is constructed with: - **Nodes ($$V$$)**: + - **Transcripts**: Represented by their spatial coordinates $$(x_t, y_t)$$ and feature vectors $$\mathbf{f}_t$$. - **Boundaries**: Represented by centroid coordinates $$(x_b, y_b)$$ and associated properties (e.g., area). - **Edges ($$E$$)**: - Created based on spatial proximity using methods like KD-Tree or FAISS. - Defined by a distance threshold $$d$$ and the number of nearest neighbors $$k$$: - -$$ + +$$ E = \{ (v_i, v_j) \mid \text{dist}(v_i, v_j) < d, \, v_i \in V, \, v_j \in V \} $$ @@ -102,7 +104,7 @@ $$ If enabled, edges can be labeled based on relationships, such as whether a transcript belongs to a boundary: $$ -\text{label}(t, b) = +\text{label}(t, b) = \begin{cases} 1 & \text{if } t \text{ belongs to } b \\ 0 & \text{otherwise} @@ -123,7 +125,6 @@ Each tile is randomly assigned to one of these sets according to the specified p The final output consists of a set of tiles, each containing a graph representation of the spatial transcriptomics data. These tiles are stored in designated directories (`train_tiles`, `val_tiles`, `test_tiles`) and are ready for integration into machine learning pipelines. - ## Example Usage Below are examples demonstrating how to utilize the `segger` data preparation module for both Xenium and Merscope datasets. 
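To make the tile layout and neighbor-graph construction described above concrete, here is a minimal sketch (assumptions: the coordinate bounds and the `d_x`, `d_y`, `r_tx`, `k_tx` names mirror the parameters above, the coordinates are random placeholders, and SciPy's `cKDTree` stands in for the KD-Tree/FAISS backends used internally):

```python
import numpy as np
from scipy.spatial import cKDTree

# Tile origins: n_x = floor((x_max - x_min) / d_x), likewise for y
x_min, x_max, y_min, y_max = 0.0, 1000.0, 0.0, 800.0
d_x = d_y = 280
x_origins = np.arange(x_min, x_max, d_x)
y_origins = np.arange(y_min, y_max, d_y)
tiles = [(xi, yj) for xi in x_origins for yj in y_origins]

# Transcript-transcript edges for one tile: keep pairs within r_tx,
# at most k_tx neighbors per transcript
rng = np.random.default_rng(0)
tx_xy = rng.uniform(0, 300, size=(500, 2))       # placeholder coordinates
r_tx, k_tx = 5.0, 5
tree = cKDTree(tx_xy)
dists, nbrs = tree.query(tx_xy, k=k_tx + 1, distance_upper_bound=r_tx)
edges = [
    (i, j)
    for i, (drow, jrow) in enumerate(zip(dists, nbrs))
    for d, j in zip(drow[1:], jrow[1:])           # skip the self-match at position 0
    if np.isfinite(d)                             # inf marks "no neighbor within r_tx"
]
edge_index = np.asarray(edges).T                  # shape (2, num_edges), PyG convention
```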
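The transcript-to-boundary labels follow the same point-in-polygon idea; a minimal sketch with shapely (the boundary polygon and transcript coordinates below are placeholders):

```python
from shapely.geometry import Point, Polygon

nucleus = Polygon([(0, 0), (10, 0), (10, 10), (0, 10)])   # placeholder boundary
transcripts = [(2.0, 3.0), (15.0, 4.0)]

# label(t, b) = 1 if transcript t falls inside boundary b, else 0
labels = [int(nucleus.contains(Point(x, y))) for x, y in transcripts]
print(labels)  # [1, 0]
```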
@@ -137,25 +138,29 @@ from segger.data.utils import calculate_gene_celltype_abundance_embedding import scanpy as sc import os -xenium_data_dir = Path('./data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1') -segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_0919') -models_dir = Path('./models/bc_embedding_0919') +xenium_data_dir = Path("./data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1") +segger_data_dir = Path("./data_tidy/pyg_datasets/bc_embedding_0919") +models_dir = Path("./models/bc_embedding_0919") -scRNAseq_path = '/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad' +scRNAseq_path = ( + "/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad" +) scRNAseq = sc.read(scRNAseq_path) sc.pp.subsample(scRNAseq, 0.1) # Step 1: Calculate the gene cell type abundance embedding -celltype_column = 'celltype_minor' -gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(scRNAseq, celltype_column) +celltype_column = "celltype_minor" +gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding( + scRNAseq, celltype_column +) # Setup Xenium sample to create dataset -xs = XeniumSample(verbose=False , embedding_df=gene_celltype_abundance_embedding) +xs = XeniumSample(verbose=False, embedding_df=gene_celltype_abundance_embedding) xs.set_file_paths( - transcripts_path=xenium_data_dir / 'transcripts.parquet', - boundaries_path=xenium_data_dir / 'nucleus_boundaries.parquet', + transcripts_path=xenium_data_dir / "transcripts.parquet", + boundaries_path=xenium_data_dir / "nucleus_boundaries.parquet", ) xs.set_metadata() @@ -164,7 +169,7 @@ xenium_sample.set_embedding("cell_type_abundance") # Load nuclei data to define boundaries nuclei_path = raw_data_dir / sample_tag / "nucleus_boundaries.parquet" -xenium_sample.load_boundaries(path=nuclei_path, file_format='parquet') +xenium_sample.load_boundaries(path=nuclei_path, file_format="parquet") # Build PyTorch Geometric (PyG) data from a tile of the dataset tile_pyg_data = xenium_sample.build_pyg_data_from_tile( @@ -173,7 +178,7 @@ tile_pyg_data = xenium_sample.build_pyg_data_from_tile( r_tx=20, k_tx=20, use_precomputed=False, - workers=1 + workers=1, ) @@ -191,10 +196,10 @@ try: k_tx=10, val_prob=0.4, test_prob=0.1, - num_workers=6 + num_workers=6, ) except AssertionError as err: - print(f'Dataset already exists at {segger_data_dir}') + print(f"Dataset already exists at {segger_data_dir}") ``` ### Merscope Data @@ -204,8 +209,8 @@ from segger.data import MerscopeSample from pathlib import Path # Set up the file paths -raw_data_dir = Path('data_raw/merscope/') -processed_data_dir = Path('data_tidy/pyg_datasets') +raw_data_dir = Path("data_raw/merscope/") +processed_data_dir = Path("data_tidy/pyg_datasets") sample_tag = "Merscope_Sample_1" # Create a MerscopeSample instance for spatial transcriptomics processing @@ -215,16 +220,18 @@ merscope_sample = MerscopeSample() merscope_sample.load_transcripts( base_path=raw_data_dir, sample=sample_tag, - transcripts_filename='transcripts.csv', - file_format='csv' + transcripts_filename="transcripts.csv", + file_format="csv", ) # Optionally load cell boundaries cell_boundaries_path = raw_data_dir / sample_tag / "cell_boundaries.parquet" -merscope_sample.load_boundaries(path=cell_boundaries_path, file_format='parquet') +merscope_sample.load_boundaries(path=cell_boundaries_path, file_format="parquet") # Filter transcripts based on specific criteria -filtered_transcripts = 
merscope_sample.filter_transcripts(merscope_sample.transcripts_df) +filtered_transcripts = merscope_sample.filter_transcripts( + merscope_sample.transcripts_df +) # Build PyTorch Geometric (PyG) data from a tile of the dataset tile_pyg_data = merscope_sample.build_pyg_data_from_tile( @@ -233,12 +240,12 @@ tile_pyg_data = merscope_sample.build_pyg_data_from_tile( r_tx=15, k_tx=15, use_precomputed=True, - workers=2 + workers=2, ) # Save dataset in processed format for segmentation merscope_sample.save_dataset_for_segger( - processed_dir=processed_data_dir / 'embedding', + processed_dir=processed_data_dir / "embedding", x_size=360, y_size=360, d_x=180, @@ -252,6 +259,6 @@ merscope_sample.save_dataset_for_segger( test_prob=0.2, neg_sampling_ratio_approx=3, sampling_rate=1, - num_workers=2 + num_workers=2, ) ``` diff --git a/src/segger/data/__init__.py b/src/segger/data/__init__.py index 1d60059..380a815 100644 --- a/src/segger/data/__init__.py +++ b/src/segger/data/__init__.py @@ -5,35 +5,30 @@ """ __all__ = [ - "XeniumSample", - "MerscopeSample", - "SpatialTranscriptomicsDataset", - "filter_transcripts", - "create_anndata", - "compute_transcript_metrics", + "XeniumSample", + "MerscopeSample", + "SpatialTranscriptomicsDataset", + "filter_transcripts", + "create_anndata", + "compute_transcript_metrics", "SpatialTranscriptomicsSample", "calculate_gene_celltype_abundance_embedding", "get_edge_index", ] from .utils import ( - filter_transcripts, - create_anndata, - compute_transcript_metrics, - get_edge_index, + filter_transcripts, + create_anndata, + compute_transcript_metrics, + get_edge_index, calculate_gene_celltype_abundance_embedding, - SpatialTranscriptomicsDataset + SpatialTranscriptomicsDataset, ) from .io import ( - XeniumSample, - MerscopeSample, + XeniumSample, + MerscopeSample, SpatialTranscriptomicsSample, ) -from .constants import ( - SpatialTranscriptomicsKeys, - XeniumKeys, - MerscopeKeys -) - +from .constants import SpatialTranscriptomicsKeys, XeniumKeys, MerscopeKeys diff --git a/src/segger/data/constants.py b/src/segger/data/constants.py index 7cd1fb6..b48350f 100644 --- a/src/segger/data/constants.py +++ b/src/segger/data/constants.py @@ -1,5 +1,6 @@ from enum import Enum, auto + class SpatialTranscriptomicsKeys(Enum): """Unified keys for spatial transcriptomics data, supporting multiple platforms.""" @@ -7,11 +8,11 @@ class SpatialTranscriptomicsKeys(Enum): TRANSCRIPTS_FILE = auto() BOUNDARIES_FILE = auto() CELL_METADATA_FILE = auto() - + # Cell identifiers CELL_ID = auto() TRANSCRIPTS_ID = auto() - + # Coordinates and locations TRANSCRIPTS_X = auto() TRANSCRIPTS_Y = auto() @@ -19,7 +20,7 @@ class SpatialTranscriptomicsKeys(Enum): BOUNDARIES_VERTEX_Y = auto() GLOBAL_X = auto() GLOBAL_Y = auto() - + # Metadata METADATA_CELL_KEY = auto() COUNTS_CELL_KEY = auto() diff --git a/src/segger/data/io.py b/src/segger/data/io.py index a369b9f..fdfa059 100644 --- a/src/segger/data/io.py +++ b/src/segger/data/io.py @@ -30,11 +30,8 @@ import logging import warnings -for msg in [ - r".*Geometry is in a geographic CRS.*", - r".*You did not provide metadata.*" -]: - warnings.filterwarnings('ignore', category=UserWarning, message=msg) +for msg in [r".*Geometry is in a geographic CRS.*", r".*You did not provide metadata.*"]: + warnings.filterwarnings("ignore", category=UserWarning, message=msg) class SpatialTranscriptomicsSample(ABC): @@ -60,10 +57,8 @@ def __init__( self.boundaries_graph = boundaries_graph self.keys = keys self.embedding_df = embedding_df - self.current_embedding = 'token' + 
self.current_embedding = "token" self.verbose = verbose - - @abstractmethod def filter_transcripts(self, transcripts_df: pd.DataFrame, min_qv: float = 20.0) -> pd.DataFrame: @@ -78,8 +73,7 @@ def filter_transcripts(self, transcripts_df: pd.DataFrame, min_qv: float = 20.0) pd.DataFrame: The filtered dataframe. """ pass - - + def set_file_paths(self, transcripts_path: Path, boundaries_path: Path) -> None: """ Set the paths for the transcript and boundary files. @@ -90,10 +84,11 @@ def set_file_paths(self, transcripts_path: Path, boundaries_path: Path) -> None: """ self.transcripts_path = transcripts_path self.boundaries_path = boundaries_path - - if self.verbose: print(f"Set transcripts file path to {transcripts_path}") - if self.verbose: print(f"Set boundaries file path to {boundaries_path}") + if self.verbose: + print(f"Set transcripts file path to {transcripts_path}") + if self.verbose: + print(f"Set boundaries file path to {boundaries_path}") def load_transcripts( self, @@ -153,22 +148,22 @@ def load_transcripts( self.keys.TRANSCRIPTS_X.value, self.keys.TRANSCRIPTS_Y.value, self.keys.FEATURE_NAME.value, - self.keys.CELL_ID.value + self.keys.CELL_ID.value, ] # Check if the QUALITY_VALUE key exists in the dataset, and add it to the columns list if present if self.keys.QUALITY_VALUE.value in available_columns: columns_to_read.append(self.keys.QUALITY_VALUE.value) - + if self.keys.OVERLAPS_BOUNDARY.value in available_columns: columns_to_read.append(self.keys.OVERLAPS_BOUNDARY.value) # Use filters to only load data within the specified bounding box (x_min, x_max, y_min, y_max) filters = [ - (self.keys.TRANSCRIPTS_X.value, '>=', x_min), - (self.keys.TRANSCRIPTS_X.value, '<=', x_max), - (self.keys.TRANSCRIPTS_Y.value, '>=', y_min), - (self.keys.TRANSCRIPTS_Y.value, '<=', y_max) + (self.keys.TRANSCRIPTS_X.value, ">=", x_min), + (self.keys.TRANSCRIPTS_X.value, "<=", x_max), + (self.keys.TRANSCRIPTS_Y.value, ">=", y_min), + (self.keys.TRANSCRIPTS_Y.value, "<=", y_max), ] # Load the dataset lazily with filters applied for the bounding box @@ -195,27 +190,27 @@ def load_transcripts( # Lazily count the number of rows in the DataFrame before filtering initial_count = delayed(lambda df: df.shape[0])(transcripts_df) # Filter the DataFrame lazily based on valid genes from embeddings - transcripts_df = transcripts_df[ - transcripts_df[self.keys.FEATURE_NAME.value].isin(valid_genes) - ] + transcripts_df = transcripts_df[transcripts_df[self.keys.FEATURE_NAME.value].isin(valid_genes)] final_count = delayed(lambda df: df.shape[0])(transcripts_df) - if self.verbose: print(f"Dropped {initial_count - final_count} transcripts not found in {key} embedding.") + if self.verbose: + print(f"Dropped {initial_count - final_count} transcripts not found in {key} embedding.") # Ensure that the 'OVERLAPS_BOUNDARY' column is boolean if it exists if self.keys.OVERLAPS_BOUNDARY.value in transcripts_df.columns: - transcripts_df[self.keys.OVERLAPS_BOUNDARY.value] = transcripts_df[self.keys.OVERLAPS_BOUNDARY.value].astype(bool) + transcripts_df[self.keys.OVERLAPS_BOUNDARY.value] = transcripts_df[ + self.keys.OVERLAPS_BOUNDARY.value + ].astype(bool) return transcripts_df - def load_boundaries( - self, - path: Path, - file_format: str = "parquet", - x_min: float = None, - x_max: float = None, - y_min: float = None, - y_max: float = None + self, + path: Path, + file_format: str = "parquet", + x_min: float = None, + x_max: float = None, + y_min: float = None, + y_max: float = None, ) -> dd.DataFrame: """ Load boundaries data lazily 
using Dask, filtering by the specified bounding box. @@ -233,7 +228,7 @@ def load_boundaries( """ if file_format != "parquet": raise ValueError(f"Unsupported file format: {file_format}") - + self.boundaries_path = path # Use bounding box values from set_metadata if not explicitly provided @@ -246,15 +241,15 @@ def load_boundaries( columns_to_read = [ self.keys.BOUNDARIES_VERTEX_X.value, self.keys.BOUNDARIES_VERTEX_Y.value, - self.keys.CELL_ID.value + self.keys.CELL_ID.value, ] # Use filters to only load data within the specified bounding box (x_min, x_max, y_min, y_max) filters = [ - (self.keys.BOUNDARIES_VERTEX_X.value, '>=', x_min), - (self.keys.BOUNDARIES_VERTEX_X.value, '<=', x_max), - (self.keys.BOUNDARIES_VERTEX_Y.value, '>=', y_min), - (self.keys.BOUNDARIES_VERTEX_Y.value, '<=', y_max) + (self.keys.BOUNDARIES_VERTEX_X.value, ">=", x_min), + (self.keys.BOUNDARIES_VERTEX_X.value, "<=", x_max), + (self.keys.BOUNDARIES_VERTEX_Y.value, ">=", y_min), + (self.keys.BOUNDARIES_VERTEX_Y.value, "<=", y_max), ] # Load the dataset lazily with filters applied for the bounding box @@ -265,17 +260,15 @@ def load_boundaries( lambda x: str(x) if pd.notnull(x) else None ) - if self.verbose: print(f"Loaded boundaries from '{path}' within bounding box ({x_min}, {x_max}, {y_min}, {y_max}).") + if self.verbose: + print(f"Loaded boundaries from '{path}' within bounding box ({x_min}, {x_max}, {y_min}, {y_max}).") return boundaries_df - - - def set_metadata(self) -> None: """ Set metadata for the transcript dataset, including bounding box limits and unique gene names, - without reading the entire Parquet file. Additionally, return integer tokens for unique gene names + without reading the entire Parquet file. Additionally, return integer tokens for unique gene names instead of one-hot encodings and store the lookup table for later mapping. 
""" # Load the Parquet file metadata @@ -287,7 +280,7 @@ def set_metadata(self) -> None: feature_col = self.keys.FEATURE_NAME.value # Initialize variables to track min/max values for X and Y - x_min, x_max, y_min, y_max = float('inf'), float('-inf'), float('inf'), float('-inf') + x_min, x_max, y_min, y_max = float("inf"), float("-inf"), float("inf"), float("-inf") # Extract unique gene names and ensure they're strings gene_set = set() @@ -299,7 +292,7 @@ def set_metadata(self) -> None: "NegControlCodeword_", "BLANK_", "DeprecatedCodeword_", - "UnassignedCodeword_" + "UnassignedCodeword_", ) # Iterate over row groups to extract statistics and unique gene names @@ -316,8 +309,12 @@ def set_metadata(self) -> None: y_max = max(y_max, y_values.max()) # Convert feature values (gene names) to strings and filter out unwanted codewords - feature_values = row_group_table[feature_col].to_pandas().apply( - lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x), + feature_values = ( + row_group_table[feature_col] + .to_pandas() + .apply( + lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), + ) ) # Filter out unwanted codewords @@ -332,11 +329,15 @@ def set_metadata(self) -> None: self.y_min = y_min self.y_max = y_max - if self.verbose: print(f"Bounding box limits set: x_min={self.x_min}, x_max={self.x_max}, y_min={self.y_min}, y_max={self.y_max}") + if self.verbose: + print( + f"Bounding box limits set: x_min={self.x_min}, x_max={self.x_max}, y_min={self.y_min}, y_max={self.y_max}" + ) # Convert the set of unique genes into a sorted list for consistent ordering self.unique_genes = sorted(gene_set) - if self.verbose: print(f"Extracted {len(self.unique_genes)} unique gene names for integer tokenization.") + if self.verbose: + print(f"Extracted {len(self.unique_genes)} unique gene names for integer tokenization.") # Initialize a LabelEncoder to convert unique genes into integer tokens self.tx_encoder = LabelEncoder() @@ -345,18 +346,19 @@ def set_metadata(self) -> None: self.tx_encoder.fit(self.unique_genes) # Store the integer tokens mapping to gene names - self.gene_to_token_map = dict(zip(self.tx_encoder.classes_, self.tx_encoder.transform(self.tx_encoder.classes_))) - + self.gene_to_token_map = dict( + zip(self.tx_encoder.classes_, self.tx_encoder.transform(self.tx_encoder.classes_)) + ) - if self.verbose: print("Integer tokens have been computed and stored based on unique gene names.") + if self.verbose: + print("Integer tokens have been computed and stored based on unique gene names.") # Optional: Create a reverse mapping for lookup purposes (token to gene) self.token_to_gene_map = {v: k for k, v in self.gene_to_token_map.items()} + if self.verbose: + print("Lookup tables (gene_to_token_map and token_to_gene_map) have been created.") - if self.verbose: print("Lookup tables (gene_to_token_map and token_to_gene_map) have been created.") - - def set_embedding(self, embedding_name: str) -> None: """ Set the current embedding type for the transcripts. @@ -370,8 +372,7 @@ def set_embedding(self, embedding_name: str) -> None: self.current_embedding = embedding_name else: raise ValueError(f"Embedding {embedding_name} not found in embeddings_dict.") - - + @staticmethod def create_scaled_polygon(group: pd.DataFrame, scale_factor: float, keys) -> gpd.GeoDataFrame: """ @@ -386,9 +387,9 @@ def create_scaled_polygon(group: pd.DataFrame, scale_factor: float, keys) -> gpd gpd.GeoDataFrame: A GeoDataFrame containing the scaled Polygon and cell_id. 
""" # Extract coordinates and cell ID from the group using keys - x_coords = group[keys['vertex_x']] - y_coords = group[keys['vertex_y']] - cell_id = group[keys['cell_id']].iloc[0] + x_coords = group[keys["vertex_x"]] + y_coords = group[keys["vertex_y"]] + cell_id = group[keys["cell_id"]].iloc[0] # Ensure there are at least 3 points to form a polygon if len(x_coords) >= 3: @@ -398,19 +399,13 @@ def create_scaled_polygon(group: pd.DataFrame, scale_factor: float, keys) -> gpd # Scale the polygon by the provided factor scaled_polygon = polygon.buffer(scale_factor) if scaled_polygon.is_valid and not scaled_polygon.is_empty: - return gpd.GeoDataFrame({ - 'geometry': [scaled_polygon], - keys['cell_id']: [cell_id] - }, geometry='geometry', crs="EPSG:4326") + return gpd.GeoDataFrame( + {"geometry": [scaled_polygon], keys["cell_id"]: [cell_id]}, geometry="geometry", crs="EPSG:4326" + ) # Return an empty GeoDataFrame if no valid polygon is created - return gpd.GeoDataFrame({ - 'geometry': [None], - keys['cell_id']: [cell_id] - }, geometry='geometry', crs="EPSG:4326") - - def generate_and_scale_polygons( - self, boundaries_df: dd.DataFrame, scale_factor: float = 1.0 - ) -> dgpd.GeoDataFrame: + return gpd.GeoDataFrame({"geometry": [None], keys["cell_id"]: [cell_id]}, geometry="geometry", crs="EPSG:4326") + + def generate_and_scale_polygons(self, boundaries_df: dd.DataFrame, scale_factor: float = 1.0) -> dgpd.GeoDataFrame: """ Generate and scale polygons from boundary coordinates using Dask. Keeps class structure intact by using static method for the core polygon generation. @@ -428,40 +423,39 @@ def generate_and_scale_polygons( cell_id_column = self.keys.CELL_ID.value vertex_x_column = self.keys.BOUNDARIES_VERTEX_X.value vertex_y_column = self.keys.BOUNDARIES_VERTEX_Y.value - + create_polygon = self.create_scaled_polygon # Use a lambda to wrap the static method call and avoid passing the function object directly to Dask polygons_ddf = boundaries_df.groupby(cell_id_column).apply( lambda group: create_polygon( - group=group, scale_factor=scale_factor, + group=group, + scale_factor=scale_factor, keys={ # Pass keys as a dict for the lambda function - 'vertex_x': vertex_x_column, - 'vertex_y': vertex_y_column, - 'cell_id': cell_id_column - } + "vertex_x": vertex_x_column, + "vertex_y": vertex_y_column, + "cell_id": cell_id_column, + }, ) ) - + # Lazily compute centroids for each polygon - if self.verbose: print("Adding centroids to the polygons...") - polygons_ddf['centroid_x'] = polygons_ddf.geometry.centroid.x - polygons_ddf['centroid_y'] = polygons_ddf.geometry.centroid.y - + if self.verbose: + print("Adding centroids to the polygons...") + polygons_ddf["centroid_x"] = polygons_ddf.geometry.centroid.x + polygons_ddf["centroid_y"] = polygons_ddf.geometry.centroid.y + polygons_ddf = polygons_ddf.drop_duplicates() # polygons_ddf = polygons_ddf.to_crs("EPSG:3857") return polygons_ddf - - - def compute_transcript_overlap_with_boundaries( self, transcripts_df: dd.DataFrame, boundaries_df: dd.DataFrame = None, polygons_gdf: dgpd.GeoDataFrame = None, - scale_factor: float = 1.0 - ) -> dd.DataFrame: + scale_factor: float = 1.0, + ) -> dd.DataFrame: """ Computes the overlap of transcript locations with scaled boundary polygons and assigns corresponding cell IDs to the transcripts using Dask. @@ -479,15 +473,16 @@ def compute_transcript_overlap_with_boundaries( if polygons_gdf is None: if boundaries_df is None: raise ValueError("Both boundaries_df and polygons_gdf cannot be None. 
Provide at least one.") - + # Generate polygons from boundaries_df if polygons_gdf is None # if self.verbose: print(f"No precomputed polygons provided. Computing polygons from boundaries with a scale factor of {scale_factor}.") polygons_gdf = self.generate_and_scale_polygons(boundaries_df, scale_factor) - + if polygons_gdf.empty(): raise ValueError("No valid polygons were generated from the boundaries.") else: - if self.verbose: print(f"Polygons are available. Proceeding with overlap computation.") + if self.verbose: + print(f"Polygons are available. Proceeding with overlap computation.") # Create a delayed function to check if a point is within any polygon def check_overlap(transcript, polygons_gdf): @@ -508,11 +503,14 @@ def check_overlap(transcript, polygons_gdf): return overlap, cell_id # Apply the check_overlap function in parallel to each row using Dask's map_partitions - if self.verbose: print(f"Starting overlap computation for transcripts with the boundary polygons.") + if self.verbose: + print(f"Starting overlap computation for transcripts with the boundary polygons.") transcripts_df = transcripts_df.map_partitions( lambda df: df.assign( **{ - self.keys.OVERLAPS_BOUNDARY.value: df.apply(lambda row: delayed(check_overlap)(row, polygons_gdf)[0], axis=1), + self.keys.OVERLAPS_BOUNDARY.value: df.apply( + lambda row: delayed(check_overlap)(row, polygons_gdf)[0], axis=1 + ), self.keys.CELL_ID.value: df.apply(lambda row: delayed(check_overlap)(row, polygons_gdf)[1], axis=1), } ) @@ -520,9 +518,6 @@ def check_overlap(transcript, polygons_gdf): return transcripts_df - - - def compute_boundaries_geometries( self, boundaries_df: dd.DataFrame = None, @@ -552,38 +547,47 @@ def compute_boundaries_geometries( if polygons_gdf is None: if boundaries_df is None: raise ValueError("Both boundaries_df and polygons_gdf cannot be None. Provide at least one.") - + # Generate polygons from boundaries_df if polygons_gdf is None - if self.verbose: print(f"No precomputed polygons provided. Computing polygons from boundaries with a scale factor of {scale_factor}.") + if self.verbose: + print( + f"No precomputed polygons provided. Computing polygons from boundaries with a scale factor of {scale_factor}." + ) polygons_gdf = self.generate_and_scale_polygons(boundaries_df, scale_factor) - + # Check if the generated polygons_gdf is empty if polygons_gdf.shape[0] == 0: raise ValueError("No valid polygons were generated from the boundaries.") else: - if self.verbose: print(f"Polygons are available. Proceeding with geometrical computations.") - + if self.verbose: + print(f"Polygons are available. 
Proceeding with geometrical computations.") + # Compute additional geometrical properties polygons = polygons_gdf.geometry # Compute additional geometrical properties if area: - if self.verbose: print("Computing area...") - polygons_gdf['area'] = polygons.area + if self.verbose: + print("Computing area...") + polygons_gdf["area"] = polygons.area if convexity: - if self.verbose: print("Computing convexity...") - polygons_gdf['convexity'] = polygons.convex_hull.area / polygons.area + if self.verbose: + print("Computing convexity...") + polygons_gdf["convexity"] = polygons.convex_hull.area / polygons.area if elongation: - if self.verbose: print("Computing elongation...") + if self.verbose: + print("Computing elongation...") r = polygons.minimum_rotated_rectangle() - polygons_gdf['elongation'] = (r.length * r.length) / r.area + polygons_gdf["elongation"] = (r.length * r.length) / r.area if circularity: - if self.verbose: print("Computing circularity...") + if self.verbose: + print("Computing circularity...") r = polygons_gdf.minimum_bounding_radius() - polygons_gdf['circularity'] = polygons.area / (r * r) + polygons_gdf["circularity"] = polygons.area / (r * r) + + if self.verbose: + print("Geometrical computations completed.") - if self.verbose: print("Geometrical computations completed.") - return polygons_gdf.reset_index(drop=True) def save_dataset_for_segger( @@ -604,9 +608,9 @@ def save_dataset_for_segger( sampling_rate: float = 1, num_workers: int = 1, scale_boundaries: float = 1.0, - method: str = 'kd_tree', + method: str = "kd_tree", gpu: bool = False, - workers: int = 1 + workers: int = 1, ) -> None: """ Saves the dataset for Segger in a processed format using Dask for parallel and lazy processing. @@ -631,49 +635,64 @@ def save_dataset_for_segger( method (str, optional): Method for computing edge indices (e.g., 'kd_tree', 'faiss'). gpu (bool, optional): Whether to use GPU acceleration for edge index computation. workers (int, optional): Number of workers to use to compute the neighborhood graph (per tile). 
- + """ # Prepare directories for storing processed tiles self._prepare_directories(processed_dir) - + # Get x and y coordinate ranges for tiling x_range, y_range = self._get_ranges(d_x, d_y) - + # Generate parameters for each tile tile_params = self._generate_tile_params( - x_range, y_range, x_size, y_size, margin_x, margin_y, compute_labels, - r_tx, k_tx, val_prob, test_prob, neg_sampling_ratio_approx, sampling_rate, - processed_dir, scale_boundaries, method, gpu, workers + x_range, + y_range, + x_size, + y_size, + margin_x, + margin_y, + compute_labels, + r_tx, + k_tx, + val_prob, + test_prob, + neg_sampling_ratio_approx, + sampling_rate, + processed_dir, + scale_boundaries, + method, + gpu, + workers, ) # Process each tile using Dask to parallelize the task - if self.verbose: print("Starting tile processing...") + if self.verbose: + print("Starting tile processing...") tasks = [delayed(self._process_tile)(params) for params in tile_params] - + with ProgressBar(): - # Use Dask to process all tiles in parallel + # Use Dask to process all tiles in parallel dask.compute(*tasks, num_workers=num_workers) - if self.verbose: print("Tile processing completed.") - + if self.verbose: + print("Tile processing completed.") def _prepare_directories(self, processed_dir: Path) -> None: """Prepares directories for saving tiles.""" processed_dir = Path(processed_dir) # by default, convert to Path object - for data_type in ['train', 'test', 'val']: - for data_stage in ['raw', 'processed']: - tile_dir = processed_dir / f'{data_type}_tiles' / data_stage + for data_type in ["train", "test", "val"]: + for data_stage in ["raw", "processed"]: + tile_dir = processed_dir / f"{data_type}_tiles" / data_stage tile_dir.mkdir(parents=True, exist_ok=True) if os.listdir(tile_dir): msg = f"Directory '{tile_dir}' must be empty." raise AssertionError(msg) - def _get_ranges(self, d_x: float, d_y: float) -> Tuple[np.ndarray, np.ndarray]: """Generates ranges for tiling.""" x_range = np.arange(self.x_min // 1000 * 1000, self.x_max, d_x) y_range = np.arange(self.y_min // 1000 * 1000, self.y_max, d_y) return x_range, y_range - + def _generate_tile_params( self, x_range: np.ndarray, @@ -693,7 +712,7 @@ def _generate_tile_params( scale_boundaries: float, method: str, gpu: bool, - workers: int + workers: int, ) -> List[Tuple]: """ Generates parameters for processing tiles using the bounding box approach. @@ -707,22 +726,36 @@ def _generate_tile_params( # Generate tile parameters based on ranges and margins tile_params = [ ( - i, j, x_size, y_size, x_range[i], y_range[j], margin_x, margin_y, - compute_labels, r_tx, k_tx, neg_sampling_ratio_approx, val_prob, - test_prob, processed_dir, scale_boundaries, sampling_rate, - method, gpu, workers + i, + j, + x_size, + y_size, + x_range[i], + y_range[j], + margin_x, + margin_y, + compute_labels, + r_tx, + k_tx, + neg_sampling_ratio_approx, + val_prob, + test_prob, + processed_dir, + scale_boundaries, + sampling_rate, + method, + gpu, + workers, ) - for i in range(len(x_range)) + for i in range(len(x_range)) for j in range(len(y_range)) ] return tile_params - - # def _process_tiles(self, tile_params: List[Tuple], num_workers: int) -> None: # """ # Processes the tiles using Dask's parallelization utilities. 
- + # Parameters: # ----------- # tile_params : List[Tuple] @@ -741,7 +774,6 @@ def _generate_tile_params( # if self.verbose: print("Tile processing completed.") - def _process_tile(self, tile_params: Tuple) -> None: """ Process a single tile using Dask for parallelism and lazy evaluation, and save the data. @@ -751,33 +783,54 @@ def _process_tile(self, tile_params: Tuple) -> None: Parameters for the tile processing. """ ( - i, j, x_size, y_size, x_loc, y_loc, margin_x, margin_y, compute_labels, - r_tx, k_tx, neg_sampling_ratio_approx, val_prob, test_prob, processed_dir, - scale_boundaries, sampling_rate, method, gpu, workers + i, + j, + x_size, + y_size, + x_loc, + y_loc, + margin_x, + margin_y, + compute_labels, + r_tx, + k_tx, + neg_sampling_ratio_approx, + val_prob, + test_prob, + processed_dir, + scale_boundaries, + sampling_rate, + method, + gpu, + workers, ) = tile_params - if self.verbose: print(f"Processing tile at location (x_min: {x_loc}, y_min: {y_loc}), size (width: {x_size}, height: {y_size})") + if self.verbose: + print( + f"Processing tile at location (x_min: {x_loc}, y_min: {y_loc}), size (width: {x_size}, height: {y_size})" + ) # Sampling rate to decide if the tile should be processed if random.random() > sampling_rate: - if self.verbose: print(f"Skipping tile at (x_min: {x_loc}, y_min: {y_loc}) due to sampling rate.") + if self.verbose: + print(f"Skipping tile at (x_min: {x_loc}, y_min: {y_loc}) due to sampling rate.") return # Read only the required boundaries and transcripts for this tile using delayed loading boundaries_df = delayed(self.load_boundaries)( path=self.boundaries_path, - x_min=x_loc - margin_x, - x_max=x_loc + x_size + margin_x, - y_min=y_loc - margin_y, - y_max=y_loc + y_size + margin_y + x_min=x_loc - margin_x, + x_max=x_loc + x_size + margin_x, + y_min=y_loc - margin_y, + y_max=y_loc + y_size + margin_y, ).compute() - + transcripts_df = delayed(self.load_transcripts)( path=self.transcripts_path, x_min=x_loc - margin_x, - x_max=x_loc + x_size , + x_max=x_loc + x_size, y_min=y_loc - margin_y, - y_max=y_loc + y_size + y_max=y_loc + y_size, ).compute() # If no data is found in transcripts or boundaries, skip the tile @@ -788,62 +841,78 @@ def _process_tile(self, tile_params: Tuple) -> None: # If the number of transcripts is less than 20 or the number of nuclei is less than 2, skip the tile if transcripts_df_count < 20 or boundaries_df_count < 2: - if self.verbose: print(f"Dropping tile (x_min: {x_loc}, y_min: {y_loc}) due to insufficient data (transcripts: {transcripts_df_count}, boundaries: {boundaries_df_count}).") + if self.verbose: + print( + f"Dropping tile (x_min: {x_loc}, y_min: {y_loc}) due to insufficient data (transcripts: {transcripts_df_count}, boundaries: {boundaries_df_count})." 
+ ) return # Build PyG data structure from tile-specific data - if self.verbose: print(f"Building PyG data for tile at (x_min: {x_loc}, y_min: {y_loc})...") + if self.verbose: + print(f"Building PyG data for tile at (x_min: {x_loc}, y_min: {y_loc})...") data = delayed(self.build_pyg_data_from_tile)( - boundaries_df, transcripts_df, r_tx=r_tx, k_tx=k_tx, method=method, gpu=gpu, workers=workers, scale_boundaries=scale_boundaries + boundaries_df, + transcripts_df, + r_tx=r_tx, + k_tx=k_tx, + method=method, + gpu=gpu, + workers=workers, + scale_boundaries=scale_boundaries, ) - + data = data.compute() - if self.verbose: print(data) + if self.verbose: + print(data) try: # Probability to assign to train-val-test split prob = random.random() if compute_labels and (prob > test_prob): - if self.verbose: print(f"Computing labels for tile at (x_min: {x_loc}, y_min: {y_loc})...") + if self.verbose: + print(f"Computing labels for tile at (x_min: {x_loc}, y_min: {y_loc})...") transform = RandomLinkSplit( - num_val=0, num_test=0, is_undirected=True, edge_types=[('tx', 'belongs', 'bd')], + num_val=0, + num_test=0, + is_undirected=True, + edge_types=[("tx", "belongs", "bd")], neg_sampling_ratio=neg_sampling_ratio_approx * 2, ) data = delayed(transform)(data).compute()[0] - + # if self.verbose: print(data) # Save the tile data to the appropriate directory based on split - if self.verbose: print(f"Saving data for tile at (x_min: {x_loc}, y_min: {y_loc})...") + if self.verbose: + print(f"Saving data for tile at (x_min: {x_loc}, y_min: {y_loc})...") filename = f"tiles_x={x_loc}_y={y_loc}_w={x_size}_h={y_size}.pt" if prob > val_prob + test_prob: - torch.save(data, processed_dir / 'train_tiles' / 'processed' / filename) + torch.save(data, processed_dir / "train_tiles" / "processed" / filename) elif prob > test_prob: - torch.save(data, processed_dir / 'val_tiles' / 'processed' / filename) + torch.save(data, processed_dir / "val_tiles" / "processed" / filename) else: - torch.save(data, processed_dir / 'test_tiles' / 'processed' / filename) + torch.save(data, processed_dir / "test_tiles" / "processed" / filename) # Use Dask to save the file in parallel # save_task.compute() - if self.verbose: print(f"Tile at (x_min: {x_loc}, y_min: {y_loc}) processed and saved successfully.") + if self.verbose: + print(f"Tile at (x_min: {x_loc}, y_min: {y_loc}) processed and saved successfully.") except Exception as e: - if self.verbose: print(f"Error processing tile at (x_min: {x_loc}, y_min: {y_loc}): {e}") - - + if self.verbose: + print(f"Error processing tile at (x_min: {x_loc}, y_min: {y_loc}): {e}") def build_pyg_data_from_tile( - self, - boundaries_df: dd.DataFrame, - transcripts_df: dd.DataFrame, - r_tx: float = 5.0, - k_tx: int = 3, - method: str = 'kd_tree', - gpu: bool = False, + self, + boundaries_df: dd.DataFrame, + transcripts_df: dd.DataFrame, + r_tx: float = 5.0, + k_tx: int = 3, + method: str = "kd_tree", + gpu: bool = False, workers: int = 1, - scale_boundaries: float = 1.0 - + scale_boundaries: float = 1.0, ) -> HeteroData: """ Builds PyG data from a tile of boundaries and transcripts data using Dask utilities for efficient processing. @@ -857,7 +926,7 @@ def build_pyg_data_from_tile( gpu (bool, optional): Whether to use GPU acceleration for edge index computation. workers (int, optional): Number of workers to use for parallel processing. scale_boundaries (float, optional): The factor by which to scale the boundary polygons. Default is 1.0. - + Returns: HeteroData: PyG Heterogeneous Data object. 
""" @@ -865,100 +934,93 @@ def build_pyg_data_from_tile( data = HeteroData() # Lazily compute boundaries geometries using Dask - if self.verbose: print("Computing boundaries geometries...") + if self.verbose: + print("Computing boundaries geometries...") bd_gdf = self.compute_boundaries_geometries(boundaries_df, scale_factor=scale_boundaries) - bd_gdf = bd_gdf[bd_gdf['geometry'].notnull()] - + bd_gdf = bd_gdf[bd_gdf["geometry"].notnull()] + # Add boundary node data to PyG HeteroData lazily - data['bd'].id = bd_gdf[self.keys.CELL_ID.value].values - data['bd'].pos = torch.as_tensor(bd_gdf[['centroid_x', 'centroid_y']].values.astype(float)) - - if data['bd'].pos.isnan().any(): - raise ValueError(data['bd'].id[data['bd'].pos.isnan().any(1)]) - - bd_x = bd_gdf.iloc[:, 4:] - data['bd'].x = torch.as_tensor(bd_x.to_numpy(), dtype=torch.float32) + data["bd"].id = bd_gdf[self.keys.CELL_ID.value].values + data["bd"].pos = torch.as_tensor(bd_gdf[["centroid_x", "centroid_y"]].values.astype(float)) + + if data["bd"].pos.isnan().any(): + raise ValueError(data["bd"].id[data["bd"].pos.isnan().any(1)]) + bd_x = bd_gdf.iloc[:, 4:] + data["bd"].x = torch.as_tensor(bd_x.to_numpy(), dtype=torch.float32) # Extract the transcript coordinates lazily - if self.verbose: print("Preparing transcript features and positions...") + if self.verbose: + print("Preparing transcript features and positions...") x_xyz = transcripts_df[[self.keys.TRANSCRIPTS_X.value, self.keys.TRANSCRIPTS_Y.value]].to_numpy() - data['tx'].id = torch.as_tensor(transcripts_df[self.keys.TRANSCRIPTS_ID.value].values.astype(int)) - data['tx'].pos = torch.tensor(x_xyz, dtype=torch.float32) + data["tx"].id = torch.as_tensor(transcripts_df[self.keys.TRANSCRIPTS_ID.value].values.astype(int)) + data["tx"].pos = torch.tensor(x_xyz, dtype=torch.float32) - - - # Lazily prepare transcript embeddings (if available) - if self.verbose: print("Preparing transcript embeddings..") + if self.verbose: + print("Preparing transcript embeddings..") token_encoding = self.tx_encoder.transform(transcripts_df[self.keys.FEATURE_NAME.value]) - transcripts_df['token'] = token_encoding # Store the integer tokens in the 'token' column - data['tx'].token = torch.as_tensor(token_encoding).int() + transcripts_df["token"] = token_encoding # Store the integer tokens in the 'token' column + data["tx"].token = torch.as_tensor(token_encoding).int() # Handle additional embeddings lazily as well if not self.embedding_df.empty: - embeddings = delayed(lambda df: self.embedding_df.loc[ - df[self.keys.FEATURE_NAME.value].values - ].values)(transcripts_df) - else: + embeddings = delayed(lambda df: self.embedding_df.loc[df[self.keys.FEATURE_NAME.value].values].values)( + transcripts_df + ) + else: embeddings = token_encoding embeddings = embeddings.compute() x_features = torch.as_tensor(embeddings).int() - data['tx'].x = x_features + data["tx"].x = x_features # Check if the overlap column exists, if not, compute it lazily using Dask if self.keys.OVERLAPS_BOUNDARY.value not in transcripts_df.columns: - if self.verbose: print(f"Computing overlaps for transcripts...") - transcripts_df = self.compute_transcript_overlap_with_boundaries( - transcripts_df, bd_gdf, scale_factor=1.0 - ) + if self.verbose: + print(f"Computing overlaps for transcripts...") + transcripts_df = self.compute_transcript_overlap_with_boundaries(transcripts_df, bd_gdf, scale_factor=1.0) # Connect transcripts with their corresponding boundaries (e.g., nuclei, cells) - if self.verbose: print("Connecting transcripts with 
boundaries...") + if self.verbose: + print("Connecting transcripts with boundaries...") overlaps = transcripts_df[self.keys.OVERLAPS_BOUNDARY.value].values valid_cell_ids = bd_gdf[self.keys.CELL_ID.value].values - ind = np.where( - overlaps & transcripts_df[self.keys.CELL_ID.value].isin(valid_cell_ids) - )[0] - tx_bd_edge_index = np.column_stack(( - ind, - np.searchsorted( - valid_cell_ids, - transcripts_df.iloc[ind][self.keys.CELL_ID.value] - ) - )) + ind = np.where(overlaps & transcripts_df[self.keys.CELL_ID.value].isin(valid_cell_ids))[0] + tx_bd_edge_index = np.column_stack( + (ind, np.searchsorted(valid_cell_ids, transcripts_df.iloc[ind][self.keys.CELL_ID.value])) + ) # Add transcript-boundary edge index to PyG HeteroData - data['tx', 'belongs', 'bd'].edge_index = torch.as_tensor(tx_bd_edge_index.T, dtype=torch.long) + data["tx", "belongs", "bd"].edge_index = torch.as_tensor(tx_bd_edge_index.T, dtype=torch.long) # Compute transcript-to-transcript (tx-tx) edges using Dask (lazy computation) - if self.verbose: print("Computing tx-tx edges...") + if self.verbose: + print("Computing tx-tx edges...") tx_positions = transcripts_df[[self.keys.TRANSCRIPTS_X.value, self.keys.TRANSCRIPTS_Y.value]].values delayed_tx_edge_index = delayed(get_edge_index)( - tx_positions, - tx_positions, - k=k_tx, - dist=r_tx, - method=method, - gpu=gpu, - workers=workers + tx_positions, tx_positions, k=k_tx, dist=r_tx, method=method, gpu=gpu, workers=workers ) tx_edge_index = delayed_tx_edge_index.compute() # Add the tx-tx edge index to the PyG HeteroData object - data['tx', 'neighbors', 'tx'].edge_index = torch.as_tensor(tx_edge_index.T, dtype=torch.long) - - - if self.verbose: print("Finished building PyG data for the tile.") - return data - - - + data["tx", "neighbors", "tx"].edge_index = torch.as_tensor(tx_edge_index.T, dtype=torch.long) + if self.verbose: + print("Finished building PyG data for the tile.") + return data class XeniumSample(SpatialTranscriptomicsSample): - def __init__(self, transcripts_df: dd.DataFrame = None, transcripts_radius: int = 10, boundaries_graph: bool = False, embedding_df: pd.DataFrame = None, verbose: bool = True): - super().__init__(transcripts_df, transcripts_radius, boundaries_graph, embedding_df, XeniumKeys, verbose=verbose) + def __init__( + self, + transcripts_df: dd.DataFrame = None, + transcripts_radius: int = 10, + boundaries_graph: bool = False, + embedding_df: pd.DataFrame = None, + verbose: bool = True, + ): + super().__init__( + transcripts_df, transcripts_radius, boundaries_graph, embedding_df, XeniumKeys, verbose=verbose + ) def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0) -> dd.DataFrame: """ @@ -977,14 +1039,14 @@ def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0) "NegControlCodeword_", "BLANK_", "DeprecatedCodeword_", - "UnassignedCodeword_" + "UnassignedCodeword_", ) # Ensure FEATURE_NAME is a string type for proper filtering (compatible with Dask) # Handle potential bytes to string conversion for Dask DataFrame if pd.api.types.is_object_dtype(transcripts_df[self.keys.FEATURE_NAME.value]): transcripts_df[self.keys.FEATURE_NAME.value] = transcripts_df[self.keys.FEATURE_NAME.value].apply( - lambda x: x.decode('utf-8') if isinstance(x, bytes) else x + lambda x: x.decode("utf-8") if isinstance(x, bytes) else x ) # Apply the quality value filter using Dask @@ -1001,7 +1063,14 @@ def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0) class 
MerscopeSample(SpatialTranscriptomicsSample): - def __init__(self, transcripts_df: dd.DataFrame = None, transcripts_radius: int = 10, boundaries_graph: bool = False, embedding_df: pd.DataFrame = None, verbose: bool = True): + def __init__( + self, + transcripts_df: dd.DataFrame = None, + transcripts_radius: int = 10, + boundaries_graph: bool = False, + embedding_df: pd.DataFrame = None, + verbose: bool = True, + ): super().__init__(transcripts_df, transcripts_radius, boundaries_graph, embedding_df, MerscopeKeys) def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0) -> dd.DataFrame: @@ -1021,5 +1090,3 @@ def filter_transcripts(self, transcripts_df: dd.DataFrame, min_qv: float = 20.0) # Add custom Merscope-specific filtering logic if needed # For now, apply only the quality value filter return transcripts_df[transcripts_df[self.keys.QUALITY_VALUE.value] >= min_qv] - - diff --git a/src/segger/data/parquet/_experimental.py b/src/segger/data/parquet/_experimental.py index f8af0f1..739ff23 100644 --- a/src/segger/data/parquet/_experimental.py +++ b/src/segger/data/parquet/_experimental.py @@ -1,9 +1,9 @@ - from typing import TYPE_CHECKING -if TYPE_CHECKING: # False at runtime +if TYPE_CHECKING: # False at runtime import dask, cudf, dask_cudf, pandas as pd + class BackendHandler: """ A class to handle different DataFrame backends for reading and processing @@ -19,15 +19,15 @@ class BackendHandler: Methods ------- read_parquet(): - Returns the function to read Parquet files according to the selected + Returns the function to read Parquet files according to the selected backend. """ _valid_backends = { - 'pandas', - 'dask', - 'cudf', - 'dask_cudf', + "pandas", + "dask", + "cudf", + "dask_cudf", } def __init__(self, backend): @@ -35,31 +35,31 @@ def __init__(self, backend): if backend in self._valid_backends: self.backend = backend else: - valid = ', '.join(map(lambda o: f"'{o}'", self._valid_backends)) + valid = ", ".join(map(lambda o: f"'{o}'", self._valid_backends)) msg = f"Unsupported backend: {backend}. Valid options are {valid}." 
raise ValueError(msg) # Dynamically import packages only if requested - if self.backend == 'pandas': + if self.backend == "pandas": import pandas as pd - elif self.backend == 'dask': + elif self.backend == "dask": import dask - elif self.backend == 'cudf': + elif self.backend == "cudf": import cudf - elif self.backend == 'dask_cudf': + elif self.backend == "dask_cudf": import dask_cudf else: - raise ValueError('Internal Error') + raise ValueError("Internal Error") @property def read_parquet(self): - if self.backend == 'pandas': + if self.backend == "pandas": return pd.read_parquet - elif self.backend == 'dask': + elif self.backend == "dask": return dask.dataframe.read_parquet - elif self.backend == 'cudf': + elif self.backend == "cudf": return cudf.read_parquet - elif self.backend == 'dask_cudf': + elif self.backend == "dask_cudf": return dask_cudf.read_parquet else: - raise ValueError('Internal Error') \ No newline at end of file + raise ValueError("Internal Error") diff --git a/src/segger/data/parquet/_ndtree.py b/src/segger/data/parquet/_ndtree.py index cc68ef0..bad3ee5 100644 --- a/src/segger/data/parquet/_ndtree.py +++ b/src/segger/data/parquet/_ndtree.py @@ -3,10 +3,11 @@ import numpy as np import math -class NDTree(): + +class NDTree: """ - NDTree is a data structure for recursively splitting multi-dimensional data - into smaller regions until each leaf node contains less than or equal to a + NDTree is a data structure for recursively splitting multi-dimensional data + into smaller regions until each leaf node contains less than or equal to a specified number of points. It stores these regions in a balanced binary tree. @@ -19,7 +20,7 @@ class NDTree(): idx : np.ndarray The indices of the input data points. boxes : list - A list to store the bounding boxes (as shapely polygons) of each region + A list to store the bounding boxes (as shapely polygons) of each region in the tree. rect : Rectangle The bounding box of the entire input data space. @@ -46,7 +47,8 @@ def __init__(self, data, n): self.rect = Rectangle(data.min(0), data.max(0)) self.tree = innernode(self.n, self.idx, self.rect, self) -class innernode(): + +class innernode: """ Represents a node in the NDTree. Each node either stores a bounding box for the data it contains (leaf nodes) or splits the data into two child nodes. @@ -66,7 +68,7 @@ class innernode(): split_point : float The value along the split dimension used to divide the data. less : innernode - The child node containing data points less than or equal to the split + The child node containing data points less than or equal to the split point. greater : innernode The child node containing data points greater than the split point. @@ -85,10 +87,10 @@ def __init__(self, n, idx, rect, tree): else: box = shapely.box(*self.rect.mins, *self.rect.maxes) self.tree.boxes.append(box) - + def split(self): """ - Recursively splits the node's data into two child nodes along the + Recursively splits the node's data into two child nodes along the dimension with the largest spread. 
""" less = math.floor(self.n // 2) @@ -98,19 +100,6 @@ def split(self): data = data[:, self.split_dim] self.split_point = np.quantile(data, less / (less + greater)) mask = data <= self.split_point - less_rect, greater_rect = self.rect.split( - self.split_dim, - self.split_point - ) - self.less = innernode( - less, - self.idx[mask], - less_rect, - self.tree - ) - self.greater = innernode( - greater, - self.idx[~mask], - greater_rect, - self.tree - ) \ No newline at end of file + less_rect, greater_rect = self.rect.split(self.split_dim, self.split_point) + self.less = innernode(less, self.idx[mask], less_rect, self.tree) + self.greater = innernode(greater, self.idx[~mask], greater_rect, self.tree) diff --git a/src/segger/data/parquet/_settings/xenium.yaml b/src/segger/data/parquet/_settings/xenium.yaml index 7304aa7..6c5333e 100644 --- a/src/segger/data/parquet/_settings/xenium.yaml +++ b/src/segger/data/parquet/_settings/xenium.yaml @@ -13,14 +13,14 @@ transcripts: - "BLANK_" - "DeprecatedCodeword_" - "UnassignedCodeword_" - xy: + xy: - "x_location" - "y_location" - xyz: + xyz: - "x_location" - "y_location" - "z_location" - columns: + columns: - "x_location" - "y_location" - "z_location" @@ -36,10 +36,10 @@ boundaries: y: "vertex_y" id: "cell_id" label: "cell_id" - xy: + xy: - "vertex_x" - "vertex_y" - columns: + columns: - "vertex_x" - "vertex_y" - "cell_id" diff --git a/src/segger/data/parquet/_utils.py b/src/segger/data/parquet/_utils.py index 6f29cec..8c3ffec 100644 --- a/src/segger/data/parquet/_utils.py +++ b/src/segger/data/parquet/_utils.py @@ -10,6 +10,7 @@ from pathlib import Path import yaml + def get_xy_extents( filepath, x: str, @@ -50,6 +51,7 @@ def get_xy_extents( bounds = shapely.box(x_min, y_min, x_max, y_max) return bounds + def read_parquet_region( filepath, x: str, @@ -89,14 +91,17 @@ def read_parquet_region( # Find bounds of full file if not supplied if bounds is None: bounds = get_xy_bounds(filepath, x, y) - + # Load pre-filtered data from Parquet file - filters = [[ - (x, '>', bounds.bounds[0]), - (y, '>', bounds.bounds[1]), - (x, '<', bounds.bounds[2]), - (y, '<', bounds.bounds[3]), - ] + extra_filters] + filters = [ + [ + (x, ">", bounds.bounds[0]), + (y, ">", bounds.bounds[1]), + (x, "<", bounds.bounds[2]), + (y, "<", bounds.bounds[3]), + ] + + extra_filters + ] columns = list({x, y} | set(extra_columns)) @@ -107,6 +112,7 @@ def read_parquet_region( ) return region + def get_polygons_from_xy( boundaries: pd.DataFrame, x: str, @@ -114,13 +120,13 @@ def get_polygons_from_xy( label: str, ) -> gpd.GeoSeries: """ - Convert boundary coordinates from a cuDF DataFrame to a GeoSeries of + Convert boundary coordinates from a cuDF DataFrame to a GeoSeries of polygons. Parameters ---------- boundaries : pd.DataFrame - A DataFrame containing the boundary data with x and y coordinates + A DataFrame containing the boundary data with x and y coordinates and identifiers. x : str The name of the column representing the x-coordinate. @@ -133,7 +139,7 @@ def get_polygons_from_xy( Returns ------- gpd.GeoSeries - A GeoSeries containing the polygons created from the boundary + A GeoSeries containing the polygons created from the boundary coordinates. 
""" # Polygon offsets in coords @@ -152,6 +158,7 @@ def get_polygons_from_xy( return gs + def filter_boundaries( boundaries: pd.DataFrame, inset: shapely.Polygon, @@ -161,13 +168,13 @@ def filter_boundaries( label: str, ): """ - Filter boundary polygons based on their overlap with specified inset and + Filter boundary polygons based on their overlap with specified inset and outset regions. Parameters ---------- boundaries : cudf.DataFrame - A DataFrame containing the boundary data with x and y coordinates and + A DataFrame containing the boundary data with x and y coordinates and identifiers. inset : shapely.Polygon A polygon representing the inner region to filter the boundaries. @@ -187,43 +194,46 @@ def filter_boundaries( Notes ----- - The function determines overlaps of boundary polygons with the specified - inset and outset regions. It creates boolean masks for overlaps with the - top, left, right, and bottom sides of the outset region, as well as the - center region defined by the inset polygon. The filtering logic includes + The function determines overlaps of boundary polygons with the specified + inset and outset regions. It creates boolean masks for overlaps with the + top, left, right, and bottom sides of the outset region, as well as the + center region defined by the inset polygon. The filtering logic includes polygons that: - Are completely within the center region. - Overlap with the center and the left side, but not the bottom side. - Overlap with the center and the top side, but not the right side. """ + # Determine overlaps of boundary polygons def in_region(region): in_x = boundaries[x].between(region.bounds[0], region.bounds[2]) in_y = boundaries[y].between(region.bounds[1], region.bounds[3]) return in_x & in_y + x1, y1, x4, y4 = outset.bounds x2, y2, x3, y3 = inset.bounds - boundaries['top'] = in_region(shapely.box(x1, y1, x4, y2)) - boundaries['left'] = in_region(shapely.box(x1, y1, x2, y4)) - boundaries['right'] = in_region(shapely.box(x3, y1, x4, y4)) - boundaries['bottom'] = in_region(shapely.box(x1, y3, x4, y4)) - boundaries['center'] = in_region(inset) + boundaries["top"] = in_region(shapely.box(x1, y1, x4, y2)) + boundaries["left"] = in_region(shapely.box(x1, y1, x2, y4)) + boundaries["right"] = in_region(shapely.box(x3, y1, x4, y4)) + boundaries["bottom"] = in_region(shapely.box(x1, y3, x4, y4)) + boundaries["center"] = in_region(inset) # Filter boundary polygons # Include overlaps with top and left, not bottom and right gb = boundaries.groupby(label, sort=False) - total = gb['center'].transform('size') - in_top = gb['top'].transform('sum') - in_left = gb['left'].transform('sum') - in_right = gb['right'].transform('sum') - in_bottom = gb['bottom'].transform('sum') - in_center = gb['center'].transform('sum') + total = gb["center"].transform("size") + in_top = gb["top"].transform("sum") + in_left = gb["left"].transform("sum") + in_right = gb["right"].transform("sum") + in_bottom = gb["bottom"].transform("sum") + in_center = gb["center"].transform("sum") keep = in_center == total - keep |= ((in_center > 0) & (in_left > 0) & (in_bottom == 0)) - keep |= ((in_center > 0) & (in_top > 0) & (in_right == 0)) + keep |= (in_center > 0) & (in_left > 0) & (in_bottom == 0) + keep |= (in_center > 0) & (in_top > 0) & (in_right == 0) inset_boundaries = boundaries.loc[keep] return inset_boundaries + def filter_transcripts( transcripts_df: pd.DataFrame, label: Optional[str] = None, @@ -256,9 +266,10 @@ def filter_transcripts( mask &= transcripts_df["qv"].ge(min_qv) return 
transcripts_df[mask] + def load_settings(sample_type: str) -> SimpleNamespace: """ - Loads a matching YAML file from the _settings/ directory and converts its + Loads a matching YAML file from the _settings/ directory and converts its contents into a SimpleNamespace. Parameters @@ -276,25 +287,23 @@ def load_settings(sample_type: str) -> SimpleNamespace: ValueError If `sample_type` does not match any filenames. """ - settings_dir = Path(__file__).parent.resolve() / '_settings' + settings_dir = Path(__file__).parent.resolve() / "_settings" # Get a list of YAML filenames (without extensions) in the _settings dir - filenames = [file.stem for file in settings_dir.glob('*.yaml')] + filenames = [file.stem for file in settings_dir.glob("*.yaml")] # Convert sample_type to lowercase and check if it matches any filename sample_type = sample_type.lower() if sample_type not in filenames: - msg = ( - f"Sample type '{sample_type}' not found in settings. " - f"Available options: {', '.join(filenames)}" - ) + msg = f"Sample type '{sample_type}' not found in settings. " f"Available options: {', '.join(filenames)}" raise FileNotFoundError(msg) # Load the matching YAML file yaml_file_path = settings_dir / f"{sample_type}.yaml" - with yaml_file_path.open('r') as file: + with yaml_file_path.open("r") as file: data = yaml.safe_load(file) - + # Convert the YAML data into a SimpleNamespace recursively return _dict_to_namespace(data) + def _dict_to_namespace(d): """ Recursively converts a dictionary to a SimpleNamespace. @@ -302,4 +311,4 @@ def _dict_to_namespace(d): if isinstance(d, dict): d = {k: _dict_to_namespace(v) for k, v in d.items()} return SimpleNamespace(**d) - return d \ No newline at end of file + return d diff --git a/src/segger/data/parquet/pyg_dataset.py b/src/segger/data/parquet/pyg_dataset.py index 5599cb3..d64b9e6 100644 --- a/src/segger/data/parquet/pyg_dataset.py +++ b/src/segger/data/parquet/pyg_dataset.py @@ -5,17 +5,19 @@ from pathlib import Path import torch + class STPyGDataset(InMemoryDataset): """ - An in-memory dataset class for handling training using spatial + An in-memory dataset class for handling training using spatial transcriptomics data. """ + def __init__( self, root: str, transform: Optional[Callable] = None, pre_transform: Optional[Callable] = None, - pre_filter: Optional[Callable] = None + pre_filter: Optional[Callable] = None, ): super().__init__(root, transform, pre_transform, pre_filter) @@ -37,7 +39,7 @@ def processed_file_names(self) -> List[str]: Returns: List[str]: List of processed file names. 
""" - paths = glob.glob(f'{self.processed_dir}/tiles_x*_y*_*_*.pt') + paths = glob.glob(f"{self.processed_dir}/tiles_x*_y*_*_*.pt") # paths = paths.append(paths = glob.glob(f'{self.processed_dir}/tiles_x*_y*_*_*.pt')) file_names = list(map(os.path.basename, paths)) return file_names @@ -63,13 +65,13 @@ def get(self, idx: int) -> Data: """ filepath = Path(self.processed_dir) / self.processed_file_names[idx] data = torch.load(filepath) - data['tx'].x = data['tx'].x.to_dense() - if data['tx'].x.dim() == 1: - data['tx'].x = data['tx'].x.unsqueeze(1) - assert data['tx'].x.dim() == 2 + data["tx"].x = data["tx"].x.to_dense() + if data["tx"].x.dim() == 1: + data["tx"].x = data["tx"].x.unsqueeze(1) + assert data["tx"].x.dim() == 2 # this is an issue in PyG's RandomLinkSplit, dimensions are not consistent if there is only one edge in the graph - if data['tx', 'belongs', 'bd'].edge_label_index.dim() == 1: - data['tx', 'belongs', 'bd'].edge_label_index = data['tx', 'belongs', 'bd'].edge_label_index.unsqueeze(1) - data['tx', 'belongs', 'bd'].edge_label = data['tx', 'belongs', 'bd'].edge_label.unsqueeze(0) - assert data['tx', 'belongs', 'bd'].edge_label_index.dim() == 2 + if data["tx", "belongs", "bd"].edge_label_index.dim() == 1: + data["tx", "belongs", "bd"].edge_label_index = data["tx", "belongs", "bd"].edge_label_index.unsqueeze(1) + data["tx", "belongs", "bd"].edge_label = data["tx", "belongs", "bd"].edge_label.unsqueeze(0) + assert data["tx", "belongs", "bd"].edge_label_index.dim() == 2 return data diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index 5e21366..6937a72 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -21,12 +21,12 @@ # TODO: Add documentation for settings -class STSampleParquet(): +class STSampleParquet: """ A class to manage spatial transcriptomics data stored in parquet files. - This class provides methods for loading, processing, and saving data related - to ST samples. It supports parallel processing and efficient handling of + This class provides methods for loading, processing, and saving data related + to ST samples. It supports parallel processing and efficient handling of transcript and boundary data. """ @@ -51,7 +51,7 @@ def __init__( Raises ------ FileNotFoundError - If the base directory does not exist or the required files are + If the base directory does not exist or the required files are missing. """ # Setup paths and resource constraints @@ -65,18 +65,17 @@ def __init__( # Setup logging logging.basicConfig(level=logging.INFO) - self.logger = logging.Logger(f'STSample@{base_dir}') + self.logger = logging.Logger(f"STSample@{base_dir}") # Internal caches self._extents = None self._transcripts_metadata = None self._boundaries_metadata = None - # Setup default embedding for transcripts - classes = self.transcripts_metadata['feature_names'] + # Setup default embedding for transcripts + classes = self.transcripts_metadata["feature_names"] self._transcript_embedding = TranscriptEmbedding(np.array(classes)) - @classmethod def _get_parquet_metadata( cls, @@ -91,7 +90,7 @@ def _get_parquet_metadata( filepath : os.PathLike The path to the parquet file. columns : Optional[List[str]], default None - List of columns to extract metadata for. If None, all columns + List of columns to extract metadata for. If None, all columns are used. 
Returns @@ -109,13 +108,13 @@ def _get_parquet_metadata( """ # Size in bytes of field dtypes size_map = { - 'BOOLEAN': 1, - 'INT32': 4, - 'FLOAT': 4, - 'INT64': 8, - 'DOUBLE': 8, - 'BYTE_ARRAY': 8, - 'INT96': 12, + "BOOLEAN": 1, + "INT32": 4, + "FLOAT": 4, + "INT64": 8, + "DOUBLE": 8, + "BYTE_ARRAY": 8, + "INT96": 12, } # Read in metadata @@ -129,21 +128,20 @@ def _get_parquet_metadata( # Grab important fields from metadata summary = dict() - summary['n_rows'] = metadata.num_rows - summary['n_columns'] = len(columns) - summary['column_sizes'] = dict() + summary["n_rows"] = metadata.num_rows + summary["n_columns"] = len(columns) + summary["column_sizes"] = dict() for c in columns: # Error where 10X saved BOOLEAN field as INT32 in schema - if c == 'overlaps_nucleus': - dtype = 'BOOLEAN' + if c == "overlaps_nucleus": + dtype = "BOOLEAN" else: i = metadata.schema.names.index(c) dtype = metadata.schema[i].physical_type - summary['column_sizes'][c] = size_map[dtype] + summary["column_sizes"][c] = size_map[dtype] return summary - @cached_property def transcripts_metadata(self) -> dict: """ @@ -152,7 +150,7 @@ def transcripts_metadata(self) -> dict: Returns ------- dict - Metadata dictionary for transcripts including column sizes and + Metadata dictionary for transcripts including column sizes and feature names. Raises @@ -169,13 +167,12 @@ def transcripts_metadata(self) -> dict: # Get filtered unique feature names table = pq.read_table(self._transcripts_filepath) names = pc.unique(table[self.settings.transcripts.label]) - pattern = '|'.join(self.settings.transcripts.filter_substrings) + pattern = "|".join(self.settings.transcripts.filter_substrings) mask = pc.invert(pc.match_substring_regex(names, pattern)) - metadata['feature_names'] = pc.filter(names, mask).tolist() + metadata["feature_names"] = pc.filter(names, mask).tolist() self._transcripts_metadata = metadata return self._transcripts_metadata - @cached_property def boundaries_metadata(self) -> dict: """ @@ -199,7 +196,6 @@ def boundaries_metadata(self) -> dict: self._boundaries_metadata = metadata return self._boundaries_metadata - @property def n_transcripts(self) -> int: """ @@ -210,8 +206,7 @@ def n_transcripts(self) -> int: int The number of transcripts. """ - return self.transcripts_metadata['n_rows'] - + return self.transcripts_metadata["n_rows"] @cached_property def extents(self) -> shapely.Polygon: @@ -236,7 +231,6 @@ def extents(self) -> shapely.Polygon: return self._extents - def _get_balanced_regions( self, ) -> List[shapely.Polygon]: @@ -252,10 +246,10 @@ def _get_balanced_regions( # If no. workers is 1, return full extents if self.n_workers == 1: return [self.extents] - + # Otherwise, split based on boundary distribution which is much smaller # than transcripts DataFrame. - # Note: Assumes boundaries are distributed similarly to transcripts at + # Note: Assumes boundaries are distributed similarly to transcripts at # a coarse level. data = pd.read_parquet( self._boundaries_filepath, @@ -265,7 +259,6 @@ def _get_balanced_regions( return ndtree.boxes - @staticmethod def _setup_directory( data_dir: os.PathLike, @@ -273,8 +266,8 @@ def _setup_directory( """ Sets up the directory structure for saving processed tiles. - Ensures that the necessary subdirectories for 'train', 'test', and - 'val' are created under the provided base directory. If any of these + Ensures that the necessary subdirectories for 'train', 'test', and + 'val' are created under the provided base directory. 
If any of these subdirectories already exist and are not empty, an error is raised. Directory structure created: @@ -298,15 +291,14 @@ def _setup_directory( If any of the 'processed' directories already contain files. """ data_dir = Path(data_dir) # by default, convert to Path object - for tile_type in ['train_tiles', 'test_tiles', 'val_tiles']: - for stage in ['raw', 'processed']: + for tile_type in ["train_tiles", "test_tiles", "val_tiles"]: + for stage in ["raw", "processed"]: tile_dir = data_dir / tile_type / stage tile_dir.mkdir(parents=True, exist_ok=True) if os.listdir(tile_dir): msg = f"Directory '{tile_dir}' must be empty." raise AssertionError(msg) - def set_transcript_embedding(self, weights: pd.DataFrame): """ Sets the transcript embedding for the sample. @@ -319,33 +311,32 @@ def set_transcript_embedding(self, weights: pd.DataFrame): Raises ------ ValueError - If the provided weights do not match the number of transcript + If the provided weights do not match the number of transcript features. """ - classes = self._transcripts_metadata['feature_names'] + classes = self._transcripts_metadata["feature_names"] self._transcript_embedding = TranscriptEmbedding(classes, weights) - def save( self, data_dir: os.PathLike, k_bd: int = 3, - dist_bd: float = 15., + dist_bd: float = 15.0, k_tx: int = 3, - dist_tx: float = 5., + dist_tx: float = 5.0, tile_size: Optional[int] = None, tile_width: Optional[float] = None, tile_height: Optional[float] = None, - neg_sampling_ratio: float = 5., - frac: float = 1., + neg_sampling_ratio: float = 5.0, + frac: float = 1.0, val_prob: float = 0.1, test_prob: float = 0.2, ): """ - Saves the tiles of the sample as PyTorch geometric datasets. See + Saves the tiles of the sample as PyTorch geometric datasets. See documentation for 'STTile' for more information on dataset contents. - Note: This function requires either 'tile_size' OR both 'tile_width' and + Note: This function requires either 'tile_size' OR both 'tile_width' and 'tile_height' to be provided. Parameters @@ -361,7 +352,7 @@ def save( dist_tx : float, optional, default 5.0 Maximum distance for transcript neighbors. tile_size : int, optional - If provided, specifies the size of the tile. Overrides `tile_width` + If provided, specifies the size of the tile. Overrides `tile_width` and `tile_height`. tile_width : int, optional Width of the tiles in pixels. Ignored if `tile_size` is provided. @@ -379,7 +370,7 @@ def save( Raises ------ ValueError - If the 'frac' parameter is greater than 1.0 or if the calculated + If the 'frac' parameter is greater than 1.0 or if the calculated number of tiles is zero. AssertionError If the specified directory structure is not properly set up. 
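A rough usage sketch of save(), assuming `sample` is an already-constructed STSampleParquet for a Xenium run and the output directory is empty (the tile sizes and path below are placeholders, not values from this patch):

sample.save(
    data_dir="path/to/segger_data_tiles",   # hypothetical output directory
    k_bd=3, dist_bd=15.0,                   # boundary-neighbor graph parameters
    k_tx=3, dist_tx=5.0,                    # transcript-neighbor graph parameters
    tile_width=200, tile_height=200,        # either these two OR tile_size
    val_prob=0.1, test_prob=0.2,
)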
@@ -412,7 +403,7 @@ def func(region): for tile in tiles: # Choose training, test, or validation datasets data_type = np.random.choice( - a=['train_tiles', 'test_tiles', 'val_tiles'], + a=["train_tiles", "test_tiles", "val_tiles"], p=[1 - (test_prob + val_prob), test_prob, val_prob], ) xt = STTile(dataset=xm, extents=tile) @@ -425,9 +416,9 @@ def func(region): ) if pyg_data is not None: if pyg_data["tx", "belongs", "bd"].edge_index.numel() == 0: - # this tile is only for testing - data_type = 'test_tiles' - filepath = data_dir / data_type / 'processed' / f'{xt.uid}.pt' + # this tile is only for testing + data_type = "test_tiles" + filepath = data_dir / data_type / "processed" / f"{xt.uid}.pt" torch.save(pyg_data, filepath) # TODO: Add Dask backend @@ -436,12 +427,12 @@ def func(region): # TODO: Add documentation for settings -class STInMemoryDataset(): +class STInMemoryDataset: """ A class for handling in-memory representations of ST data. This class is used to load and manage ST sample data from parquet files, - filter boundaries and transcripts, and provide spatial tiling for further + filter boundaries and transcripts, and provide spatial tiling for further analysis. The class also pre-loads KDTrees for efficient spatial queries. Parameters @@ -467,7 +458,7 @@ class STInMemoryDataset(): The filtered boundaries within the dataset extents. kdtree_tx : KDTree The KDTree for fast spatial queries on the transcripts. - + Raises ------ ValueError @@ -482,7 +473,7 @@ def __init__( ): """ Initializes the STInMemoryDataset instance by loading transcripts - and boundaries from parquet files and pre-loading a KDTree for fast + and boundaries from parquet files and pre-loading a KDTree for fast spatial queries. Parameters @@ -505,11 +496,7 @@ def __init__( self._load_boundaries(self.sample._boundaries_filepath) # Pre-load KDTrees - self.kdtree_tx = KDTree( - self.transcripts[self.settings.transcripts.xy], - leafsize=100 - ) - + self.kdtree_tx = KDTree(self.transcripts[self.settings.transcripts.xy], leafsize=100) def _load_transcripts(self, path: os.PathLike, min_qv: float = 30.0): """ @@ -528,7 +515,7 @@ def _load_transcripts(self, path: os.PathLike, min_qv: float = 30.0): If the transcripts dataframe cannot be loaded or filtered. """ # Load and filter transcripts dataframe - bounds = self.extents.buffer(self.margin, join_style='mitre') + bounds = self.extents.buffer(self.margin, join_style="mitre") transcripts = utils.read_parquet_region( path, x=self.settings.transcripts.x, @@ -542,11 +529,10 @@ def _load_transcripts(self, path: os.PathLike, min_qv: float = 30.0): self.settings.transcripts.filter_substrings, min_qv, ) - + # Only set object properties once everything finishes successfully self.transcripts = transcripts - def _load_boundaries(self, path: os.PathLike): """ Loads and filters the boundaries dataframe for the dataset. @@ -562,7 +548,7 @@ def _load_boundaries(self, path: os.PathLike): If the boundaries dataframe cannot be loaded or filtered. 
""" # Load and filter boundaries dataframe - outset = self.extents.buffer(self.margin, join_style='mitre') + outset = self.extents.buffer(self.margin, join_style="mitre") boundaries = utils.read_parquet_region( path, x=self.settings.boundaries.x, @@ -580,7 +566,6 @@ def _load_boundaries(self, path: os.PathLike): ) self.boundaries = boundaries - def _get_rectangular_tile_bounds( self, tile_width: float, @@ -607,7 +592,7 @@ def _get_rectangular_tile_bounds( x_coords = np.append(x_coords, x_max) y_coords = np.arange(y_min, y_max, tile_height) y_coords = np.append(y_coords, y_max) - + # Generate tiles from grid points tiles = [] for x_min, x_max in zip(x_coords[:-1], x_coords[1:]): @@ -616,7 +601,6 @@ def _get_rectangular_tile_bounds( return tiles - def _get_balanced_tile_bounds( self, max_size: Optional[int], @@ -657,14 +641,14 @@ def recurse(node, bounds): bounds = Rectangle(self.kdtree_tx.mins, self.kdtree_tx.maxes) return recurse(node, bounds) - - def _tile(self, + def _tile( + self, width: Optional[float] = None, height: Optional[float] = None, max_size: Optional[int] = None, - ) -> List[shapely.Polygon]: + ) -> List[shapely.Polygon]: """ - Generates tiles based on either fixed dimensions or balanced + Generates tiles based on either fixed dimensions or balanced partitioning. Parameters @@ -674,7 +658,7 @@ def _tile(self, height : Optional[float] The height of each tile. Required if `max_size` is not provided. max_size : Optional[int] - The maximum number of points in each tile. Required if `width` and + The maximum number of points in each tile. Required if `width` and `height` are not provided. Returns @@ -685,7 +669,7 @@ def _tile(self, Raises ------ ValueError - If both `width`/`height` and `max_size` are provided or none are + If both `width`/`height` and `max_size` are provided or none are provided. """ # Square tiling kwargs provided @@ -697,11 +681,8 @@ def _tile(self, # Bad set of kwargs else: args = list(compress(locals().keys(), locals().values())) - args.remove('self') - msg = ( - "Function requires either 'max_size' or both " - f"'width' and 'height'. Found: {', '.join(args)}." - ) + args.remove("self") + msg = "Function requires either 'max_size' or both " f"'width' and 'height'. Found: {', '.join(args)}." logging.error(msg) raise ValueError @@ -740,9 +721,9 @@ def __init__( Notes ----- - The `boundaries` and `transcripts` attributes are cached to avoid the - overhead of filtering when tiles are instantiated. This is particularly - useful in multiprocessing settings where generating tiles in parallel + The `boundaries` and `transcripts` attributes are cached to avoid the + overhead of filtering when tiles are instantiated. This is particularly + useful in multiprocessing settings where generating tiles in parallel could lead to high overhead. Internal Attributes @@ -761,22 +742,21 @@ def __init__( self._boundaries = None self._transcripts = None - @property def uid(self) -> str: """ - Generates a unique identifier for the tile based on its extents. This - UID is particularly useful for saving or indexing tiles in distributed + Generates a unique identifier for the tile based on its extents. This + UID is particularly useful for saving or indexing tiles in distributed processing environments. The UID is constructed using the minimum and maximum x and y coordinates - of the tile's bounding box, representing its position and size in the + of the tile's bounding box, representing its position and size in the sample. 
Returns ------- str - A unique identifier string in the format + A unique identifier string in the format 'x=_y=_w=_h=' where: - ``: Minimum x-coordinate of the tile's extents. - ``: Minimum y-coordinate of the tile's extents. @@ -790,52 +770,49 @@ def uid(self) -> str: 'x=100_y=200_w=50_h=50' """ x_min, y_min, x_max, y_max = map(int, self.extents.bounds) - uid = f'tiles_x={x_min}_y={y_min}_w={x_max-x_min}_h={y_max-y_min}' + uid = f"tiles_x={x_min}_y={y_min}_w={x_max-x_min}_h={y_max-y_min}" return uid - @cached_property def boundaries(self) -> pd.DataFrame: """ Returns the filtered boundaries within the tile extents, cached for efficiency. - The boundaries are computed only once and cached. If the boundaries - have not been computed yet, they are computed using + The boundaries are computed only once and cached. If the boundaries + have not been computed yet, they are computed using `get_filtered_boundaries()`. Returns ------- pd.DataFrame - A DataFrame containing the filtered boundaries within the tile + A DataFrame containing the filtered boundaries within the tile extents. """ if self._boundaries is None: self._boundaries = self.get_filtered_boundaries() return self._boundaries - @cached_property def transcripts(self) -> pd.DataFrame: """ Returns the filtered transcripts within the tile extents, cached for efficiency. - The transcripts are computed only once and cached. If the transcripts - have not been computed yet, they are computed using + The transcripts are computed only once and cached. If the transcripts + have not been computed yet, they are computed using `get_filtered_transcripts()`. Returns ------- pd.DataFrame - A DataFrame containing the filtered transcripts within the tile + A DataFrame containing the filtered transcripts within the tile extents. """ if self._transcripts is None: self._transcripts = self.get_filtered_transcripts() return self._transcripts - def get_filtered_boundaries(self) -> pd.DataFrame: """ Filters the boundaries in the sample to include only those within @@ -844,20 +821,19 @@ def get_filtered_boundaries(self) -> pd.DataFrame: Returns ------- pd.DataFrame - A DataFrame containing the filtered boundaries within the tile + A DataFrame containing the filtered boundaries within the tile extents. """ filtered_boundaries = utils.filter_boundaries( boundaries=self.dataset.boundaries, inset=self.extents, - outset=self.extents.buffer(self.margin, join_style='mitre'), + outset=self.extents.buffer(self.margin, join_style="mitre"), x=self.settings.boundaries.x, y=self.settings.boundaries.y, label=self.settings.boundaries.label, ) return filtered_boundaries - def get_filtered_transcripts(self) -> pd.DataFrame: """ Filters the transcripts in the sample to include only those within @@ -866,13 +842,13 @@ def get_filtered_transcripts(self) -> pd.DataFrame: Returns ------- pd.DataFrame - A DataFrame containing the filtered transcripts within the tile + A DataFrame containing the filtered transcripts within the tile extents. """ # Buffer tile bounds to include transcripts around boundary - outset = self.extents.buffer(self.margin, join_style='mitre') - xmin, ymin, xmax, ymax = outset.bounds + outset = self.extents.buffer(self.margin, join_style="mitre") + xmin, ymin, xmax, ymax = outset.bounds # Get transcripts inside buffered region x, y = self.settings.transcripts.xy @@ -882,7 +858,6 @@ def get_filtered_transcripts(self) -> pd.DataFrame: return filtered_transcripts - def get_transcript_props(self) -> torch.Tensor: """ Encodes transcript features in a sparse format. 
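The UID format above can be reproduced directly from a tile's bounds; a small sketch with invented coordinates:

import shapely

extents = shapely.box(100, 200, 150, 250)   # toy tile extents
x_min, y_min, x_max, y_max = map(int, extents.bounds)
uid = f"tiles_x={x_min}_y={y_min}_w={x_max - x_min}_h={y_max - y_min}"
print(uid)   # tiles_x=100_y=200_w=50_h=50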
@@ -894,9 +869,9 @@ def get_transcript_props(self) -> torch.Tensor: Notes ----- - The intention is for this function to simplify testing new strategies + The intention is for this function to simplify testing new strategies for 'tx' node representations. For example, the encoder can be any type - of encoder that transforms the transcript labels into a numerical + of encoder that transforms the transcript labels into a numerical matrix (in sparse format). """ # Encode transcript features in sparse format @@ -906,7 +881,6 @@ def get_transcript_props(self) -> torch.Tensor: return props - @staticmethod def get_polygon_props( polygons: gpd.GeoSeries, @@ -938,18 +912,17 @@ def get_polygon_props( """ props = pd.DataFrame(index=polygons.index, dtype=float) if area: - props['area'] = polygons.area + props["area"] = polygons.area if convexity: - props['convexity'] = polygons.convex_hull.area / polygons.area + props["convexity"] = polygons.convex_hull.area / polygons.area if elongation: rects = polygons.minimum_rotated_rectangle() - props['elongation'] = rects.area / polygons.envelope.area + props["elongation"] = rects.area / polygons.envelope.area if circularity: r = polygons.minimum_bounding_radius() - props["circularity"] = polygons.area / r ** 2 - - return props + props["circularity"] = polygons.area / r**2 + return props @staticmethod def get_kdtree_edge_index( @@ -993,7 +966,6 @@ def get_kdtree_edge_index( return edge_index - def get_boundary_props( self, area: bool = True, @@ -1007,29 +979,29 @@ def get_boundary_props( Parameters ---------- area : bool, optional - If True, compute the area of each boundary polygon (default is + If True, compute the area of each boundary polygon (default is True). convexity : bool, optional - If True, compute the convexity of each boundary polygon (default is + If True, compute the convexity of each boundary polygon (default is True). elongation : bool, optional If True, compute the elongation of each boundary polygon (default is True). circularity : bool, optional - If True, compute the circularity of each boundary polygon (default + If True, compute the circularity of each boundary polygon (default is True). Returns ------- torch.Tensor - A tensor containing the computed properties for each boundary + A tensor containing the computed properties for each boundary polygon. Notes ----- - The intention is for this function to simplify testing new strategies + The intention is for this function to simplify testing new strategies for 'bd' node representations. You can just change the function body to - return another torch.Tensor without worrying about changes to the rest + return another torch.Tensor without worrying about changes to the rest of the code. """ # Get polygons from coordinates @@ -1045,10 +1017,9 @@ def get_boundary_props( return props - def to_pyg_dataset( self, - #train: bool, + # train: bool, neg_sampling_ratio: float = 5, k_bd: int = 3, dist_bd: float = 15, @@ -1066,7 +1037,7 @@ def to_pyg_dataset( Parameters ---------- train: bool - Whether a sample is part of the training dataset. If True, add + Whether a sample is part of the training dataset. If True, add negative edges to dataset. k_bd : int, optional The number of nearest neighbors for the 'bd' nodes (default is 4). @@ -1142,7 +1113,7 @@ def to_pyg_dataset( Edge indices in COO format between transcripts and boundaries 3. ("tx", "neighbors", "tx") - Represents the relationship where a transcript is nearby another + Represents the relationship where a transcript is nearby another transcript. 
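The boundary shape descriptors computed in get_polygon_props above work on any GeoSeries; a toy sketch with two made-up polygons, assuming a geopandas version that provides minimum_bounding_radius (used by the patch itself):

import geopandas as gpd
import pandas as pd
import shapely

polygons = gpd.GeoSeries([shapely.box(0, 0, 4, 1), shapely.Point(0, 0).buffer(1.0)])

props = pd.DataFrame(index=polygons.index, dtype=float)
props["area"] = polygons.area
props["convexity"] = polygons.convex_hull.area / polygons.area
rects = polygons.minimum_rotated_rectangle()
props["elongation"] = rects.area / polygons.envelope.area
r = polygons.minimum_bounding_radius()
props["circularity"] = polygons.area / r**2
print(props)   # the elongated box scores lower circularity than the disc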
Attributes @@ -1154,15 +1125,15 @@ def to_pyg_dataset( pyg_data = HeteroData() # Set up Transcript nodes - pyg_data['tx'].id = torch.tensor( + pyg_data["tx"].id = torch.tensor( self.transcripts[self.settings.transcripts.id].values.astype(int), dtype=torch.int, ) - pyg_data['tx'].pos = torch.tensor( + pyg_data["tx"].pos = torch.tensor( self.transcripts[self.settings.transcripts.xyz].values, dtype=torch.float32, ) - pyg_data['tx'].x = self.get_transcript_props() + pyg_data["tx"].x = self.get_transcript_props() # Set up Transcript-Transcript neighbor edges nbrs_edge_idx = self.get_kdtree_edge_index( @@ -1187,11 +1158,9 @@ def to_pyg_dataset( self.settings.boundaries.label, ) centroids = polygons.centroid.get_coordinates() - pyg_data['bd'].id = polygons.index.to_numpy() - pyg_data['bd'].pos = torch.tensor(centroids.values, dtype=torch.float32) - pyg_data['bd'].x = self.get_boundary_props( - area, convexity, elongation, circularity - ) + pyg_data["bd"].id = polygons.index.to_numpy() + pyg_data["bd"].pos = torch.tensor(centroids.values, dtype=torch.float32) + pyg_data["bd"].x = self.get_boundary_props(area, convexity, elongation, circularity) # Set up Boundary-Transcript neighbor edges dist = np.sqrt(polygons.area.max()) * 10 # heuristic distance @@ -1208,16 +1177,14 @@ def to_pyg_dataset( logging.warning(f"No tx-neighbors-bd edges found in tile {self.uid}.") pyg_data["tx", "belongs", "bd"].edge_index = torch.tensor([], dtype=torch.long) return pyg_data - + # Now we identify and split the tx-belongs-bd edges - edge_type = ('tx', 'belongs', 'bd') + edge_type = ("tx", "belongs", "bd") # Find nuclear transcripts tx_cell_ids = self.transcripts[self.settings.boundaries.id] cell_ids_map = {idx: i for (i, idx) in enumerate(polygons.index)} - is_nuclear = self.transcripts[ - self.settings.transcripts.nuclear - ].astype(bool) + is_nuclear = self.transcripts[self.settings.transcripts.nuclear].astype(bool) is_nuclear &= tx_cell_ids.isin(polygons.index) # Set up overlap edges @@ -1242,11 +1209,10 @@ def to_pyg_dataset( ) pyg_data, _, _ = transform(pyg_data) - # Refilter negative edges to include only transcripts in the + # Refilter negative edges to include only transcripts in the # original positive edges (still need a memory-efficient solution) edges = pyg_data[edge_type] - mask = edges.edge_label_index[0].unsqueeze(1) == \ - edges.edge_index[0].unsqueeze(0) + mask = edges.edge_label_index[0].unsqueeze(1) == edges.edge_index[0].unsqueeze(0) mask = torch.nonzero(torch.any(mask, 1)).squeeze() edges.edge_label_index = edges.edge_label_index[:, mask] edges.edge_label = edges.edge_label[mask] diff --git a/src/segger/data/parquet/transcript_embedding.py b/src/segger/data/parquet/transcript_embedding.py index 2f8085c..8abeebc 100644 --- a/src/segger/data/parquet/transcript_embedding.py +++ b/src/segger/data/parquet/transcript_embedding.py @@ -6,14 +6,15 @@ from numpy.typing import ArrayLike import pandas as pd + # TODO: Add documentation class TranscriptEmbedding(torch.nn.Module): - ''' + """ Utility class to handle transcript embeddings in PyTorch so that they are optionally learnable in the future. - + Default behavior is to use the index of gene names. - ''' + """ # TODO: Add documentation @staticmethod @@ -23,26 +24,17 @@ def _check_inputs( ): # Classes is a 1D array if len(classes.shape) > 1: - msg = ( - "'classes' should be a 1D array, got an array of shape " - f"{classes.shape} instead." - ) + msg = "'classes' should be a 1D array, got an array of shape " f"{classes.shape} instead." 
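The negative-edge refiltering at the end of to_pyg_dataset keeps only label edges whose source transcript also appears in a positive edge; a toy tensor sketch of that mask (values invented):

import torch

edge_index = torch.tensor([[0, 2, 5], [1, 1, 0]])                    # positive tx-bd edges
edge_label_index = torch.tensor([[0, 2, 3, 5, 7], [1, 1, 2, 0, 0]])  # positives + sampled negatives
edge_label = torch.tensor([1.0, 1.0, 0.0, 1.0, 0.0])

mask = edge_label_index[0].unsqueeze(1) == edge_index[0].unsqueeze(0)
keep = torch.nonzero(torch.any(mask, 1)).squeeze()
print(edge_label_index[:, keep])   # only columns with sources 0, 2 and 5 survive
print(edge_label[keep])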
raise ValueError(msg) # Items appear exactly once if len(classes) != len(set(classes)): - msg = ( - "All embedding classes must be unique. One or more items in " - "'classes' appears twice." - ) + msg = "All embedding classes must be unique. One or more items in " "'classes' appears twice." raise ValueError(msg) # All classes have an entry in weights elif weights is not None: missing = set(classes).difference(weights.index) if len(missing) > 0: - msg = ( - f"Index of 'weights' DataFrame is missing {len(missing)} " - "entries compared to classes." - ) + msg = f"Index of 'weights' DataFrame is missing {len(missing)} " "entries compared to classes." raise ValueError(msg) # TODO: Add documentation @@ -66,6 +58,6 @@ def embed(self, classes: ArrayLike): indices = LongTensor(self._encoder.transform(classes)) # Default, one-hot encoding if self._weights is None: - return indices #F.one_hot(indices, len(self._encoder.classes_)) + return indices # F.one_hot(indices, len(self._encoder.classes_)) else: return F.embedding(indices, self._weights) diff --git a/src/segger/data/utils.py b/src/segger/data/utils.py index 3abd5b1..b673a87 100644 --- a/src/segger/data/utils.py +++ b/src/segger/data/utils.py @@ -5,6 +5,7 @@ def try_import(module_name): except ImportError: print(f"Warning: {module_name} is not installed. Please install it to use this functionality.") + # Standard imports import pandas as pd import numpy as np @@ -20,6 +21,7 @@ def try_import(module_name): from torch_geometric.nn import radius_graph import os from scipy.spatial import cKDTree + # import hnswlib from shapely.geometry import Polygon from shapely.affinity import scale @@ -28,10 +30,10 @@ def try_import(module_name): import sys # Attempt to import specific modules with try_import function -try_import('multiprocessing') -try_import('joblib') -try_import('faiss') -try_import('cuvs') +try_import("multiprocessing") +try_import("joblib") +try_import("faiss") +try_import("cuvs") try: import cupy as cp from cuvs.neighbors import cagra @@ -42,8 +44,6 @@ def try_import(module_name): from datetime import timedelta - - def filter_transcripts( transcripts_df: pd.DataFrame, min_qv: float = 20.0, @@ -64,7 +64,7 @@ def filter_transcripts( "NegControlCodeword_", "BLANK_", "DeprecatedCodeword_", - "UnassignedCodeword_" + "UnassignedCodeword_", ) mask = transcripts_df["qv"].ge(min_qv) mask &= ~transcripts_df["feature_name"].str.startswith(filter_codewords) @@ -72,9 +72,7 @@ def filter_transcripts( def compute_transcript_metrics( - df: pd.DataFrame, - qv_threshold: float = 30, - cell_id_col: str = 'cell_id' + df: pd.DataFrame, qv_threshold: float = 30, cell_id_col: str = "cell_id" ) -> Dict[str, Any]: """ Computes various metrics for a given dataframe of transcript data filtered by quality value threshold. @@ -92,44 +90,48 @@ def compute_transcript_metrics( - 'percent_non_assigned_cytoplasmic' (float): The percentage of non-assigned cytoplasmic transcripts. - 'gene_metrics' (pd.DataFrame): A dataframe containing gene-level metrics. 
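A toy illustration of the two embedding modes handled by TranscriptEmbedding above: plain integer indices by default, or rows of a weight matrix when per-gene weights are supplied (the gene names and weights here are invented):

import torch
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder

classes = ["ACTA2", "EPCAM", "PTPRC"]
encoder = LabelEncoder().fit(classes)
indices = torch.LongTensor(encoder.transform(["EPCAM", "ACTA2"]))
print(indices)                               # tensor([1, 0]) -- default behaviour

weights = torch.rand(len(classes), 8)        # e.g. cell-type abundance features per gene
print(F.embedding(indices, weights).shape)   # torch.Size([2, 8])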
""" - df_filtered = df[df['qv'] > qv_threshold] + df_filtered = df[df["qv"] > qv_threshold] total_transcripts = len(df_filtered) assigned_transcripts = df_filtered[df_filtered[cell_id_col] != -1] - percent_assigned = len(assigned_transcripts) / (total_transcripts+1) * 100 - cytoplasmic_transcripts = assigned_transcripts[assigned_transcripts['overlaps_nucleus'] != 1] - percent_cytoplasmic = len(cytoplasmic_transcripts) / (len(assigned_transcripts) + 1)* 100 + percent_assigned = len(assigned_transcripts) / (total_transcripts + 1) * 100 + cytoplasmic_transcripts = assigned_transcripts[assigned_transcripts["overlaps_nucleus"] != 1] + percent_cytoplasmic = len(cytoplasmic_transcripts) / (len(assigned_transcripts) + 1) * 100 percent_nucleus = 100 - percent_cytoplasmic non_assigned_transcripts = df_filtered[df_filtered[cell_id_col] == -1] - non_assigned_cytoplasmic = non_assigned_transcripts[non_assigned_transcripts['overlaps_nucleus'] != 1] - percent_non_assigned_cytoplasmic = len(non_assigned_cytoplasmic) / (len(non_assigned_transcripts)+1) * 100 - gene_group_assigned = assigned_transcripts.groupby('feature_name') - gene_group_all = df_filtered.groupby('feature_name') - gene_percent_assigned = (gene_group_assigned.size() / (gene_group_all.size()+1) * 100).reset_index(names='percent_assigned') - cytoplasmic_gene_group = cytoplasmic_transcripts.groupby('feature_name') - gene_percent_cytoplasmic = (cytoplasmic_gene_group.size() / (len(cytoplasmic_transcripts)+1) * 100).reset_index(name='percent_cytoplasmic') - gene_metrics = pd.merge(gene_percent_assigned, gene_percent_cytoplasmic, on='feature_name', how='outer').fillna(0) + non_assigned_cytoplasmic = non_assigned_transcripts[non_assigned_transcripts["overlaps_nucleus"] != 1] + percent_non_assigned_cytoplasmic = len(non_assigned_cytoplasmic) / (len(non_assigned_transcripts) + 1) * 100 + gene_group_assigned = assigned_transcripts.groupby("feature_name") + gene_group_all = df_filtered.groupby("feature_name") + gene_percent_assigned = (gene_group_assigned.size() / (gene_group_all.size() + 1) * 100).reset_index( + names="percent_assigned" + ) + cytoplasmic_gene_group = cytoplasmic_transcripts.groupby("feature_name") + gene_percent_cytoplasmic = (cytoplasmic_gene_group.size() / (len(cytoplasmic_transcripts) + 1) * 100).reset_index( + name="percent_cytoplasmic" + ) + gene_metrics = pd.merge(gene_percent_assigned, gene_percent_cytoplasmic, on="feature_name", how="outer").fillna(0) results = { - 'percent_assigned': percent_assigned, - 'percent_cytoplasmic': percent_cytoplasmic, - 'percent_nucleus': percent_nucleus, - 'percent_non_assigned_cytoplasmic': percent_non_assigned_cytoplasmic, - 'gene_metrics': gene_metrics + "percent_assigned": percent_assigned, + "percent_cytoplasmic": percent_cytoplasmic, + "percent_nucleus": percent_nucleus, + "percent_non_assigned_cytoplasmic": percent_non_assigned_cytoplasmic, + "gene_metrics": gene_metrics, } return results def create_anndata( - df: pd.DataFrame, - panel_df: Optional[pd.DataFrame] = None, - min_transcripts: int = 5, - cell_id_col: str = 'cell_id', - qv_threshold: float = 30, - min_cell_area: float = 10.0, - max_cell_area: float = 1000.0 + df: pd.DataFrame, + panel_df: Optional[pd.DataFrame] = None, + min_transcripts: int = 5, + cell_id_col: str = "cell_id", + qv_threshold: float = 30, + min_cell_area: float = 10.0, + max_cell_area: float = 1000.0, ) -> ad.AnnData: """ Generates an AnnData object from a dataframe of segmented transcriptomics data. 
- + Parameters: df (pd.DataFrame): The dataframe containing segmented transcriptomics data. panel_df (Optional[pd.DataFrame]): The dataframe containing panel information. @@ -138,24 +140,23 @@ def create_anndata( qv_threshold (float): The quality value threshold for filtering transcripts. min_cell_area (float): The minimum cell area to include a cell. max_cell_area (float): The maximum cell area to include a cell. - + Returns: ad.AnnData: The generated AnnData object containing the transcriptomics data and metadata. """ # df_filtered = filter_transcripts(df, min_qv=qv_threshold) df_filtered = df # metrics = compute_transcript_metrics(df_filtered, qv_threshold, cell_id_col) - df_filtered = df_filtered[df_filtered[cell_id_col].astype(str) != '-1'] - pivot_df = df_filtered.rename(columns={ - cell_id_col: "cell", - "feature_name": "gene" - })[['cell', 'gene']].pivot_table(index='cell', columns='gene', aggfunc='size', fill_value=0) + df_filtered = df_filtered[df_filtered[cell_id_col].astype(str) != "-1"] + pivot_df = df_filtered.rename(columns={cell_id_col: "cell", "feature_name": "gene"})[["cell", "gene"]].pivot_table( + index="cell", columns="gene", aggfunc="size", fill_value=0 + ) pivot_df = pivot_df[pivot_df.sum(axis=1) >= min_transcripts] cell_summary = [] for cell_id, cell_data in df_filtered.groupby(cell_id_col): if len(cell_data) < min_transcripts: continue - cell_convex_hull = ConvexHull(cell_data[['x_location', 'y_location']], qhull_options='QJ') + cell_convex_hull = ConvexHull(cell_data[["x_location", "y_location"]], qhull_options="QJ") cell_area = cell_convex_hull.area if cell_area < min_cell_area or cell_area > max_cell_area: continue @@ -167,47 +168,50 @@ def create_anndata( # nucleus_convex_hull = ConvexHull(nucleus_data[['x_location', 'y_location']]) # else: # nucleus_convex_hull = None - cell_summary.append({ - "cell": cell_id, - "cell_centroid_x": cell_data['x_location'].mean(), - "cell_centroid_y": cell_data['y_location'].mean(), - "cell_area": cell_area, - # "nucleus_centroid_x": nucleus_data['x_location'].mean() if len(nucleus_data) > 0 else cell_data['x_location'].mean(), - # "nucleus_centroid_y": nucleus_data['x_location'].mean() if len(nucleus_data) > 0 else cell_data['x_location'].mean(), - # "nucleus_area": nucleus_convex_hull.area if nucleus_convex_hull else 0, - # "percent_cytoplasmic": len(cell_data[cell_data['overlaps_nucleus'] != 1]) / len(cell_data) * 100, - # "has_nucleus": len(nucleus_data) > 0 - }) + cell_summary.append( + { + "cell": cell_id, + "cell_centroid_x": cell_data["x_location"].mean(), + "cell_centroid_y": cell_data["y_location"].mean(), + "cell_area": cell_area, + # "nucleus_centroid_x": nucleus_data['x_location'].mean() if len(nucleus_data) > 0 else cell_data['x_location'].mean(), + # "nucleus_centroid_y": nucleus_data['x_location'].mean() if len(nucleus_data) > 0 else cell_data['x_location'].mean(), + # "nucleus_area": nucleus_convex_hull.area if nucleus_convex_hull else 0, + # "percent_cytoplasmic": len(cell_data[cell_data['overlaps_nucleus'] != 1]) / len(cell_data) * 100, + # "has_nucleus": len(nucleus_data) > 0 + } + ) cell_summary = pd.DataFrame(cell_summary).set_index("cell") if panel_df is not None: - panel_df = panel_df.sort_values('gene') - genes = panel_df['gene'].values + panel_df = panel_df.sort_values("gene") + genes = panel_df["gene"].values for gene in genes: if gene not in pivot_df: pivot_df[gene] = 0 pivot_df = pivot_df[genes.tolist()] if panel_df is None: - var_df = pd.DataFrame([{ - "gene": i, - "feature_types": 'Gene 
Expression', - 'genome': 'Unknown' - } for i in np.unique(pivot_df.columns.values)]).set_index('gene') + var_df = pd.DataFrame( + [ + {"gene": i, "feature_types": "Gene Expression", "genome": "Unknown"} + for i in np.unique(pivot_df.columns.values) + ] + ).set_index("gene") else: - var_df = panel_df[['gene', 'ensembl']].rename(columns={'ensembl':'gene_ids'}) - var_df['feature_types'] = 'Gene Expression' - var_df['genome'] = 'Unknown' - var_df = var_df.set_index('gene') + var_df = panel_df[["gene", "ensembl"]].rename(columns={"ensembl": "gene_ids"}) + var_df["feature_types"] = "Gene Expression" + var_df["genome"] = "Unknown" + var_df = var_df.set_index("gene") # gene_metrics = metrics['gene_metrics'].set_index('feature_name') # var_df = var_df.join(gene_metrics, how='left').fillna(0) cells = list(set(pivot_df.index) & set(cell_summary.index)) - pivot_df = pivot_df.loc[cells,:] - cell_summary = cell_summary.loc[cells,:] + pivot_df = pivot_df.loc[cells, :] + cell_summary = cell_summary.loc[cells, :] adata = ad.AnnData(pivot_df.values) adata.var = var_df - adata.obs['transcripts'] = pivot_df.sum(axis=1).values - adata.obs['unique_transcripts'] = (pivot_df > 0).sum(axis=1).values + adata.obs["transcripts"] = pivot_df.sum(axis=1).values + adata.obs["unique_transcripts"] = (pivot_df > 0).sum(axis=1).values adata.obs_names = pivot_df.index.values.tolist() - adata.obs = pd.merge(adata.obs, cell_summary.loc[adata.obs_names,:], left_index=True, right_index=True) + adata.obs = pd.merge(adata.obs, cell_summary.loc[adata.obs_names, :], left_index=True, right_index=True) # adata.uns['metrics'] = { # 'percent_assigned': metrics['percent_assigned'], # 'percent_cytoplasmic': metrics['percent_cytoplasmic'], @@ -216,10 +220,9 @@ def create_anndata( # } return adata - def calculate_gene_celltype_abundance_embedding(adata: ad.AnnData, celltype_column: str) -> pd.DataFrame: - """Calculate the cell type abundance embedding for each gene based on the percentage of cells in each cell type + """Calculate the cell type abundance embedding for each gene based on the percentage of cells in each cell type that express the gene (non-zero expression). Parameters: @@ -227,9 +230,9 @@ def calculate_gene_celltype_abundance_embedding(adata: ad.AnnData, celltype_colu celltype_column (str): The column name in `adata.obs` that contains the cell type information. Returns: - pd.DataFrame: A DataFrame where rows are genes and columns are cell types, with each value representing + pd.DataFrame: A DataFrame where rows are genes and columns are cell types, with each value representing the percentage of cells in that cell type expressing the gene. - + Example: >>> adata = AnnData(...) 
# Load your scRNA-seq AnnData object >>> celltype_column = 'celltype_major' @@ -255,13 +258,21 @@ def calculate_gene_celltype_abundance_embedding(adata: ad.AnnData, celltype_colu abundance = gene_expression_df[cell_type_mask].mean(axis=0) * 100 cell_type_abundance_list.append(abundance) # Create a DataFrame for the cell type abundance with gene names as rows and cell types as columns - cell_type_abundance_df = pd.DataFrame(cell_type_abundance_list, - columns=adata.var_names, - index=encoder.categories_[0]).T + cell_type_abundance_df = pd.DataFrame( + cell_type_abundance_list, columns=adata.var_names, index=encoder.categories_[0] + ).T return cell_type_abundance_df -def get_edge_index(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: int = 10, method: str = 'kd_tree', - gpu: bool = False, workers: int = 1) -> torch.Tensor: + +def get_edge_index( + coords_1: np.ndarray, + coords_2: np.ndarray, + k: int = 5, + dist: int = 10, + method: str = "kd_tree", + gpu: bool = False, + workers: int = 1, +) -> torch.Tensor: """ Computes edge indices using various methods (KD-Tree, FAISS, RAPIDS::cuvs+cupy (cuda)). @@ -276,23 +287,21 @@ def get_edge_index(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: Returns: torch.Tensor: Edge indices. """ - if method == 'kd_tree': + if method == "kd_tree": return get_edge_index_kdtree(coords_1, coords_2, k=k, dist=dist, workers=workers) - elif method == 'faiss': + elif method == "faiss": return get_edge_index_faiss(coords_1, coords_2, k=k, dist=dist, gpu=gpu) - elif method == 'cuda': + elif method == "cuda": # pass return get_edge_index_cuda(coords_1, coords_2, k=k, dist=dist) else: - msg = ( - f"Unknown method {method}. Valid methods include: 'kd_tree', " - "'faiss', and 'cuda'." - ) + msg = f"Unknown method {method}. Valid methods include: 'kd_tree', " "'faiss', and 'cuda'." raise ValueError() - -def get_edge_index_kdtree(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: int = 10, workers: int = 1) -> torch.Tensor: +def get_edge_index_kdtree( + coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: int = 10, workers: int = 1 +) -> torch.Tensor: """ Computes edge indices using KDTree. @@ -313,15 +322,15 @@ def get_edge_index_kdtree(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5 for idx, valid in enumerate(valid_mask): valid_indices = idx_out[idx][valid] if valid_indices.size > 0: - edges.append( - np.vstack((np.full(valid_indices.shape, idx), valid_indices)).T - ) + edges.append(np.vstack((np.full(valid_indices.shape, idx), valid_indices)).T) edge_index = torch.tensor(np.vstack(edges), dtype=torch.long).contiguous() return edge_index -def get_edge_index_faiss(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: int = 10, gpu: bool = False) -> torch.Tensor: +def get_edge_index_faiss( + coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: int = 10, gpu: bool = False +) -> torch.Tensor: """ Computes edge indices using FAISS. 
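The `get_edge_index` dispatcher above selects between a CPU KD-tree, FAISS, and a CUDA (cuVS) backend, but all three return the same thing: a COO `edge_index` of query-to-neighbour pairs within `dist`. Below is a minimal, self-contained sketch of the KD-tree pattern on toy coordinates (SciPy only); it mirrors, but is not, the packaged `get_edge_index_kdtree`.

```python
import numpy as np
import torch
from scipy.spatial import cKDTree

# Toy stand-ins for boundary ('bd') and transcript ('tx') coordinates.
rng = np.random.default_rng(0)
coords_1 = rng.uniform(0, 100, size=(50, 2))    # indexed set (e.g. boundary centroids)
coords_2 = rng.uniform(0, 100, size=(200, 2))   # query set (e.g. transcript locations)

k, dist = 5, 10.0
tree = cKDTree(coords_1)
d, idx = tree.query(coords_2, k=k, distance_upper_bound=dist)

# cKDTree flags out-of-range neighbours with an infinite distance; keep valid pairs only.
valid = np.isfinite(d)
rows = np.repeat(np.arange(coords_2.shape[0]), k).reshape(-1, k)[valid]
cols = idx[valid]
edge_index = torch.tensor(np.stack([rows, cols]), dtype=torch.long)
print(edge_index.shape)  # [2, num_edges] in COO layout, query index first
```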
@@ -344,30 +353,28 @@ def get_edge_index_faiss(coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, else: index = faiss.IndexFlatL2(d) - index.add(coords_1.astype('float32')) - D, I = index.search(coords_2.astype('float32'), k) + index.add(coords_1.astype("float32")) + D, I = index.search(coords_2.astype("float32"), k) - valid_mask = D < dist ** 2 + valid_mask = D < dist**2 edges = [] for idx, valid in enumerate(valid_mask): valid_indices = I[idx][valid] if valid_indices.size > 0: - edges.append( - np.vstack((np.full(valid_indices.shape, idx), valid_indices)).T - ) + edges.append(np.vstack((np.full(valid_indices.shape, idx), valid_indices)).T) edge_index = torch.tensor(np.vstack(edges), dtype=torch.long).contiguous() return edge_index def get_edge_index_cuda( - coords_1: torch.Tensor, - coords_2: torch.Tensor, - k: int = 10, + coords_1: torch.Tensor, + coords_2: torch.Tensor, + k: int = 10, dist: float = 10.0, metric: str = "sqeuclidean", - nn_descent_niter: int = 100 + nn_descent_niter: int = 100, ) -> torch.Tensor: """ Computes edge indices using RAPIDS cuVS with cagra for vector similarity search, @@ -382,11 +389,14 @@ def get_edge_index_cuda( Returns: torch.Tensor: Edge indices as a PyTorch tensor on CUDA. """ + def cupy_to_torch(cupy_array): return torch.from_dlpack((cupy_array.toDlpack())) + # gg def torch_to_cupy(tensor): return cp.fromDlpack(dlpack.to_dlpack(tensor)) + # Convert PyTorch tensors (CUDA) to CuPy arrays using DLPack cp_coords_1 = torch_to_cupy(coords_1).astype(cp.float32) cp_coords_2 = torch_to_cupy(coords_2).astype(cp.float32) @@ -394,14 +404,16 @@ def torch_to_cupy(tensor): cp_dist = cp.float32(dist) # IndexParams and SearchParams for cagra # compression_params = cagra.CompressionParams(pq_bits=pq_bits) - index_params = cagra.IndexParams(metric=metric,nn_descent_niter=nn_descent_niter) #, compression=compression_params) + index_params = cagra.IndexParams( + metric=metric, nn_descent_niter=nn_descent_niter + ) # , compression=compression_params) search_params = cagra.SearchParams() # Build index using CuPy coords index = cagra.build_index(index_params, cp_coords_1) # Perform search to get distances and indices (still in CuPy) D, I = cagra.search(search_params, index, cp_coords_2, k) # Boolean mask for filtering distances below the squared threshold (all in CuPy) - valid_mask = cp.asarray(D < cp_dist ** 2) + valid_mask = cp.asarray(D < cp_dist**2) # Vectorized operations for row and valid indices (all in CuPy) repeats = valid_mask.sum(axis=1).tolist() row_indices = cp.repeat(cp.arange(len(cp_coords_2)), repeats) @@ -412,6 +424,7 @@ def torch_to_cupy(tensor): edge_index = cupy_to_torch(edges).long().contiguous() return edge_index + class SpatialTranscriptomicsDataset(InMemoryDataset): """A dataset class for handling SpatialTranscriptomics spatial transcriptomics data. @@ -421,7 +434,10 @@ class SpatialTranscriptomicsDataset(InMemoryDataset): pre_transform (callable): A function/transform that takes in a Data object and returns a transformed version. pre_filter (callable): A function that takes in a Data object and returns a boolean indicating whether to keep it. """ - def __init__(self, root: str, transform: Callable = None, pre_transform: Callable = None, pre_filter: Callable = None): + + def __init__( + self, root: str, transform: Callable = None, pre_transform: Callable = None, pre_filter: Callable = None + ): """Initialize the SpatialTranscriptomicsDataset. 
Args: @@ -448,16 +464,14 @@ def processed_file_names(self) -> List[str]: Returns: List[str]: List of processed file names. """ - return [x for x in os.listdir(self.processed_dir) if 'tiles' in x] + return [x for x in os.listdir(self.processed_dir) if "tiles" in x] def download(self) -> None: - """Download the raw data. This method should be overridden if you need to download the data. - """ + """Download the raw data. This method should be overridden if you need to download the data.""" pass def process(self) -> None: - """Process the raw data and save it to the processed directory. This method should be overridden if you need to process the data. - """ + """Process the raw data and save it to the processed directory. This method should be overridden if you need to process the data.""" pass def len(self) -> int: @@ -478,7 +492,7 @@ def get(self, idx: int) -> Data: Data: The processed data object. """ data = torch.load(os.path.join(self.processed_dir, self.processed_file_names[idx])) - data['tx'].x = data['tx'].x.to_dense() + data["tx"].x = data["tx"].x.to_dense() return data @@ -531,8 +545,7 @@ def coo_to_dense_adj( # Check COO format if not edge_index.shape[0] == 2: msg = ( - "Edge index is not in COO format. First dimension should have " - f"size 2, but found {edge_index.shape[0]}." + "Edge index is not in COO format. First dimension should have " f"size 2, but found {edge_index.shape[0]}." ) raise ValueError(msg) @@ -547,39 +560,23 @@ def coo_to_dense_adj( # Fill matrix with neighbors nbr_idx = torch.full((num_nodes, num_nbrs), -1) for i, nbrs in zip(uniques, torch.split(edge_index[1], counts)): - nbr_idx[i, :len(nbrs)] = nbrs + nbr_idx[i, : len(nbrs)] = nbrs return nbr_idx - - - def format_time(elapsed: float) -> str: """ Format elapsed time to h:m:s. - + Parameters: ---------- elapsed : float Elapsed time in seconds. - + Returns: ------- str Formatted time in h:m:s. """ return str(timedelta(seconds=int(elapsed))) - - - - - - - - - - - - - diff --git a/src/segger/models/README.md b/src/segger/models/README.md index 1f872b3..033e545 100644 --- a/src/segger/models/README.md +++ b/src/segger/models/README.md @@ -1,4 +1,3 @@ - # segger: Graph Neural Network Model The `segger` model is a graph neural network designed to handle heterogeneous graphs with two primary node types: **transcripts** and **nuclei or cell boundaries**. It leverages attention-based convolutional layers to compute node embeddings and relationships in spatial transcriptomics data. The architecture includes an initial embedding layer for node feature transformation, multiple graph attention layers (GATv2Conv), and residual linear connections. 
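`coo_to_dense_adj` converts a COO edge index into a fixed-width neighbour table padded with `-1`, which the prediction code later uses for batched indexing. A small illustrative re-implementation follows; the helper name `coo_to_padded_neighbors` is hypothetical, and, like the original, it assumes the edges are already grouped by source node.

```python
import torch

def coo_to_padded_neighbors(edge_index: torch.Tensor, num_nodes: int, num_nbrs: int) -> torch.Tensor:
    """Illustrative sketch: pad each node's neighbour list to a fixed width with -1."""
    nbr_idx = torch.full((num_nodes, num_nbrs), -1, dtype=torch.long)
    uniques, counts = torch.unique(edge_index[0], return_counts=True)
    for i, nbrs in zip(uniques.tolist(), torch.split(edge_index[1], counts.tolist())):
        nbr_idx[i, : len(nbrs)] = nbrs
    return nbr_idx

# Node 0 sees nodes 2 and 3; node 1 sees node 3; node 2 has no neighbours.
edge_index = torch.tensor([[0, 0, 1], [2, 3, 3]])
print(coo_to_padded_neighbors(edge_index, num_nodes=3, num_nbrs=2))
# tensor([[ 2,  3],
#         [ 3, -1],
#         [-1, -1]])
```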
@@ -32,7 +31,8 @@ The `segger` model is a graph neural network designed to handle heterogeneous gr $$ where: - - \( \alpha_{ij} \) is the attention coefficient between node \( i \) and node \( j \), computed as: + + - \( \alpha\_{ij} \) is the attention coefficient between node \( i \) and node \( j \), computed as: $$ \alpha_{ij} = \frac{\exp\left( \text{LeakyReLU}\left( \mathbf{a}^{\top} [\mathbf{W}^{(l)} \mathbf{h}_{i}^{(l)} || \mathbf{W}^{(l)} \mathbf{h}_{j}^{(l)}] \right)\right)}{\sum_{k \in \mathcal{N}(i)} \exp\left( \text{LeakyReLU}\left( \mathbf{a}^{\top} [\mathbf{W}^{(l)} \mathbf{h}_{i}^{(l)} || \mathbf{W}^{(l)} \mathbf{h}_{k}^{(l)}] \right)\right)} @@ -47,7 +47,7 @@ The `segger` model is a graph neural network designed to handle heterogeneous gr \mathbf{h}_{i}^{(l+1)} = \text{ReLU}\left( \mathbf{h}_{i}^{(l+1)} + \mathbf{W}_{res} \mathbf{h}_{i}^{(l)} \right) $$ - where \( \mathbf{W}_{res} \) is a residual weight matrix. + where \( \mathbf{W}\_{res} \) is a residual weight matrix. 4. **L2 Normalization**: Finally, the embeddings are normalized using L2 normalization: @@ -62,23 +62,21 @@ The `segger` model is a graph neural network designed to handle heterogeneous gr In the next step, the `segger` model is transformed into a **heterogeneous graph neural network** using PyTorch Geometric's `to_hetero` function. This transformation enables the model to handle distinct node and edge types (transcripts and nuclei or cell boundaries) with separate mechanisms for modeling their relationships. - ## Usage To instantiate and run the segger model: ```python model = segger( - num_tx_tokens=5000, # Number of unique 'tx' tokens - init_emb=32, # Initial embedding dimension - hidden_channels=64, # Number of hidden channels - num_mid_layers=2, # Number of middle layers - out_channels=128, # Number of output channels - heads=4 # Number of attention heads + num_tx_tokens=5000, # Number of unique 'tx' tokens + init_emb=32, # Initial embedding dimension + hidden_channels=64, # Number of hidden channels + num_mid_layers=2, # Number of middle layers + out_channels=128, # Number of output channels + heads=4, # Number of attention heads ) output = model(x, edge_index) ``` Once transformed to a heterogeneous model and trained using PyTorch Lightning, the model can efficiently learn relationships between transcripts and nuclei or cell boundaries. - diff --git a/src/segger/models/__init__.py b/src/segger/models/__init__.py index 1271af3..0a66407 100644 --- a/src/segger/models/__init__.py +++ b/src/segger/models/__init__.py @@ -4,8 +4,6 @@ Contains the implementation of the Segger model using Graph Neural Networks. """ -__all__ = [ - "Segger" - ] +__all__ = ["Segger"] from .segger_model import * diff --git a/src/segger/models/segger_model.py b/src/segger/models/segger_model.py index d2e13ad..6943dab 100644 --- a/src/segger/models/segger_model.py +++ b/src/segger/models/segger_model.py @@ -3,10 +3,20 @@ from torch.nn import Embedding from torch import Tensor from typing import Union -#from torch_sparse import SparseTensor + +# from torch_sparse import SparseTensor + class Segger(torch.nn.Module): - def __init__(self, num_tx_tokens: int, init_emb: int = 16, hidden_channels: int = 32, num_mid_layers: int = 3, out_channels: int = 32, heads: int = 3): + def __init__( + self, + num_tx_tokens: int, + init_emb: int = 16, + hidden_channels: int = 32, + num_mid_layers: int = 3, + out_channels: int = 32, + heads: int = 3, + ): """ Initializes the Segger model. 
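The attention and residual equations in the README are realised with PyTorch Geometric's `GATv2Conv` layers and a `to_hetero` transformation over the `tx`/`bd` node types. The sketch below shows that wiring on random toy data; it requires `torch_geometric`, and the `TinyGAT` name, feature dimensions, and layer sizes are illustrative rather than the shipped `Segger` configuration.

```python
import torch
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import GATv2Conv, to_hetero

class TinyGAT(torch.nn.Module):
    """Two GATv2 layers with lazy (-1, -1) input sizes so `to_hetero` can specialise them
    per edge type; `add_self_loops=False` is needed for the bipartite tx->bd relation."""
    def __init__(self, hidden: int, out: int, heads: int):
        super().__init__()
        self.conv1 = GATv2Conv((-1, -1), hidden, heads=heads, add_self_loops=False)
        self.conv2 = GATv2Conv((-1, -1), out, heads=1, add_self_loops=False)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return F.normalize(self.conv2(x, edge_index), p=2, dim=-1)  # L2-normalised embeddings

metadata = (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")])
model = to_hetero(TinyGAT(hidden=32, out=16, heads=2), metadata, aggr="sum")

# Random toy graph: 100 transcripts with 8 features, 10 boundaries with 4 features.
data = HeteroData()
data["tx"].x = torch.randn(100, 8)
data["bd"].x = torch.randn(10, 4)
data["tx", "belongs", "bd"].edge_index = torch.stack(
    [torch.randint(0, 100, (200,)), torch.randint(0, 10, (200,))]
)
data["tx", "neighbors", "tx"].edge_index = torch.randint(0, 100, (2, 300))

z = model(data.x_dict, data.edge_index_dict)  # lazy layers are initialised on this first call
print(z["tx"].shape, z["bd"].shape)
```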
@@ -54,27 +64,26 @@ def forward(self, x: Tensor, edge_index: Tensor) -> Tensor: Returns: Tensor: Output node embeddings. """ - x = torch.nan_to_num(x, nan = 0) + x = torch.nan_to_num(x, nan=0) is_one_dim = (x.ndim == 1) * 1 - # x = x[:, None] - x = self.tx_embedding(((x.sum(1) * is_one_dim).int())) * is_one_dim + self.lin0(x.float()) * (1 - is_one_dim) + # x = x[:, None] + x = self.tx_embedding(((x.sum(1) * is_one_dim).int())) * is_one_dim + self.lin0(x.float()) * (1 - is_one_dim) # First layer x = x.relu() - x = self.conv_first(x, edge_index) # + self.lin_first(x) + x = self.conv_first(x, edge_index) # + self.lin_first(x) x = x.relu() # Middle layers if self.num_mid_layers > 0: - for conv_mid in self.conv_mid_layers: - x = conv_mid(x, edge_index) # + lin_mid(x) + for conv_mid in self.conv_mid_layers: + x = conv_mid(x, edge_index) # + lin_mid(x) x = x.relu() # Last layer - x = self.conv_last(x, edge_index) # + self.lin_last(x) + x = self.conv_last(x, edge_index) # + self.lin_last(x) return x - def decode(self, z: Tensor, edge_index: Union[Tensor]) -> Tensor: """ Decode the node embeddings to predict edge values. diff --git a/src/segger/prediction/__init__.py b/src/segger/prediction/__init__.py index abc96d9..f82a9cc 100644 --- a/src/segger/prediction/__init__.py +++ b/src/segger/prediction/__init__.py @@ -4,9 +4,6 @@ Contains prediction scripts and utilities for the Segger model. """ -__all__ = [ - "load_model", - "predict" - ] +__all__ = ["load_model", "predict"] from .predict import load_model, predict diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index cf73116..337a3c1 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -40,8 +40,8 @@ from cupyx.scipy.sparse import find # To find non-zero elements in sparse matrix from scipy.sparse.csgraph import connected_components as cc from scipy.sparse import coo_matrix as scipy_coo_matrix -# Setup Dask cluster with 3 workers +# Setup Dask cluster with 3 workers # CONFIG @@ -57,7 +57,7 @@ def load_model(checkpoint_path: str) -> LitSegger: Parameters ---------- checkpoint_path : str - Specific checkpoint file to load, or directory where the model checkpoints are stored. + Specific checkpoint file to load, or directory where the model checkpoints are stored. If directory, the latest checkpoint is loaded. 
Returns @@ -75,13 +75,15 @@ def load_model(checkpoint_path: str) -> LitSegger: # Get last checkpoint if directory is provided if os.path.isdir(checkpoint_path): - checkpoints = glob.glob(str(checkpoint_path / '*.ckpt')) + checkpoints = glob.glob(str(checkpoint_path / "*.ckpt")) if len(checkpoints) == 0: raise FileNotFoundError(msg) + # Sort checkpoints by epoch and step def sort_order(c): - match = re.match(r'.*epoch=(\d+)-step=(\d+).ckpt', c) + match = re.match(r".*epoch=(\d+)-step=(\d+).ckpt", c) return int(match[1]), int(match[2]) + checkpoint_path = Path(sorted(checkpoints, key=sort_order)[-1]) elif not checkpoint_path.exists(): raise FileExistsError(msg) @@ -94,16 +96,11 @@ def sort_order(c): return lit_segger - def get_similarity_scores( - model: torch.nn.Module, - batch: Batch, - from_type: str, - to_type: str, - receptive_field: dict + model: torch.nn.Module, batch: Batch, from_type: str, to_type: str, receptive_field: dict ) -> coo_matrix: """ - Compute similarity scores between embeddings for 'from_type' and 'to_type' nodes + Compute similarity scores between embeddings for 'from_type' and 'to_type' nodes using sparse matrix multiplication with CuPy and the 'sees' edge relation. Args: @@ -113,7 +110,7 @@ def get_similarity_scores( to_type (str): The type of node to which the similarity is computed. Returns: - coo_matrix: A sparse matrix containing the similarity scores between + coo_matrix: A sparse matrix containing the similarity scores between 'from_type' and 'to_type' nodes. """ # Step 1: Get embeddings from the model @@ -122,21 +119,21 @@ def get_similarity_scores( edge_index = get_edge_index( batch[to_type].pos[:, :2], # 'tx' positions batch[from_type].pos[:, :2], # 'bd' positions - k=receptive_field[f'k_{to_type}'], - dist=receptive_field[f'dist_{to_type}'], - method='cuda' + k=receptive_field[f"k_{to_type}"], + dist=receptive_field[f"dist_{to_type}"], + method="cuda", ) edge_index = coo_to_dense_adj( - edge_index.T, - num_nodes=shape[0], - num_nbrs=receptive_field[f'k_{to_type}'], + edge_index.T, + num_nodes=shape[0], + num_nbrs=receptive_field[f"k_{to_type}"], ) - + with torch.no_grad(): embeddings = model(batch.x_dict, batch.edge_index_dict) del batch - + # print(edge_index) # print(embeddings) @@ -144,19 +141,19 @@ def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: m = torch.nn.ZeroPad2d((0, 0, 0, 1)) # pad bottom with zeros similarity = torch.bmm( - m(embeddings[to_type])[edge_index], # 'to' x 'from' neighbors x embed - embeddings[from_type].unsqueeze(-1) # 'to' x embed x 1 - ) # -> 'to' x 'from' neighbors x 1 + m(embeddings[to_type])[edge_index], # 'to' x 'from' neighbors x embed + embeddings[from_type].unsqueeze(-1), # 'to' x embed x 1 + ) # -> 'to' x 'from' neighbors x 1 del embeddings # Sigmoid to get most similar 'to_type' neighbor similarity[similarity == 0] = -torch.inf # ensure zero stays zero similarity = F.sigmoid(similarity) # Neighbor-filtered similarity scores # shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] - indices = torch.argwhere(edge_index != -1).T + indices = torch.argwhere(edge_index != -1).T indices[1] = edge_index[edge_index != -1] - rows = cp.fromDlpack(to_dlpack(indices[0,:].to('cuda'))) - columns = cp.fromDlpack(to_dlpack(indices[1,:].to('cuda'))) + rows = cp.fromDlpack(to_dlpack(indices[0, :].to("cuda"))) + columns = cp.fromDlpack(to_dlpack(indices[1, :].to("cuda"))) # print(rows) del indices values = similarity[edge_index != -1].flatten() @@ -164,7 +161,6 @@ def sparse_multiply(embeddings, edge_index, shape) 
-> coo_matrix: return sparse_result # Free GPU memory after computation - # Call the sparse multiply function sparse_similarity = sparse_multiply(embeddings, edge_index, shape) gc.collect() @@ -175,38 +171,37 @@ def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: return sparse_similarity - - def predict_batch( lit_segger: torch.nn.Module, batch: Batch, score_cut: float, receptive_field: Dict[str, float], use_cc: bool = True, - knn_method: str = 'cuda' + knn_method: str = "cuda", ) -> pd.DataFrame: """ Predict cell assignments for a batch of transcript data using a segmentation model. - Adds a 'bound' column to indicate if the transcript is assigned to a cell (bound=1) + Adds a 'bound' column to indicate if the transcript is assigned to a cell (bound=1) or unassigned (bound=0). Args: lit_segger (torch.nn.Module): The lightning module wrapping the segmentation model. batch (Batch): A batch of transcript and cell data. score_cut (float): The threshold for assigning transcripts to cells based on similarity scores. - receptive_field (Dict[str, float]): Dictionary defining the receptive field for transcript-cell + receptive_field (Dict[str, float]): Dictionary defining the receptive field for transcript-cell and transcript-transcript relations. - use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. + use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. Defaults to True. knn_method (str, optional): The method to use for nearest neighbors. Defaults to 'cuda'. Returns: - pd.DataFrame: A DataFrame containing the transcript IDs, similarity scores, + pd.DataFrame: A DataFrame containing the transcript IDs, similarity scores, assigned cell IDs, and 'bound' column. 
""" + def _get_id(): """Generate a random Xenium-style ID.""" - return ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), 8)) + '-nx' + return "".join(np.random.choice(list("abcdefghijklmnopqrstuvwxyz"), 8)) + "-nx" # Use CuPy with GPU context with cp.cuda.Device(0): @@ -214,10 +209,10 @@ def _get_id(): batch = batch.to("cuda") # Extract transcript IDs and initialize assignments DataFrame - transcript_id = cp.asnumpy(batch['tx'].id) - assignments = pd.DataFrame({'transcript_id': transcript_id}) + transcript_id = cp.asnumpy(batch["tx"].id) + assignments = pd.DataFrame({"transcript_id": transcript_id}) - if len(batch['bd'].pos) >= 10: + if len(batch["bd"].pos) >= 10: # Compute similarity scores between 'tx' and 'bd' scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd", receptive_field) torch.cuda.empty_cache() @@ -227,48 +222,47 @@ def _get_id(): cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory # Get direct assignments from similarity matrix belongs = cp.max(dense_scores, axis=1) # Max score per transcript - assignments['score'] = cp.asnumpy(belongs) # Move back to CPU + assignments["score"] = cp.asnumpy(belongs) # Move back to CPU - mask = assignments['score'] > score_cut - all_ids = np.concatenate(batch['bd'].id) # Keep IDs as NumPy array - assignments['segger_cell_id'] = None # Initialize as None + mask = assignments["score"] > score_cut + all_ids = np.concatenate(batch["bd"].id) # Keep IDs as NumPy array + assignments["segger_cell_id"] = None # Initialize as None max_indices = cp.argmax(dense_scores, axis=1).get() - assignments['segger_cell_id'][mask] = all_ids[max_indices[mask]] # Assign IDs - + assignments["segger_cell_id"][mask] = all_ids[max_indices[mask]] # Assign IDs + del dense_scores # Remove from memory cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory torch.cuda.empty_cache() -# Move back to CPU - assignments['bound'] = 0 - assignments['bound'][mask] = 1 - - + # Move back to CPU + assignments["bound"] = 0 + assignments["bound"][mask] = 1 + if use_cc: # Compute similarity scores between 'tx' and 'tx' scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx", receptive_field) - # Convert to dense NumPy array - data_cpu = scores_tx.data.get() # Transfer data to CPU (NumPy) - row_cpu = scores_tx.row.get() # Transfer row indices to CPU (NumPy) - col_cpu = scores_tx.col.get() # Transfer column indices to CPU (NumPy) + # Convert to dense NumPy array + data_cpu = scores_tx.data.get() # Transfer data to CPU (NumPy) + row_cpu = scores_tx.row.get() # Transfer row indices to CPU (NumPy) + col_cpu = scores_tx.col.get() # Transfer column indices to CPU (NumPy) # dense_scores_tx = scores_tx.toarray().astype(cp.float16) # Rebuild the matrix on CPU using SciPy dense_scores_tx = scipy_coo_matrix((data_cpu, (row_cpu, col_cpu)), shape=scores_tx.shape).toarray() np.fill_diagonal(dense_scores_tx, 0) # Ignore self-similarity - + del scores_tx # Remove from memory cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory # Assign unassigned transcripts using connected components - no_id = assignments['segger_cell_id'].isna() + no_id = assignments["segger_cell_id"].isna() if np.any(no_id): # Only compute if there are unassigned transcripts no_id_scores = dense_scores_tx[no_id][:, no_id] del dense_scores_tx # Remove from memory no_id_scores[no_id_scores < score_cut] = 0 n, comps = cc(no_id_scores, connection="weak", directed=False) new_ids = np.array([_get_id() for _ in range(n)]) - assignments['segger_cell_id'][no_id] = 
new_ids[comps] + assignments["segger_cell_id"][no_id] = new_ids[comps] # Perform memory cleanup to avoid OOM issues cp.get_default_memory_pool().free_all_blocks() @@ -276,9 +270,6 @@ def _get_id(): return assignments - - - def predict( lit_segger: LitSegger, @@ -286,7 +277,7 @@ def predict( score_cut: float, receptive_field: dict, use_cc: bool = True, - knn_method: str = 'cuda' + knn_method: str = "cuda", ) -> pd.DataFrame: # Change return type to Dask DataFrame if applicable """ Optimized prediction for multiple batches of transcript data. @@ -296,7 +287,7 @@ def predict( for batch in data_loader: assignments = predict_batch(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) all_assignments.append(dd.from_pandas(assignments, npartitions=1)) - + cp.get_default_memory_pool().free_all_blocks() torch.cuda.empty_cache() @@ -304,26 +295,26 @@ def predict( final_assignments = dd.concat(all_assignments, ignore_index=True) # Sort the Dask DataFrame by 'transcript_id' before setting it as an index - final_assignments = final_assignments.sort_values(by='transcript_id') + final_assignments = final_assignments.sort_values(by="transcript_id") # Set a unique index for Dask DataFrame - final_assignments = final_assignments.set_index('transcript_id', sorted=True) + final_assignments = final_assignments.set_index("transcript_id", sorted=True) # Max score selection logic - max_bound_idx = final_assignments[final_assignments['bound'] == 1].groupby('transcript_id')['score'].idxmax() - max_unbound_idx = final_assignments[final_assignments['bound'] == 0].groupby('transcript_id')['score'].idxmax() + max_bound_idx = final_assignments[final_assignments["bound"] == 1].groupby("transcript_id")["score"].idxmax() + max_unbound_idx = final_assignments[final_assignments["bound"] == 0].groupby("transcript_id")["score"].idxmax() # Combine indices, prioritizing bound=1 scores final_idx = max_bound_idx.combine_first(max_unbound_idx).compute() # Ensure it's computed # Now use the computed final_idx for indexing - result = final_assignments.loc[final_idx].compute().reset_index(names=['transcript_id']) - + result = final_assignments.loc[final_idx].compute().reset_index(names=["transcript_id"]) + # result = results.reset_index() # Handle cases where there's only one entry per 'segger_cell_id' # single_entry_mask = result.groupby('segger_cell_id').size() == 1 -# Handle cases where there's only one entry per 'segger_cell_id' + # Handle cases where there's only one entry per 'segger_cell_id' # single_entry_counts = result['segger_cell_id'].value_counts() # Count occurrences of each ID # single_entry_mask = single_entry_counts[single_entry_counts == 1].index # Get IDs with a count of 1 @@ -331,27 +322,26 @@ def predict( # for segger_id in single_entry_mask: # result.loc[result['segger_cell_id'] == segger_id, 'segger_cell_id'] = 'floating' - return result def segment( - model: LitSegger, - dm: SeggerDataModule, - save_dir: Union[str, Path], - seg_tag: str, - transcript_file: Union[str, Path], - score_cut: float = .5, + model: LitSegger, + dm: SeggerDataModule, + save_dir: Union[str, Path], + seg_tag: str, + transcript_file: Union[str, Path], + score_cut: float = 0.5, use_cc: bool = True, - file_format: str = 'anndata', - receptive_field: dict = {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}, - knn_method: str = 'kd_tree', + file_format: str = "anndata", + receptive_field: dict = {"k_bd": 4, "dist_bd": 10, "k_tx": 5, "dist_tx": 3}, + knn_method: str = "kd_tree", verbose: bool = False, - **anndata_kwargs + 
**anndata_kwargs, ) -> None: """ Perform segmentation using the model, merge segmentation results with transcripts_df, and save in the specified format. - + Parameters: ---------- model : LitSegger @@ -388,22 +378,22 @@ def segment( # Step 1: Prediction step_start_time = time.time() - + train_dataloader = dm.train_dataloader() - test_dataloader = dm.test_dataloader() - val_dataloader = dm.val_dataloader() - + test_dataloader = dm.test_dataloader() + val_dataloader = dm.val_dataloader() + segmentation_train = predict(model, train_dataloader, score_cut, receptive_field, use_cc, knn_method) torch.cuda.empty_cache() cp.get_default_memory_pool().free_all_blocks() gc.collect() - - segmentation_val = predict(model, val_dataloader, score_cut, receptive_field, use_cc, knn_method) + + segmentation_val = predict(model, val_dataloader, score_cut, receptive_field, use_cc, knn_method) torch.cuda.empty_cache() cp.get_default_memory_pool().free_all_blocks() gc.collect() - - segmentation_test = predict(model, test_dataloader, score_cut, receptive_field, use_cc, knn_method) + + segmentation_test = predict(model, test_dataloader, score_cut, receptive_field, use_cc, knn_method) torch.cuda.empty_cache() cp.get_default_memory_pool().free_all_blocks() gc.collect() @@ -422,7 +412,7 @@ def segment( # print(seg_combined.columns) # print(transcripts_df.id) # Drop any unassigned rows - seg_final = seg_combined.dropna(subset=['segger_cell_id']).reset_index(drop=True) + seg_final = seg_combined.dropna(subset=["segger_cell_id"]).reset_index(drop=True) if verbose: elapsed_time = format_time(time.time() - step_start_time) @@ -440,7 +430,7 @@ def segment( seg_final_dd = dd.from_pandas(seg_final, npartitions=transcripts_df.npartitions) # Merge the segmentation results with the transcript data (still as Dask DataFrame) - transcripts_df_filtered = transcripts_df.merge(seg_final_dd, on='transcript_id', how='inner') + transcripts_df_filtered = transcripts_df.merge(seg_final_dd, on="transcript_id", how="inner") if verbose: elapsed_time = format_time(time.time() - step_start_time) @@ -448,18 +438,18 @@ def segment( # Step 4: Save the merged result step_start_time = time.time() - + if verbose: print(f"Saving results in {file_format} format...") - if file_format == 'csv': - save_path = save_dir / f'{seg_tag}_segmentation.csv' + if file_format == "csv": + save_path = save_dir / f"{seg_tag}_segmentation.csv" transcripts_df_filtered.compute().to_csv(save_path, index=False) # Use pandas after computing - elif file_format == 'parquet': - save_path = save_dir / f'{seg_tag}_segmentation.parquet' + elif file_format == "parquet": + save_path = save_dir / f"{seg_tag}_segmentation.parquet" transcripts_df_filtered.to_parquet(save_path, index=False) # Dask handles Parquet fine - elif file_format == 'anndata': - save_path = save_dir / f'{seg_tag}_segmentation.h5ad' + elif file_format == "anndata": + save_path = save_dir / f"{seg_tag}_segmentation.h5ad" segger_adata = create_anndata(transcripts_df_filtered.compute(), **anndata_kwargs) # Compute for AnnData segger_adata.write(save_path) else: @@ -479,9 +469,6 @@ def segment( torch.cuda.empty_cache() gc.collect() - - - # def predict( # lit_segger: LitSegger, @@ -493,7 +480,7 @@ def segment( # ) -> dd.DataFrame: # """ # Optimized prediction for multiple batches of transcript data using Dask and delayed processing with progress bar. - + # Args: # lit_segger (LitSegger): The lightning module wrapping the segmentation model. 
# data_loader (DataLoader): A data loader providing batches of transcript and cell data. @@ -539,7 +526,7 @@ def segment( # # Handle cases where there's only one entry per 'segger_cell_id' # single_entry_mask = result.groupby('segger_cell_id').size() == 1 # result.loc[single_entry_mask, 'segger_cell_id'] = 'floating' - + # return result # # Map the logic over each partition using Dask @@ -548,14 +535,11 @@ def segment( # # Trigger garbage collection and free GPU memory # torch.cuda.empty_cache() # gc.collect() - -# final_assignments = final_assignments.compute() - - -# return final_assignments +# final_assignments = final_assignments.compute() +# return final_assignments # # def predict( @@ -568,7 +552,7 @@ def segment( # # ) -> dd.DataFrame: # # """ # # Optimized prediction for multiple batches of transcript data using Dask and delayed processing with progress bar. - + # # Args: # # lit_segger (LitSegger): The lightning module wrapping the segmentation model. # # data_loader (DataLoader): A data loader providing batches of transcript and cell data. @@ -596,7 +580,7 @@ def segment( # # delayed(predict_batch)(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) # # for batch in data_loader # # ] - + # # # Build the Dask DataFrame from the delayed assignments # # assignments_dd = dd.from_delayed(delayed_assignments, meta=meta) @@ -612,7 +596,7 @@ def segment( # # # Handle cases where there's only one entry per 'segger_cell_id' # # single_entry_mask = result.groupby('segger_cell_id').size() == 1 # # result.loc[single_entry_mask, 'segger_cell_id'] = 'floating' - + # # return result # # # Map the logic over each partition using Dask @@ -627,22 +611,22 @@ def segment( # def segment( -# model: LitSegger, -# dm: SeggerDataModule, -# save_dir: Union[str, Path], -# seg_tag: str, -# transcript_file: Union[str, Path], +# model: LitSegger, +# dm: SeggerDataModule, +# save_dir: Union[str, Path], +# seg_tag: str, +# transcript_file: Union[str, Path], # score_cut: float = .25, # use_cc: bool = True, -# file_format: str = 'anndata', +# file_format: str = 'anndata', # receptive_field: dict = {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}, # knn_method: str = 'kd_tree', # verbose: bool = False, # **anndata_kwargs # ) -> None: # """ -# Perform segmentation using the model, merge segmentation results with transcripts_df, -# and save in the specified format. Memory is managed efficiently using Dask and GPU +# Perform segmentation using the model, merge segmentation results with transcripts_df, +# and save in the specified format. Memory is managed efficiently using Dask and GPU # memory optimizations. 
# Args: @@ -674,15 +658,15 @@ def segment( # # Step 1: Prediction # step_start_time = time.time() - + # train_dataloader = dm.train_dataloader() # test_dataloader = dm.test_dataloader() # val_dataloader = dm.val_dataloader() - + # # delayed_train = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) # # delayed_val = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) # delayed_test = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) - + # delayed_test = delayed_test.compute() # # Compute all predictions at once using Dask # # with ProgressBar(): @@ -726,7 +710,7 @@ def segment( # # Step 4: Save the merged result # step_start_time = time.time() - + # if verbose: # print(f"Saving results in {file_format} format...") diff --git a/src/segger/training/README.md b/src/segger/training/README.md index ff7e04d..958cd20 100644 --- a/src/segger/training/README.md +++ b/src/segger/training/README.md @@ -7,20 +7,24 @@ The training module makes use of **PyTorch Lightning** for efficient and scalabl ## Key Components ### 1. **SpatialTranscriptomicsDataset** + The `SpatialTranscriptomicsDataset` class is used to load and manage spatial transcriptomics data stored in the format of PyTorch Geometric `Data` objects. It inherits from `InMemoryDataset` to load preprocessed datasets, ensuring efficient in-memory data handling for training and validation phases. - **Root Path**: The root directory contains the dataset, which is expected to have separate folders for training, validation, and test sets. - **Raw and Processed Data**: The module expects datasets in the form of processed PyTorch files, and the dataset class is responsible for loading them efficiently. ### 2. **Segger Model** + The `Segger` model is a custom graph neural network designed to work with heterogeneous graph data. It takes both **transcript (tx)** and **boundary (bd)** nodes, utilizing attention mechanisms for better feature aggregation. Key parameters such as `num_tx_tokens`, `init_emb`, `hidden_channels`, `out_channels`, and `heads` allow the user to control the model's architecture and initial embedding sizes. - **Heterogeneous Graph Support**: The model is converted to handle different node types using `to_hetero` from PyTorch Geometric. The transformation allows the model to handle multiple relations like `belongs` (tx to bd) and `neighbors` (tx to tx). ### 3. **LitSegger** + `LitSegger` is the PyTorch Lightning wrapper around the Segger model, which handles training, validation, and optimization. This wrapper facilitates the integration with Lightning’s trainer, allowing easy multi-GPU and distributed training. ### 4. **Training Pipeline** + The module provides an easily configurable pipeline for training the Segger model: - **Datasets**: Training and validation datasets are loaded using `SpatialTranscriptomicsDataset` with paths provided via arguments. @@ -30,6 +34,7 @@ The module provides an easily configurable pipeline for training the Segger mode ## Usage and Configuration ### Command-Line Arguments + The module accepts various command-line arguments that allow for flexible configuration: - `--train_dir`: Path to the training data directory. This directory should include `processed` and `raw` subdirectories. The direcotry `processed` should include the `pyg` `HeteroData` objects. 
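The training pipeline described above amounts to a `LightningModule` that scores transcript-boundary edges and optimises a binary cross-entropy objective under a standard Lightning `Trainer` (see the `train.py` hunks below). The following is a deliberately tiny, self-contained stand-in for that wiring on dummy tensors; `ToyEdgeClassifier` and its shapes are illustrative and not the project's `LitSegger` API.

```python
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader, TensorDataset

class ToyEdgeClassifier(pl.LightningModule):
    """Tiny stand-in for the LitSegger wiring: BCE-with-logits on per-edge scores."""
    def __init__(self):
        super().__init__()
        self.lin = torch.nn.Linear(8, 1)
        self.criterion = torch.nn.BCEWithLogitsLoss()

    def training_step(self, batch, batch_idx):
        feats, labels = batch                 # per-edge features and 0/1 labels
        logits = self.lin(feats).squeeze(-1)  # raw logits; no sigmoid before the loss
        loss = self.criterion(logits, labels)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

feats = torch.randn(256, 8)
labels = (torch.rand(256) > 0.5).float()
loader = DataLoader(TensorDataset(feats, labels), batch_size=32, shuffle=True)

trainer = pl.Trainer(max_epochs=1, accelerator="cpu", logger=False, enable_checkpointing=False)
trainer.fit(ToyEdgeClassifier(), loader)
```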
@@ -51,6 +56,7 @@ The module accepts various command-line arguments that allow for flexible config - `--default_root_dir`: Directory where logs, checkpoints, and models will be saved. ### Example Training Command + The module can be executed from the command line as follows: ```bash diff --git a/src/segger/training/segger_data_module.py b/src/segger/training/segger_data_module.py index c1be43d..3feadef 100644 --- a/src/segger/training/segger_data_module.py +++ b/src/segger/training/segger_data_module.py @@ -21,9 +21,9 @@ def __init__( # TODO: Add documentation def setup(self, stage=None): - self.train = STPyGDataset(root=self.data_dir / 'train_tiles') - self.test = STPyGDataset(root=self.data_dir / 'test_tiles') - self.val = STPyGDataset(root=self.data_dir / 'val_tiles') + self.train = STPyGDataset(root=self.data_dir / "train_tiles") + self.test = STPyGDataset(root=self.data_dir / "test_tiles") + self.val = STPyGDataset(root=self.data_dir / "val_tiles") self.loader_kwargs = dict( batch_size=self.batch_size, num_workers=self.num_workers, diff --git a/src/segger/training/train.py b/src/segger/training/train.py index a3cf471..68adbb3 100644 --- a/src/segger/training/train.py +++ b/src/segger/training/train.py @@ -60,7 +60,17 @@ def __init__(self, **kwargs): self.validation_step_outputs = [] self.criterion = torch.nn.BCEWithLogitsLoss() - def from_new(self, num_tx_tokens: int, init_emb: int, hidden_channels: int, out_channels: int, heads: int, num_mid_layers: int, aggr: str, metadata: Union[Tuple, Metadata]): + def from_new( + self, + num_tx_tokens: int, + init_emb: int, + hidden_channels: int, + out_channels: int, + heads: int, + num_mid_layers: int, + aggr: str, + metadata: Union[Tuple, Metadata], + ): """ Initializes the LitSegger module with new parameters. @@ -124,7 +134,7 @@ def forward(self, batch: SpatialTranscriptomicsDataset) -> torch.Tensor: The output of the model. 
""" z = self.model(batch.x_dict, batch.edge_index_dict) - output = torch.matmul(z['tx'], z['bd'].t()) # Example for bipartite graph + output = torch.matmul(z["tx"], z["bd"].t()) # Example for bipartite graph return output def training_step(self, batch: Any, batch_idx: int) -> torch.Tensor: @@ -145,16 +155,16 @@ def training_step(self, batch: Any, batch_idx: int) -> torch.Tensor: """ # Forward pass to get the logits z = self.model(batch.x_dict, batch.edge_index_dict) - output = torch.matmul(z['tx'], z['bd'].t()) + output = torch.matmul(z["tx"], z["bd"].t()) # Get edge labels and logits - edge_label_index = batch['tx', 'belongs', 'bd'].edge_label_index + edge_label_index = batch["tx", "belongs", "bd"].edge_label_index out_values = output[edge_label_index[0], edge_label_index[1]] - edge_label = batch['tx', 'belongs', 'bd'].edge_label - + edge_label = batch["tx", "belongs", "bd"].edge_label + # Compute binary cross-entropy loss with logits (no sigmoid here) loss = self.criterion(out_values, edge_label) - + # Log the training loss self.log("train_loss", loss, prog_bar=True, batch_size=batch.num_graphs) return loss @@ -177,31 +187,31 @@ def validation_step(self, batch: Any, batch_idx: int) -> torch.Tensor: """ # Forward pass to get the logits z = self.model(batch.x_dict, batch.edge_index_dict) - output = torch.matmul(z['tx'], z['bd'].t()) + output = torch.matmul(z["tx"], z["bd"].t()) # Get edge labels and logits - edge_label_index = batch['tx', 'belongs', 'bd'].edge_label_index + edge_label_index = batch["tx", "belongs", "bd"].edge_label_index out_values = output[edge_label_index[0], edge_label_index[1]] - edge_label = batch['tx', 'belongs', 'bd'].edge_label - + edge_label = batch["tx", "belongs", "bd"].edge_label + # Compute binary cross-entropy loss with logits (no sigmoid here) loss = self.criterion(out_values, edge_label) - + # Apply sigmoid to logits for AUROC and F1 metrics out_values_prob = torch.sigmoid(out_values) # Compute metrics auroc = torchmetrics.AUROC(task="binary") auroc_res = auroc(out_values_prob, edge_label) - + f1 = F1Score(task="binary").to(self.device) f1_res = f1(out_values_prob, edge_label) - + # Log validation metrics self.log("validation_loss", loss, batch_size=batch.num_graphs) self.log("validation_auroc", auroc_res, prog_bar=True, batch_size=batch.num_graphs) self.log("validation_f1", f1_res, prog_bar=True, batch_size=batch.num_graphs) - + return loss def configure_optimizers(self) -> torch.optim.Optimizer: diff --git a/src/segger/validation/__init__.py b/src/segger/validation/__init__.py index 220150b..bfc7689 100644 --- a/src/segger/validation/__init__.py +++ b/src/segger/validation/__init__.py @@ -1,3 +1,3 @@ from .utils import * -from .xenium_explorer import * \ No newline at end of file +from .xenium_explorer import * diff --git a/src/segger/validation/utils.py b/src/segger/validation/utils.py index b283b00..72a5438 100644 --- a/src/segger/validation/utils.py +++ b/src/segger/validation/utils.py @@ -11,22 +11,20 @@ from matplotlib.backends.backend_pdf import PdfPages import matplotlib.pyplot as plt import dask -dask.config.set({'dataframe.query-planning': False}) + +dask.config.set({"dataframe.query-planning": False}) import squidpy as sq from sklearn.metrics import calinski_harabasz_score, silhouette_score, f1_score from pathlib import Path import seaborn as sns - - - def find_markers( - adata: ad.AnnData, - cell_type_column: str, - pos_percentile: float = 5, - neg_percentile: float = 10, - percentage: float = 50 + adata: ad.AnnData, + cell_type_column: str, + 
pos_percentile: float = 5, + neg_percentile: float = 10, + percentage: float = 50, ) -> Dict[str, Dict[str, List[str]]]: """Identify positive and negative markers for each cell type based on gene expression and filter by expression percentage. @@ -62,17 +60,12 @@ def find_markers( valid_pos_indices = pos_indices[expr_frac >= (percentage / 100)] positive_markers = genes[valid_pos_indices] negative_markers = genes[neg_indices] - markers[cell_type] = { - 'positive': list(positive_markers), - 'negative': list(negative_markers) - } + markers[cell_type] = {"positive": list(positive_markers), "negative": list(negative_markers)} return markers def find_mutually_exclusive_genes( - adata: ad.AnnData, - markers: Dict[str, Dict[str, List[str]]], - cell_type_column: str + adata: ad.AnnData, markers: Dict[str, Dict[str, List[str]]], cell_type_column: str ) -> List[Tuple[str, str]]: """Identify mutually exclusive genes based on expression criteria. @@ -94,7 +87,7 @@ def find_mutually_exclusive_genes( all_exclusive = [] gene_expression = adata.to_df() for cell_type, marker_sets in markers.items(): - positive_markers = marker_sets['positive'] + positive_markers = marker_sets["positive"] exclusive_genes[cell_type] = [] for gene in positive_markers: gene_expr = adata[:, gene].X @@ -104,7 +97,9 @@ def find_mutually_exclusive_genes( exclusive_genes[cell_type].append(gene) all_exclusive.append(gene) unique_genes = list({gene for i in exclusive_genes.keys() for gene in exclusive_genes[i] if gene in all_exclusive}) - filtered_exclusive_genes = {i: [gene for gene in exclusive_genes[i] if gene in unique_genes] for i in exclusive_genes.keys()} + filtered_exclusive_genes = { + i: [gene for gene in exclusive_genes[i] if gene in unique_genes] for i in exclusive_genes.keys() + } mutually_exclusive_gene_pairs = [ (gene1, gene2) for key1, key2 in combinations(filtered_exclusive_genes.keys(), 2) @@ -114,10 +109,7 @@ def find_mutually_exclusive_genes( return mutually_exclusive_gene_pairs -def compute_MECR( - adata: ad.AnnData, - gene_pairs: List[Tuple[str, str]] -) -> Dict[Tuple[str, str], float]: +def compute_MECR(adata: ad.AnnData, gene_pairs: List[Tuple[str, str]]) -> Dict[Tuple[str, str], float]: """Compute the Mutually Exclusive Co-expression Rate (MECR) for each gene pair in an AnnData object. Args: @@ -143,9 +135,7 @@ def compute_MECR( def compute_quantized_mecr_area( - adata: sc.AnnData, - gene_pairs: List[Tuple[str, str]], - quantiles: int = 10 + adata: sc.AnnData, gene_pairs: List[Tuple[str, str]], quantiles: int = 10 ) -> pd.DataFrame: """Compute the average MECR, variance of MECR, and average cell area for quantiles of cell areas. @@ -161,28 +151,28 @@ def compute_quantized_mecr_area( - quantized_data: pd.DataFrame DataFrame containing quantile information, average MECR, variance of MECR, average area, and number of cells. 
""" - adata.obs['quantile'] = pd.qcut(adata.obs['cell_area'], quantiles, labels=False) + adata.obs["quantile"] = pd.qcut(adata.obs["cell_area"], quantiles, labels=False) quantized_data = [] for quantile in range(quantiles): - cells_in_quantile = adata.obs['quantile'] == quantile + cells_in_quantile = adata.obs["quantile"] == quantile mecr = compute_MECR(adata[cells_in_quantile, :], gene_pairs) average_mecr = np.mean([i for i in mecr.values()]) variance_mecr = np.var([i for i in mecr.values()]) - average_area = adata.obs.loc[cells_in_quantile, 'cell_area'].mean() - quantized_data.append({ - 'quantile': quantile / quantiles, - 'average_mecr': average_mecr, - 'variance_mecr': variance_mecr, - 'average_area': average_area, - 'num_cells': cells_in_quantile.sum() - }) + average_area = adata.obs.loc[cells_in_quantile, "cell_area"].mean() + quantized_data.append( + { + "quantile": quantile / quantiles, + "average_mecr": average_mecr, + "variance_mecr": variance_mecr, + "average_area": average_area, + "num_cells": cells_in_quantile.sum(), + } + ) return pd.DataFrame(quantized_data) def compute_quantized_mecr_counts( - adata: sc.AnnData, - gene_pairs: List[Tuple[str, str]], - quantiles: int = 10 + adata: sc.AnnData, gene_pairs: List[Tuple[str, str]], quantiles: int = 10 ) -> pd.DataFrame: """Compute the average MECR, variance of MECR, and average transcript counts for quantiles of transcript counts. @@ -198,28 +188,28 @@ def compute_quantized_mecr_counts( - quantized_data: pd.DataFrame DataFrame containing quantile information, average MECR, variance of MECR, average counts, and number of cells. """ - adata.obs['quantile'] = pd.qcut(adata.obs['transcripts'], quantiles, labels=False) + adata.obs["quantile"] = pd.qcut(adata.obs["transcripts"], quantiles, labels=False) quantized_data = [] for quantile in range(quantiles): - cells_in_quantile = adata.obs['quantile'] == quantile + cells_in_quantile = adata.obs["quantile"] == quantile mecr = compute_MECR(adata[cells_in_quantile, :], gene_pairs) average_mecr = np.mean([i for i in mecr.values()]) variance_mecr = np.var([i for i in mecr.values()]) - average_counts = adata.obs.loc[cells_in_quantile, 'transcripts'].mean() - quantized_data.append({ - 'quantile': quantile / quantiles, - 'average_mecr': average_mecr, - 'variance_mecr': variance_mecr, - 'average_counts': average_counts, - 'num_cells': cells_in_quantile.sum() - }) + average_counts = adata.obs.loc[cells_in_quantile, "transcripts"].mean() + quantized_data.append( + { + "quantile": quantile / quantiles, + "average_mecr": average_mecr, + "variance_mecr": variance_mecr, + "average_counts": average_counts, + "num_cells": cells_in_quantile.sum(), + } + ) return pd.DataFrame(quantized_data) def annotate_query_with_reference( - reference_adata: ad.AnnData, - query_adata: ad.AnnData, - transfer_column: str + reference_adata: ad.AnnData, query_adata: ad.AnnData, transfer_column: str ) -> ad.AnnData: """Annotate query AnnData object using a scRNA-seq reference atlas. 
@@ -238,25 +228,25 @@ def annotate_query_with_reference( common_genes = list(set(reference_adata.var_names) & set(query_adata.var_names)) reference_adata = reference_adata[:, common_genes] query_adata = query_adata[:, common_genes] - query_adata.layers['raw'] = query_adata.raw.X if query_adata.raw else query_adata.X - query_adata.var['raw_counts'] = query_adata.layers['raw'].sum(axis=0) + query_adata.layers["raw"] = query_adata.raw.X if query_adata.raw else query_adata.X + query_adata.var["raw_counts"] = query_adata.layers["raw"].sum(axis=0) sc.pp.normalize_total(query_adata, target_sum=1e4) sc.pp.log1p(query_adata) sc.pp.pca(reference_adata) sc.pp.neighbors(reference_adata) sc.tl.umap(reference_adata) sc.tl.ingest(query_adata, reference_adata, obs=transfer_column) - query_adata.obsm['X_umap'] = query_adata.obsm['X_umap'] + query_adata.obsm["X_umap"] = query_adata.obsm["X_umap"] return query_adata def calculate_contamination( - adata: ad.AnnData, - markers: Dict[str, Dict[str, List[str]]], - radius: float = 15, - n_neighs: int = 10, - celltype_column: str = 'celltype_major', - num_cells: int = 10000 + adata: ad.AnnData, + markers: Dict[str, Dict[str, List[str]]], + radius: float = 15, + n_neighs: int = 10, + celltype_column: str = "celltype_major", + num_cells: int = 10000, ) -> pd.DataFrame: """Calculate normalized contamination from neighboring cells of different cell types based on positive markers. @@ -282,11 +272,11 @@ def calculate_contamination( """ if celltype_column not in adata.obs: raise ValueError("Column celltype_column must be present in adata.obs.") - positive_markers = {ct: markers[ct]['positive'] for ct in markers} + positive_markers = {ct: markers[ct]["positive"] for ct in markers} adata.obsm["spatial"] = adata.obs[["cell_centroid_x", "cell_centroid_y"]].copy().to_numpy() - sq.gr.spatial_neighbors(adata, radius=radius, n_neighs=n_neighs, coord_type='generic') - neighbors = adata.obsp['spatial_connectivities'].tolil() - raw_counts = adata[:, adata.var_names].layers['raw'].toarray() + sq.gr.spatial_neighbors(adata, radius=radius, n_neighs=n_neighs, coord_type="generic") + neighbors = adata.obsp["spatial_connectivities"].tolil() + raw_counts = adata[:, adata.var_names].layers["raw"].toarray() cell_types = adata.obs[celltype_column] selected_cells = np.random.choice(adata.n_obs, size=min(num_cells, adata.n_obs), replace=False) contamination = {ct: {ct2: 0 for ct2 in positive_markers.keys()} for ct in positive_markers.keys()} @@ -309,19 +299,19 @@ def calculate_contamination( if marker in adata.var_names: marker_counts_in_neighbor = raw_counts[neighbor_idx, adata.var_names.get_loc(marker)] if total_counts_in_neighborhood > 0: - contamination[cell_type][neighbor_type] += marker_counts_in_neighbor / total_counts_in_neighborhood + contamination[cell_type][neighbor_type] += ( + marker_counts_in_neighbor / total_counts_in_neighborhood + ) negighborings[cell_type][neighbor_type] += 1 contamination_df = pd.DataFrame(contamination).T negighborings_df = pd.DataFrame(negighborings).T - contamination_df.index.name = 'Source Cell Type' - contamination_df.columns.name = 'Target Cell Type' + contamination_df.index.name = "Source Cell Type" + contamination_df.columns.name = "Target Cell Type" return contamination_df / (negighborings_df + 1) def calculate_sensitivity( - adata: ad.AnnData, - purified_markers: Dict[str, List[str]], - max_cells_per_type: int = 1000 + adata: ad.AnnData, purified_markers: Dict[str, List[str]], max_cells_per_type: int = 1000 ) -> Dict[str, List[float]]: 
"""Calculate the sensitivity of the purified markers for each cell type. @@ -339,8 +329,8 @@ def calculate_sensitivity( """ sensitivity_results = {cell_type: [] for cell_type in purified_markers.keys()} for cell_type, markers in purified_markers.items(): - markers = markers['positive'] - subset = adata[adata.obs['celltype_major'] == cell_type] + markers = markers["positive"] + subset = adata[adata.obs["celltype_major"] == cell_type] if subset.n_obs > max_cells_per_type: cell_indices = np.random.choice(subset.n_obs, max_cells_per_type, replace=False) subset = subset[cell_indices] @@ -352,9 +342,7 @@ def calculate_sensitivity( def compute_clustering_scores( - adata: ad.AnnData, - cell_type_column: str = 'celltype_major', - use_pca: bool = True + adata: ad.AnnData, cell_type_column: str = "celltype_major", use_pca: bool = True ) -> Tuple[float, float]: """Compute the Calinski-Harabasz and Silhouette scores for an AnnData object based on the assigned cell types. @@ -384,11 +372,11 @@ def compute_clustering_scores( def compute_neighborhood_metrics( - adata: ad.AnnData, - radius: float = 10, - celltype_column: str = 'celltype_major', + adata: ad.AnnData, + radius: float = 10, + celltype_column: str = "celltype_major", n_neighs: int = 20, - subset_size: int = 10000 + subset_size: int = 10000, ) -> None: """Compute neighborhood entropy and number of neighbors for each cell in the AnnData object. @@ -418,8 +406,8 @@ def compute_neighborhood_metrics( # Randomly select a subset of cells subset_indices = np.random.choice(adata.n_obs, subset_size, replace=False) # Compute spatial neighbors for the entire dataset - sq.gr.spatial_neighbors(adata, radius=radius, coord_type='generic', n_neighs=n_neighs) - neighbors = adata.obsp['spatial_distances'].tolil().rows + sq.gr.spatial_neighbors(adata, radius=radius, coord_type="generic", n_neighs=n_neighs) + neighbors = adata.obsp["spatial_distances"].tolil().rows entropies = [] num_neighbors = [] # Calculate entropy and number of neighbors only for the selected subset @@ -441,8 +429,8 @@ def compute_neighborhood_metrics( neighbors_full = np.full(adata.n_obs, np.nan) entropy_full[subset_indices] = entropies neighbors_full[subset_indices] = num_neighbors - adata.obs['neighborhood_entropy'] = entropy_full - adata.obs['number_of_neighbors'] = neighbors_full + adata.obs["neighborhood_entropy"] = entropy_full + adata.obs["number_of_neighbors"] = neighbors_full def compute_transcript_density(adata: ad.AnnData) -> None: @@ -453,15 +441,15 @@ def compute_transcript_density(adata: ad.AnnData) -> None: Annotated data object containing transcript and cell area information. 
""" try: - transcript_counts = adata.obs['transcript_counts'] + transcript_counts = adata.obs["transcript_counts"] except: - transcript_counts = adata.obs['transcripts'] - cell_areas = adata.obs['cell_area'] - adata.obs['transcript_density'] = transcript_counts / cell_areas + transcript_counts = adata.obs["transcripts"] + cell_areas = adata.obs["cell_area"] + adata.obs["transcript_density"] = transcript_counts / cell_areas # def compute_celltype_f1_purity( -# adata: ad.AnnData, +# adata: ad.AnnData, # marker_genes: Dict[str, Dict[str, List[str]]] # ) -> Dict[str, float]: # """ @@ -497,7 +485,7 @@ def compute_transcript_density(adata: ad.AnnData) -> None: # def average_log_normalized_expression( -# adata: ad.AnnData, +# adata: ad.AnnData, # celltype_column: str # ) -> pd.DataFrame: # """ @@ -516,18 +504,8 @@ def compute_transcript_density(adata: ad.AnnData) -> None: # return adata.to_df().groupby(adata.obs[celltype_column]).mean() - - - - def plot_metric_comparison( - ax: plt.Axes, - data: pd.DataFrame, - metric: str, - label: str, - method1: str, - method2: str, - output_path: Path + ax: plt.Axes, data: pd.DataFrame, metric: str, label: str, method1: str, method2: str, output_path: Path ) -> None: """Plot a comparison of a specific metric between two methods and save the comparison data. @@ -547,25 +525,22 @@ def plot_metric_comparison( - output_path: Path Path to save the merged DataFrame as a CSV. """ - subset1 = data[data['method'] == method1] - subset2 = data[data['method'] == method2] - merged_data = pd.merge(subset1, subset2, on='celltype_major', suffixes=(f'_{method1}', f'_{method2}')) - + subset1 = data[data["method"] == method1] + subset2 = data[data["method"] == method2] + merged_data = pd.merge(subset1, subset2, on="celltype_major", suffixes=(f"_{method1}", f"_{method2}")) + # Save the merged data used in the plot to CSV - merged_data.to_csv(output_path / f'metric_comparison_{metric}_{method1}_vs_{method2}.csv', index=False) - - for cell_type in merged_data['celltype_major'].unique(): - cell_data = merged_data[merged_data['celltype_major'] == cell_type] - ax.scatter(cell_data[f'{metric}_{method1}'], cell_data[f'{metric}_{method2}'], - label=cell_type) - - max_value = max(merged_data[f'{metric}_{method1}'].max(), merged_data[f'{metric}_{method2}'].max()) - ax.plot([0, max_value], [0, max_value], 'k--', alpha=0.5) - ax.set_xlabel(f'{label} ({method1})') - ax.set_ylabel(f'{label} ({method2})') - ax.set_title(f'{label}: {method1} vs {method2}') + merged_data.to_csv(output_path / f"metric_comparison_{metric}_{method1}_vs_{method2}.csv", index=False) + for cell_type in merged_data["celltype_major"].unique(): + cell_data = merged_data[merged_data["celltype_major"] == cell_type] + ax.scatter(cell_data[f"{metric}_{method1}"], cell_data[f"{metric}_{method2}"], label=cell_type) + max_value = max(merged_data[f"{metric}_{method1}"].max(), merged_data[f"{metric}_{method2}"].max()) + ax.plot([0, max_value], [0, max_value], "k--", alpha=0.5) + ax.set_xlabel(f"{label} ({method1})") + ax.set_ylabel(f"{label} ({method2})") + ax.set_title(f"{label}: {method1} vs {method2}") def load_segmentations(segmentation_paths: Dict[str, Path]) -> Dict[str, sc.AnnData]: @@ -581,16 +556,15 @@ def load_segmentations(segmentation_paths: Dict[str, Path]) -> Dict[str, sc.AnnD for method, path in segmentation_paths.items(): adata = sc.read(path) # Special handling for 'segger' to separate into 'segger_n0' and 'segger_n1' - if method == 'segger': - cells_n1 = [i for i in adata.obs_names if not i.endswith('-nx')] - 
cells_n0 = [i for i in adata.obs_names if i.endswith('-nx')] - segmentations_dict['segger_n1'] = adata[cells_n1, :] - segmentations_dict['segger_n0'] = adata[cells_n0, :] + if method == "segger": + cells_n1 = [i for i in adata.obs_names if not i.endswith("-nx")] + cells_n0 = [i for i in adata.obs_names if i.endswith("-nx")] + segmentations_dict["segger_n1"] = adata[cells_n1, :] + segmentations_dict["segger_n0"] = adata[cells_n0, :] segmentations_dict[method] = adata return segmentations_dict - def plot_cell_counts(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str]) -> None: """Plot the number of cells per segmentation method and save the cell count data as a CSV. @@ -600,33 +574,37 @@ def plot_cell_counts(segmentations_dict: Dict[str, sc.AnnData], output_path: Pat """ # Calculate the number of cells in each segmentation method cell_counts = {method: seg.n_obs for method, seg in segmentations_dict.items()} - + # Create a DataFrame for the bar plot - df = pd.DataFrame(cell_counts, index=['Number of Cells']).T - + df = pd.DataFrame(cell_counts, index=["Number of Cells"]).T + # Save the DataFrame to CSV - df.to_csv(output_path / 'cell_counts_data.csv', index=True) - + df.to_csv(output_path / "cell_counts_data.csv", index=True) + # Generate the bar plot - ax = df.plot(kind='bar', stacked=False, color=[palette.get(key, '#333333') for key in df.index], figsize=(3, 6), width=0.9) - + ax = df.plot( + kind="bar", stacked=False, color=[palette.get(key, "#333333") for key in df.index], figsize=(3, 6), width=0.9 + ) + # Add a dashed line for the 10X baseline - if '10X' in cell_counts: - baseline_height = cell_counts['10X'] - ax.axhline(y=baseline_height, color='gray', linestyle='--', linewidth=1.5, label='10X Baseline') - + if "10X" in cell_counts: + baseline_height = cell_counts["10X"] + ax.axhline(y=baseline_height, color="gray", linestyle="--", linewidth=1.5, label="10X Baseline") + # Set plot titles and labels - plt.title('Number of Cells per Segmentation Method') - plt.xlabel('Segmentation Method') - plt.ylabel('Number of Cells') - plt.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left') - + plt.title("Number of Cells per Segmentation Method") + plt.xlabel("Segmentation Method") + plt.ylabel("Number of Cells") + plt.legend(title="", bbox_to_anchor=(1.05, 1), loc="upper left") + # Save the figure as a PDF - plt.savefig(output_path / 'cell_counts_bar_plot.pdf', bbox_inches='tight') + plt.savefig(output_path / "cell_counts_bar_plot.pdf", bbox_inches="tight") plt.show() -def plot_percent_assigned(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str]) -> None: +def plot_percent_assigned( + segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str] +) -> None: """Plot the percentage of assigned transcripts (normalized) for each segmentation method. 
Args: @@ -646,43 +624,38 @@ def plot_percent_assigned(segmentations_dict: Dict[str, sc.AnnData], output_path percent_assigned_normalized = total_counts_per_gene.divide(max_counts_per_gene, axis=0) * 100 # Prepare the data for the violin plot - violin_data = pd.DataFrame({ - 'Segmentation Method': [], - 'Percent Assigned (Normalized)': [] - }) - - + violin_data = pd.DataFrame({"Segmentation Method": [], "Percent Assigned (Normalized)": []}) # Add normalized percent_assigned data for each method for method in segmentations_dict.keys(): method_data = percent_assigned_normalized[method].dropna() - method_df = pd.DataFrame({ - 'Segmentation Method': [method] * len(method_data), - 'Percent Assigned (Normalized)': method_data.values - }) + method_df = pd.DataFrame( + {"Segmentation Method": [method] * len(method_data), "Percent Assigned (Normalized)": method_data.values} + ) violin_data = pd.concat([violin_data, method_df], axis=0) - - violin_data.to_csv(output_path / 'percent_assigned_normalized.csv', index=True) + + violin_data.to_csv(output_path / "percent_assigned_normalized.csv", index=True) # Plot the violin plots plt.figure(figsize=(12, 8)) - ax = sns.violinplot(x='Segmentation Method', y='Percent Assigned (Normalized)', data=violin_data, palette=palette) + ax = sns.violinplot(x="Segmentation Method", y="Percent Assigned (Normalized)", data=violin_data, palette=palette) # Add a dashed line for the 10X baseline - if '10X' in segmentations_dict: - baseline_height = percent_assigned_normalized['10X'].mean() - ax.axhline(y=baseline_height, color='gray', linestyle='--', linewidth=1.5, label='10X Baseline') + if "10X" in segmentations_dict: + baseline_height = percent_assigned_normalized["10X"].mean() + ax.axhline(y=baseline_height, color="gray", linestyle="--", linewidth=1.5, label="10X Baseline") # Set plot titles and labels - plt.title('') - plt.xlabel('Segmentation Method') - plt.ylabel('Percent Assigned (Normalized)') - plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + plt.title("") + plt.xlabel("Segmentation Method") + plt.ylabel("Percent Assigned (Normalized)") + plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left") # Save the figure as a PDF - plt.savefig(output_path / 'percent_assigned_normalized_violin_plot.pdf', bbox_inches='tight') + plt.savefig(output_path / "percent_assigned_normalized_violin_plot.pdf", bbox_inches="tight") plt.show() + def plot_gene_counts(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str]) -> None: """Plot the normalized gene counts for each segmentation method. 
@@ -703,40 +676,37 @@ def plot_gene_counts(segmentations_dict: Dict[str, sc.AnnData], output_path: Pat normalized_counts_per_gene = total_counts_per_gene.divide(max_counts_per_gene, axis=0) # Prepare the data for the box plot - boxplot_data = pd.DataFrame({ - 'Segmentation Method': [], - 'Normalized Counts': [] - }) + boxplot_data = pd.DataFrame({"Segmentation Method": [], "Normalized Counts": []}) for method in segmentations_dict.keys(): method_counts = normalized_counts_per_gene[method] - method_df = pd.DataFrame({ - 'Segmentation Method': [method] * len(method_counts), - 'Normalized Counts': method_counts.values - }) + method_df = pd.DataFrame( + {"Segmentation Method": [method] * len(method_counts), "Normalized Counts": method_counts.values} + ) boxplot_data = pd.concat([boxplot_data, method_df], axis=0) - - boxplot_data.to_csv(output_path / 'gene_counts_normalized_data.csv', index=True) + + boxplot_data.to_csv(output_path / "gene_counts_normalized_data.csv", index=True) # Plot the box plots plt.figure(figsize=(3, 6)) - ax = sns.boxplot(x='Segmentation Method', y='Normalized Counts', data=boxplot_data, palette=palette, width=0.9) + ax = sns.boxplot(x="Segmentation Method", y="Normalized Counts", data=boxplot_data, palette=palette, width=0.9) # Add a dashed line for the 10X baseline - if '10X' in normalized_counts_per_gene: - baseline_height = normalized_counts_per_gene['10X'].mean() - plt.axhline(y=baseline_height, color='gray', linestyle='--', linewidth=1.5, label='10X Baseline') + if "10X" in normalized_counts_per_gene: + baseline_height = normalized_counts_per_gene["10X"].mean() + plt.axhline(y=baseline_height, color="gray", linestyle="--", linewidth=1.5, label="10X Baseline") # Set plot titles and labels - plt.title('') - plt.xlabel('Segmentation Method') - plt.ylabel('Normalized Counts') + plt.title("") + plt.xlabel("Segmentation Method") + plt.ylabel("Normalized Counts") plt.xticks(rotation=0) # Save the figure as a PDF - plt.savefig(output_path / 'gene_counts_normalized_boxplot_by_method.pdf', bbox_inches='tight') + plt.savefig(output_path / "gene_counts_normalized_boxplot_by_method.pdf", bbox_inches="tight") plt.show() + def plot_counts_per_cell(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str]) -> None: """Plot the counts per cell (log2) for each segmentation method. @@ -745,36 +715,33 @@ def plot_counts_per_cell(segmentations_dict: Dict[str, sc.AnnData], output_path: output_path (Path): Path to the directory where the plot will be saved. 
""" # Prepare the data for the violin plot - violin_data = pd.DataFrame({ - 'Segmentation Method': [], - 'Counts per Cell (log2)': [] - }) + violin_data = pd.DataFrame({"Segmentation Method": [], "Counts per Cell (log2)": []}) for method, adata in segmentations_dict.items(): - method_counts = adata.obs['transcripts'] + 1 - method_df = pd.DataFrame({ - 'Segmentation Method': [method] * len(method_counts), - 'Counts per Cell (log2)': method_counts.values - }) + method_counts = adata.obs["transcripts"] + 1 + method_df = pd.DataFrame( + {"Segmentation Method": [method] * len(method_counts), "Counts per Cell (log2)": method_counts.values} + ) violin_data = pd.concat([violin_data, method_df], axis=0) - - violin_data.to_csv(output_path / 'counts_per_cell_data.csv', index=True) + + violin_data.to_csv(output_path / "counts_per_cell_data.csv", index=True) # Plot the violin plots plt.figure(figsize=(4, 6)) - ax = sns.violinplot(x='Segmentation Method', y='Counts per Cell (log2)', data=violin_data, palette=palette) + ax = sns.violinplot(x="Segmentation Method", y="Counts per Cell (log2)", data=violin_data, palette=palette) ax.set(ylim=(5, 300)) # Add a dashed line for the 10X-nucleus median - if '10X-nucleus' in segmentations_dict: - median_10X_nucleus = np.median(segmentations_dict['10X-nucleus'].obs['transcripts'] + 1) - ax.axhline(y=median_10X_nucleus, color='gray', linestyle='--', linewidth=1.5, label='10X-nucleus Median') + if "10X-nucleus" in segmentations_dict: + median_10X_nucleus = np.median(segmentations_dict["10X-nucleus"].obs["transcripts"] + 1) + ax.axhline(y=median_10X_nucleus, color="gray", linestyle="--", linewidth=1.5, label="10X-nucleus Median") # Set plot titles and labels - plt.title('') - plt.xlabel('Segmentation Method') - plt.ylabel('Counts per Cell (log2)') + plt.title("") + plt.xlabel("Segmentation Method") + plt.ylabel("Counts per Cell (log2)") plt.xticks(rotation=0) # Save the figure as a PDF - plt.savefig(output_path / 'counts_per_cell_violin_plot.pdf', bbox_inches='tight') + plt.savefig(output_path / "counts_per_cell_violin_plot.pdf", bbox_inches="tight") plt.show() + def plot_cell_area(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str]) -> None: """Plot the cell area (log2) for each segmentation method. @@ -783,37 +750,36 @@ def plot_cell_area(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, output_path (Path): Path to the directory where the plot will be saved. 
""" # Prepare the data for the violin plot - violin_data = pd.DataFrame({ - 'Segmentation Method': [], - 'Cell Area (log2)': [] - }) + violin_data = pd.DataFrame({"Segmentation Method": [], "Cell Area (log2)": []}) for method in segmentations_dict.keys(): - if 'cell_area' in segmentations_dict[method].obs.columns: - method_area = segmentations_dict[method].obs['cell_area'] + 1 - method_df = pd.DataFrame({ - 'Segmentation Method': [method] * len(method_area), - 'Cell Area (log2)': method_area.values - }) + if "cell_area" in segmentations_dict[method].obs.columns: + method_area = segmentations_dict[method].obs["cell_area"] + 1 + method_df = pd.DataFrame( + {"Segmentation Method": [method] * len(method_area), "Cell Area (log2)": method_area.values} + ) violin_data = pd.concat([violin_data, method_df], axis=0) - violin_data.to_csv(output_path / 'cell_area_log2_data.csv', index=True) + violin_data.to_csv(output_path / "cell_area_log2_data.csv", index=True) # Plot the violin plots plt.figure(figsize=(4, 6)) - ax = sns.violinplot(x='Segmentation Method', y='Cell Area (log2)', data=violin_data, palette=palette) + ax = sns.violinplot(x="Segmentation Method", y="Cell Area (log2)", data=violin_data, palette=palette) ax.set(ylim=(5, 100)) # Add a dashed line for the 10X-nucleus median - if '10X-nucleus' in segmentations_dict: - median_10X_nucleus_area = np.median(segmentations_dict['10X-nucleus'].obs['cell_area'] + 1) - ax.axhline(y=median_10X_nucleus_area, color='gray', linestyle='--', linewidth=1.5, label='10X-nucleus Median') + if "10X-nucleus" in segmentations_dict: + median_10X_nucleus_area = np.median(segmentations_dict["10X-nucleus"].obs["cell_area"] + 1) + ax.axhline(y=median_10X_nucleus_area, color="gray", linestyle="--", linewidth=1.5, label="10X-nucleus Median") # Set plot titles and labels - plt.title('') - plt.xlabel('Segmentation Method') - plt.ylabel('Cell Area (log2)') + plt.title("") + plt.xlabel("Segmentation Method") + plt.ylabel("Cell Area (log2)") plt.xticks(rotation=0) # Save the figure as a PDF - plt.savefig(output_path / 'cell_area_log2_violin_plot.pdf', bbox_inches='tight') + plt.savefig(output_path / "cell_area_log2_violin_plot.pdf", bbox_inches="tight") plt.show() -def plot_transcript_density(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str]) -> None: + +def plot_transcript_density( + segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str] +) -> None: """Plot the transcript density (log2) for each segmentation method. Args: @@ -821,43 +787,53 @@ def plot_transcript_density(segmentations_dict: Dict[str, sc.AnnData], output_pa output_path (Path): Path to the directory where the plot will be saved. 
""" # Prepare the data for the violin plot - violin_data = pd.DataFrame({ - 'Segmentation Method': [], - 'Transcript Density (log2)': [] - }) + violin_data = pd.DataFrame({"Segmentation Method": [], "Transcript Density (log2)": []}) for method in segmentations_dict.keys(): - if 'cell_area' in segmentations_dict[method].obs.columns: - method_density = segmentations_dict[method].obs['transcripts'] / segmentations_dict[method].obs['cell_area'] + if "cell_area" in segmentations_dict[method].obs.columns: + method_density = segmentations_dict[method].obs["transcripts"] / segmentations_dict[method].obs["cell_area"] method_density_log2 = np.log2(method_density + 1) - method_df = pd.DataFrame({ - 'Segmentation Method': [method] * len(method_density_log2), - 'Transcript Density (log2)': method_density_log2.values - }) + method_df = pd.DataFrame( + { + "Segmentation Method": [method] * len(method_density_log2), + "Transcript Density (log2)": method_density_log2.values, + } + ) violin_data = pd.concat([violin_data, method_df], axis=0) - - violin_data.to_csv(output_path / 'transcript_density_log2_data.csv', index=True) + + violin_data.to_csv(output_path / "transcript_density_log2_data.csv", index=True) # Plot the violin plots plt.figure(figsize=(4, 6)) - ax = sns.violinplot(x='Segmentation Method', y='Transcript Density (log2)', data=violin_data, palette=palette) + ax = sns.violinplot(x="Segmentation Method", y="Transcript Density (log2)", data=violin_data, palette=palette) # Add a dashed line for the 10X-nucleus median - if '10X-nucleus' in segmentations_dict: - median_10X_nucleus_density_log2 = np.median(np.log2(segmentations_dict['10X-nucleus'].obs['transcripts'] / segmentations_dict['10X-nucleus'].obs['cell_area'] + 1)) - ax.axhline(y=median_10X_nucleus_density_log2, color='gray', linestyle='--', linewidth=1.5, label='10X-nucleus Median') + if "10X-nucleus" in segmentations_dict: + median_10X_nucleus_density_log2 = np.median( + np.log2( + segmentations_dict["10X-nucleus"].obs["transcripts"] + / segmentations_dict["10X-nucleus"].obs["cell_area"] + + 1 + ) + ) + ax.axhline( + y=median_10X_nucleus_density_log2, color="gray", linestyle="--", linewidth=1.5, label="10X-nucleus Median" + ) # Set plot titles and labels - plt.title('') - plt.xlabel('Segmentation Method') - plt.ylabel('Transcript Density (log2)') + plt.title("") + plt.xlabel("Segmentation Method") + plt.ylabel("Transcript Density (log2)") plt.xticks(rotation=0) # Save the figure as a PDF - plt.savefig(output_path / 'transcript_density_log2_violin_plot.pdf', bbox_inches='tight') + plt.savefig(output_path / "transcript_density_log2_violin_plot.pdf", bbox_inches="tight") plt.show() -def plot_general_statistics_plots(segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str]) -> None: + +def plot_general_statistics_plots( + segmentations_dict: Dict[str, sc.AnnData], output_path: Path, palette: Dict[str, str] +) -> None: """Create a summary plot with all the general statistics subplots. 
Args: @@ -884,11 +860,13 @@ def plot_general_statistics_plots(segmentations_dict: Dict[str, sc.AnnData], out plot_transcript_density(segmentations_dict, output_path, palette=palette) plt.tight_layout() - plt.savefig(output_path / 'general_statistics_plots.pdf', bbox_inches='tight') + plt.savefig(output_path / "general_statistics_plots.pdf", bbox_inches="tight") plt.show() -def plot_mecr_results(mecr_results: Dict[str, Dict[Tuple[str, str], float]], output_path: Path, palette: Dict[str, str]) -> None: +def plot_mecr_results( + mecr_results: Dict[str, Dict[Tuple[str, str], float]], output_path: Path, palette: Dict[str, str] +) -> None: """Plot the MECR (Mutually Exclusive Co-expression Rate) results for each segmentation method. Args: @@ -900,26 +878,25 @@ def plot_mecr_results(mecr_results: Dict[str, Dict[Tuple[str, str], float]], out plot_data = [] for method, mecr_dict in mecr_results.items(): for gene_pair, mecr_value in mecr_dict.items(): - plot_data.append({ - 'Segmentation Method': method, - 'Gene Pair': f"{gene_pair[0]} - {gene_pair[1]}", - 'MECR': mecr_value - }) + plot_data.append( + {"Segmentation Method": method, "Gene Pair": f"{gene_pair[0]} - {gene_pair[1]}", "MECR": mecr_value} + ) df = pd.DataFrame(plot_data) - df.to_csv(output_path / 'mcer_box.csv', index=True) + df.to_csv(output_path / "mcer_box.csv", index=True) plt.figure(figsize=(3, 6)) - sns.boxplot(x='Segmentation Method', y='MECR', data=df, palette=palette) - plt.title('Mutually Exclusive Co-expression Rate (MECR)') - plt.xlabel('Segmentation Method') - plt.ylabel('MECR') - plt.xticks(rotation=45, ha='right') + sns.boxplot(x="Segmentation Method", y="MECR", data=df, palette=palette) + plt.title("Mutually Exclusive Co-expression Rate (MECR)") + plt.xlabel("Segmentation Method") + plt.ylabel("MECR") + plt.xticks(rotation=45, ha="right") plt.tight_layout() - plt.savefig(output_path / 'mecr_results_boxplot.pdf', bbox_inches='tight') + plt.savefig(output_path / "mecr_results_boxplot.pdf", bbox_inches="tight") plt.show() - -def plot_quantized_mecr_counts(quantized_mecr_counts: Dict[str, pd.DataFrame], output_path: Path, palette: Dict[str, str]) -> None: +def plot_quantized_mecr_counts( + quantized_mecr_counts: Dict[str, pd.DataFrame], output_path: Path, palette: Dict[str, str] +) -> None: """Plot the quantized MECR values against transcript counts for each segmentation method, with point size proportional to the variance of MECR. Args: @@ -927,38 +904,40 @@ def plot_quantized_mecr_counts(quantized_mecr_counts: Dict[str, pd.DataFrame], o output_path (Path): Path to the directory where the plot will be saved. palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. 
""" - quantized_mecr_counts.to_csv(output_path / 'quantized_mecr_counts.csv', index=True) + quantized_mecr_counts.to_csv(output_path / "quantized_mecr_counts.csv", index=True) plt.figure(figsize=(9, 6)) for method, df in quantized_mecr_counts.items(): plt.plot( - df['average_counts'], - df['average_mecr'], - marker='o', - linestyle='-', - color=palette.get(method, '#333333'), + df["average_counts"], + df["average_mecr"], + marker="o", + linestyle="-", + color=palette.get(method, "#333333"), label=method, - markersize=0 # No markers, only lines + markersize=0, # No markers, only lines ) plt.scatter( - df['average_counts'], - df['average_mecr'], - s=df['variance_mecr'] * 1e5, # Size of points based on the variance of MECR - color=palette.get(method, '#333333'), + df["average_counts"], + df["average_mecr"], + s=df["variance_mecr"] * 1e5, # Size of points based on the variance of MECR + color=palette.get(method, "#333333"), alpha=0.7, # Slight transparency for overlapping points - edgecolor='w', # White edge color for better visibility - linewidth=0.5 # Thin edge line + edgecolor="w", # White edge color for better visibility + linewidth=0.5, # Thin edge line ) - plt.title('Quantized MECR by Transcript Counts') - plt.xlabel('Average Transcript Counts') - plt.ylabel('Average MECR') + plt.title("Quantized MECR by Transcript Counts") + plt.xlabel("Average Transcript Counts") + plt.ylabel("Average MECR") # Place the legend outside the plot on the top right - plt.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left') + plt.legend(title="", bbox_to_anchor=(1.05, 1), loc="upper left") plt.tight_layout() - plt.savefig(output_path / 'quantized_mecr_counts_plot.pdf', bbox_inches='tight') + plt.savefig(output_path / "quantized_mecr_counts_plot.pdf", bbox_inches="tight") plt.show() - - -def plot_quantized_mecr_area(quantized_mecr_area: Dict[str, pd.DataFrame], output_path: Path, palette: Dict[str, str]) -> None: + + +def plot_quantized_mecr_area( + quantized_mecr_area: Dict[str, pd.DataFrame], output_path: Path, palette: Dict[str, str] +) -> None: """Plot the quantized MECR values against cell areas for each segmentation method, with point size proportional to the variance of MECR. Args: @@ -966,40 +945,41 @@ def plot_quantized_mecr_area(quantized_mecr_area: Dict[str, pd.DataFrame], outpu output_path (Path): Path to the directory where the plot will be saved. palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. 
""" - quantized_mecr_area.to_csv(output_path / 'quantized_mecr_area.csv', index=True) + quantized_mecr_area.to_csv(output_path / "quantized_mecr_area.csv", index=True) plt.figure(figsize=(6, 4)) for method, df in quantized_mecr_area.items(): plt.plot( - df['average_area'], - df['average_mecr'], - marker='o', + df["average_area"], + df["average_mecr"], + marker="o", # s=df['variance_mecr'] * 1e5, - linestyle='-', - color=palette.get(method, '#333333'), + linestyle="-", + color=palette.get(method, "#333333"), label=method, - markersize=0 + markersize=0, ) plt.scatter( - df['average_area'], - df['average_mecr'], - s=df['variance_mecr'] * 1e5, # Size of points based on the variance of MECR - color=palette.get(method, '#333333'), + df["average_area"], + df["average_mecr"], + s=df["variance_mecr"] * 1e5, # Size of points based on the variance of MECR + color=palette.get(method, "#333333"), alpha=0.7, # Slight transparency for overlapping points - edgecolor='w', # White edge color for better visibility - linewidth=0.5 # Thin edge line + edgecolor="w", # White edge color for better visibility + linewidth=0.5, # Thin edge line ) - plt.title('Quantized MECR by Cell Area') - plt.xlabel('Average Cell Area') - plt.ylabel('Average MECR') + plt.title("Quantized MECR by Cell Area") + plt.xlabel("Average Cell Area") + plt.ylabel("Average MECR") # Place the legend outside the plot on the top right - plt.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left') + plt.legend(title="", bbox_to_anchor=(1.05, 1), loc="upper left") plt.tight_layout() - plt.savefig(output_path / 'quantized_mecr_area_plot.pdf', bbox_inches='tight') + plt.savefig(output_path / "quantized_mecr_area_plot.pdf", bbox_inches="tight") plt.show() - -def plot_contamination_results(contamination_results: Dict[str, pd.DataFrame], output_path: Path, palette: Dict[str, str]) -> None: +def plot_contamination_results( + contamination_results: Dict[str, pd.DataFrame], output_path: Path, palette: Dict[str, str] +) -> None: """Plot contamination results for each segmentation method. Args: @@ -1007,18 +987,18 @@ def plot_contamination_results(contamination_results: Dict[str, pd.DataFrame], o output_path (Path): Path to the directory where the plot will be saved. palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. """ - contamination_results.to_csv(output_path / 'contamination_results.csv', index=True) + contamination_results.to_csv(output_path / "contamination_results.csv", index=True) for method, df in contamination_results.items(): plt.figure(figsize=(10, 6)) - sns.heatmap(df, annot=True, cmap='coolwarm', linewidths=0.5) - plt.title(f'Contamination Matrix for {method}') - plt.xlabel('Target Cell Type') - plt.ylabel('Source Cell Type') + sns.heatmap(df, annot=True, cmap="coolwarm", linewidths=0.5) + plt.title(f"Contamination Matrix for {method}") + plt.xlabel("Target Cell Type") + plt.ylabel("Source Cell Type") plt.tight_layout() - plt.savefig(output_path / f'{method}_contamination_matrix.pdf', bbox_inches='tight') + plt.savefig(output_path / f"{method}_contamination_matrix.pdf", bbox_inches="tight") plt.show() - - + + def plot_contamination_boxplots(boxplot_data: pd.DataFrame, output_path: Path, palette: Dict[str, str]) -> None: """Plot boxplots for contamination values across different segmentation methods. @@ -1027,31 +1007,25 @@ def plot_contamination_boxplots(boxplot_data: pd.DataFrame, output_path: Path, p output_path (Path): Path to the directory where the plot will be saved. 
palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. """ - boxplot_data.to_csv(output_path / 'contamination_box_results.csv', index=True) + boxplot_data.to_csv(output_path / "contamination_box_results.csv", index=True) plt.figure(figsize=(14, 8)) - sns.boxplot( - x='Source Cell Type', - y='Contamination', - hue='Segmentation Method', - data=boxplot_data, - palette=palette - ) - plt.title('Neighborhood Contamination') - plt.xlabel('Source Cell Type') - plt.ylabel('Contamination') - plt.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left') - plt.xticks(rotation=45, ha='right') - + sns.boxplot(x="Source Cell Type", y="Contamination", hue="Segmentation Method", data=boxplot_data, palette=palette) + plt.title("Neighborhood Contamination") + plt.xlabel("Source Cell Type") + plt.ylabel("Contamination") + plt.legend(title="", bbox_to_anchor=(1.05, 1), loc="upper left") + plt.xticks(rotation=45, ha="right") + plt.tight_layout() - plt.savefig(output_path / 'contamination_boxplots.pdf', bbox_inches='tight') + plt.savefig(output_path / "contamination_boxplots.pdf", bbox_inches="tight") plt.show() - - + + def plot_umaps_with_scores( - segmentations_dict: Dict[str, sc.AnnData], - clustering_scores: Dict[str, Tuple[float, float]], - output_path: Path, - palette: Dict[str, str] + segmentations_dict: Dict[str, sc.AnnData], + clustering_scores: Dict[str, Tuple[float, float]], + output_path: Path, + palette: Dict[str, str], ) -> None: """Plot UMAPs colored by cell type for each segmentation method and display clustering scores in the title. Args: @@ -1069,17 +1043,15 @@ def plot_umaps_with_scores( plt.figure(figsize=(8, 6)) sc.pp.neighbors(adata_copy, n_neighbors=5) sc.tl.umap(adata_copy, spread=5) - sc.pl.umap(adata_copy, color='celltype_major', palette=palette, show=False) + sc.pl.umap(adata_copy, color="celltype_major", palette=palette, show=False) # Add clustering scores to the title - ch_score, sh_score = compute_clustering_scores(adata_copy, cell_type_column='celltype_major') + ch_score, sh_score = compute_clustering_scores(adata_copy, cell_type_column="celltype_major") plt.title(f"{method} - UMAP\nCalinski-Harabasz: {ch_score:.2f}, Silhouette: {sh_score:.2f}") # Save the figure - plt.savefig(output_path / f'{method}_umap_with_scores.pdf', bbox_inches='tight') + plt.savefig(output_path / f"{method}_umap_with_scores.pdf", bbox_inches="tight") plt.show() - - def plot_entropy_boxplots(entropy_boxplot_data: pd.DataFrame, output_path: Path, palette: Dict[str, str]) -> None: """Plot boxplots for neighborhood entropy across different segmentation methods by cell type. 
@@ -1090,45 +1062,37 @@ def plot_entropy_boxplots(entropy_boxplot_data: pd.DataFrame, output_path: Path, """ plt.figure(figsize=(14, 8)) sns.boxplot( - x='Cell Type', - y='Neighborhood Entropy', - hue='Segmentation Method', - data=entropy_boxplot_data, - palette=palette + x="Cell Type", y="Neighborhood Entropy", hue="Segmentation Method", data=entropy_boxplot_data, palette=palette ) - plt.title('Neighborhood Entropy') - plt.xlabel('Cell Type') - plt.ylabel('Neighborhood Entropy') - plt.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left') - plt.xticks(rotation=45, ha='right') + plt.title("Neighborhood Entropy") + plt.xlabel("Cell Type") + plt.ylabel("Neighborhood Entropy") + plt.legend(title="", bbox_to_anchor=(1.05, 1), loc="upper left") + plt.xticks(rotation=45, ha="right") plt.tight_layout() - plt.savefig(output_path / 'neighborhood_entropy_boxplots.pdf', bbox_inches='tight') + plt.savefig(output_path / "neighborhood_entropy_boxplots.pdf", bbox_inches="tight") plt.show() - - -def plot_sensitivity_boxplots(sensitivity_boxplot_data: pd.DataFrame, output_path: Path, palette: Dict[str, str]) -> None: +def plot_sensitivity_boxplots( + sensitivity_boxplot_data: pd.DataFrame, output_path: Path, palette: Dict[str, str] +) -> None: """Plot boxplots for sensitivity across different segmentation methods by cell type. Args: sensitivity_boxplot_data (pd.DataFrame): DataFrame containing sensitivity data for all segmentation methods. output_path (Path): Path to the directory where the plot will be saved. palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. """ - sensitivity_boxplot_data.to_csv(output_path / 'sensitivity_results.csv', index=True) + sensitivity_boxplot_data.to_csv(output_path / "sensitivity_results.csv", index=True) plt.figure(figsize=(14, 8)) sns.boxplot( - x='Cell Type', - y='Sensitivity', - hue='Segmentation Method', - data=sensitivity_boxplot_data, - palette=palette + x="Cell Type", y="Sensitivity", hue="Segmentation Method", data=sensitivity_boxplot_data, palette=palette ) - plt.title('Sensitivity Score') - plt.xlabel('Cell Type') - plt.ylabel('Sensitivity') - plt.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left') - plt.xticks(rotation=45, ha='right') + plt.title("Sensitivity Score") + plt.xlabel("Cell Type") + plt.ylabel("Sensitivity") + plt.legend(title="", bbox_to_anchor=(1.05, 1), loc="upper left") + plt.xticks(rotation=45, ha="right") plt.tight_layout() - plt.savefig(output_path / 'sensitivity_boxplots.pdf', bbox_inches='tight') - plt.show() \ No newline at end of file + plt.savefig(output_path / "sensitivity_boxplots.pdf", bbox_inches="tight") + plt.show() diff --git a/src/segger/validation/xenium_explorer.py b/src/segger/validation/xenium_explorer.py index d4edcd9..6ad5fc9 100644 --- a/src/segger/validation/xenium_explorer.py +++ b/src/segger/validation/xenium_explorer.py @@ -10,7 +10,6 @@ from typing import Dict, Any, Optional, List, Tuple - def str_to_uint32(cell_id_str: str) -> Tuple[int, int]: """Convert a string cell ID back to uint32 format. @@ -20,18 +19,31 @@ def str_to_uint32(cell_id_str: str) -> Tuple[int, int]: Returns: Tuple[int, int]: The cell ID in uint32 format and the dataset suffix. 
""" - prefix, suffix = cell_id_str.split('-') + prefix, suffix = cell_id_str.split("-") str_to_hex_mapping = { - 'a': '0', 'b': '1', 'c': '2', 'd': '3', - 'e': '4', 'f': '5', 'g': '6', 'h': '7', - 'i': '8', 'j': '9', 'k': 'a', 'l': 'b', - 'm': 'c', 'n': 'd', 'o': 'e', 'p': 'f' + "a": "0", + "b": "1", + "c": "2", + "d": "3", + "e": "4", + "f": "5", + "g": "6", + "h": "7", + "i": "8", + "j": "9", + "k": "a", + "l": "b", + "m": "c", + "n": "d", + "o": "e", + "p": "f", } - hex_prefix = ''.join([str_to_hex_mapping[char] for char in prefix]) + hex_prefix = "".join([str_to_hex_mapping[char] for char in prefix]) cell_id_uint32 = int(hex_prefix, 16) dataset_suffix = int(suffix) return cell_id_uint32, dataset_suffix + def get_indices_indptr(input_array: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Get the indices and indptr arrays for sparse matrix representation. @@ -47,13 +59,14 @@ def get_indices_indptr(input_array: np.ndarray) -> Tuple[np.ndarray, np.ndarray] for cluster in clusters: cluster_indices = np.where(input_array == cluster)[0] - indptr[cluster-1] = len(indices) + indptr[cluster - 1] = len(indices) indices.extend(cluster_indices) indices.extend(-np.zeros(len(input_array[input_array == 0]))) indices = np.array(indices, dtype=np.int32).astype(np.uint32) return indices, indptr + def save_cell_clustering(merged: pd.DataFrame, zarr_path: str, columns: List[str]) -> None: """Save cell clustering information to a Zarr file. @@ -64,35 +77,38 @@ def save_cell_clustering(merged: pd.DataFrame, zarr_path: str, columns: List[str """ import zarr - new_zarr = zarr.open(zarr_path, mode='w') - new_zarr.create_group('/cell_groups') + new_zarr = zarr.open(zarr_path, mode="w") + new_zarr.create_group("/cell_groups") mappings = [] for index, column in enumerate(columns): - new_zarr['cell_groups'].create_group(index) + new_zarr["cell_groups"].create_group(index) classes = list(np.unique(merged[column].astype(str))) - mapping_dict = {key: i for i, key in zip(range(1, len(classes)), [k for k in classes if k != 'nan'])} - mapping_dict['nan'] = 0 + mapping_dict = {key: i for i, key in zip(range(1, len(classes)), [k for k in classes if k != "nan"])} + mapping_dict["nan"] = 0 clusters = merged[column].astype(str).replace(mapping_dict).values.astype(int) indices, indptr = get_indices_indptr(clusters) - new_zarr['cell_groups'][index].create_dataset('indices', data=indices) - new_zarr['cell_groups'][index].create_dataset('indptr', data=indptr) + new_zarr["cell_groups"][index].create_dataset("indices", data=indices) + new_zarr["cell_groups"][index].create_dataset("indptr", data=indptr) mappings.append(mapping_dict) - new_zarr['cell_groups'].attrs.update({ - "major_version": 1, - "minor_version": 0, - "number_groupings": len(columns), - "grouping_names": columns, - "group_names": [ - [k for k, v in sorted(mapping_dict.items(), key=lambda item: item[1])][1:] for mapping_dict in mappings - ] - }) + new_zarr["cell_groups"].attrs.update( + { + "major_version": 1, + "minor_version": 0, + "number_groupings": len(columns), + "grouping_names": columns, + "group_names": [ + [k for k, v in sorted(mapping_dict.items(), key=lambda item: item[1])][1:] for mapping_dict in mappings + ], + } + ) new_zarr.store.close() -def draw_umap(adata, column: str = 'leiden') -> None: + +def draw_umap(adata, column: str = "leiden") -> None: """Draw UMAP plots for the given AnnData object. 
Args: @@ -102,12 +118,13 @@ def draw_umap(adata, column: str = 'leiden') -> None: sc.pl.umap(adata, color=[column]) plt.show() - sc.pl.umap(adata, color=['KRT5', 'KRT7'], vmax='p95') + sc.pl.umap(adata, color=["KRT5", "KRT7"], vmax="p95") plt.show() - sc.pl.umap(adata, color=['ACTA2', 'PTPRC'], vmax='p95') + sc.pl.umap(adata, color=["ACTA2", "PTPRC"], vmax="p95") plt.show() + def get_leiden_umap(adata, draw: bool = False): """Perform Leiden clustering and UMAP visualization on the given AnnData object. @@ -123,12 +140,9 @@ def get_leiden_umap(adata, draw: bool = False): gene_names = adata.var_names mean_expression_values = adata.X.mean(axis=0) - gene_mean_expression_df = pd.DataFrame({ - 'gene_name': gene_names, - 'mean_expression': mean_expression_values - }) - top_genes = gene_mean_expression_df.sort_values(by='mean_expression', ascending=False).head(30) - top_gene_names = top_genes['gene_name'].tolist() + gene_mean_expression_df = pd.DataFrame({"gene_name": gene_names, "mean_expression": mean_expression_values}) + top_genes = gene_mean_expression_df.sort_values(by="mean_expression", ascending=False).head(30) + top_gene_names = top_genes["gene_name"].tolist() sc.pp.normalize_total(adata) sc.pp.log1p(adata) @@ -137,11 +151,12 @@ def get_leiden_umap(adata, draw: bool = False): sc.tl.leiden(adata) if draw: - draw_umap(adata, 'leiden') + draw_umap(adata, "leiden") return adata -def get_median_expression_table(adata, column: str = 'leiden') -> pd.DataFrame: + +def get_median_expression_table(adata, column: str = "leiden") -> pd.DataFrame: """Get the median expression table for the given AnnData object. Args: @@ -151,7 +166,23 @@ def get_median_expression_table(adata, column: str = 'leiden') -> pd.DataFrame: Returns: pd.DataFrame: The median expression table. 
""" - top_genes = ['GATA3', 'ACTA2', 'KRT7', 'KRT8', 'KRT5', 'AQP1', 'SERPINA3', 'PTGDS', 'CXCR4', 'SFRP1', 'ENAH', 'MYH11', 'SVIL', 'KRT14', 'CD4'] + top_genes = [ + "GATA3", + "ACTA2", + "KRT7", + "KRT8", + "KRT5", + "AQP1", + "SERPINA3", + "PTGDS", + "CXCR4", + "SFRP1", + "ENAH", + "MYH11", + "SVIL", + "KRT14", + "CD4", + ] top_gene_indices = [adata.var_names.get_loc(gene) for gene in top_genes] clusters = adata.obs[column] @@ -160,26 +191,29 @@ def get_median_expression_table(adata, column: str = 'leiden') -> pd.DataFrame: for cluster in clusters.unique(): cluster_cells = adata[clusters == cluster].X cluster_expression = cluster_cells[:, top_gene_indices] - gene_medians = [pd.Series(cluster_expression[:, gene_idx]).median() for gene_idx in range(len(top_gene_indices))] - cluster_data[f'Cluster_{cluster}'] = gene_medians + gene_medians = [ + pd.Series(cluster_expression[:, gene_idx]).median() for gene_idx in range(len(top_gene_indices)) + ] + cluster_data[f"Cluster_{cluster}"] = gene_medians cluster_expression_df = pd.DataFrame(cluster_data, index=top_genes) sorted_columns = sorted(cluster_expression_df.columns.values, key=lambda x: int(x.split("_")[-1])) cluster_expression_df = cluster_expression_df[sorted_columns] - return cluster_expression_df.T.style.background_gradient(cmap='Greens') + return cluster_expression_df.T.style.background_gradient(cmap="Greens") + def seg2explorer( - seg_df: pd.DataFrame, - source_path: str, - output_dir: str, - cells_filename: str = 'seg_cells', - analysis_filename: str = "seg_analysis", + seg_df: pd.DataFrame, + source_path: str, + output_dir: str, + cells_filename: str = "seg_cells", + analysis_filename: str = "seg_analysis", xenium_filename: str = "seg_experiment.xenium", - analysis_df: Optional[pd.DataFrame] = None, - draw: bool = False, - cell_id_columns: str = 'seg_cell_id', + analysis_df: Optional[pd.DataFrame] = None, + draw: bool = False, + cell_id_columns: str = "seg_cell_id", area_low: float = 10, - area_high: float = 100 + area_high: float = 100, ) -> None: """Convert seg output to a format compatible with Xenium explorer. 
@@ -214,8 +248,8 @@ def seg2explorer( for cell_incremental_id, (seg_cell_id, seg_cell) in tqdm(enumerate(grouped_by), total=len(grouped_by)): if len(seg_cell) < 5: continue - - cell_convex_hull = ConvexHull(seg_cell[['x_location', 'y_location']]) + + cell_convex_hull = ConvexHull(seg_cell[["x_location", "y_location"]]) if cell_convex_hull.area > area_high: continue if cell_convex_hull.area < area_low: @@ -224,25 +258,31 @@ def seg2explorer( uint_cell_id = cell_incremental_id + 1 cell_id2old_id[uint_cell_id] = seg_cell_id - seg_nucleous = seg_cell[seg_cell['overlaps_nucleus'] == 1] + seg_nucleous = seg_cell[seg_cell["overlaps_nucleus"] == 1] if len(seg_nucleous) >= 3: - nucleus_convex_hull = ConvexHull(seg_nucleous[['x_location', 'y_location']]) + nucleus_convex_hull = ConvexHull(seg_nucleous[["x_location", "y_location"]]) cell_id.append(uint_cell_id) - cell_summary.append({ - "cell_centroid_x": seg_cell['x_location'].mean(), - "cell_centroid_y": seg_cell['y_location'].mean(), - "cell_area": cell_convex_hull.area, - "nucleus_centroid_x": seg_cell['x_location'].mean(), - "nucleus_centroid_y": seg_cell['y_location'].mean(), - "nucleus_area": cell_convex_hull.area, - "z_level": (seg_cell.z_location.mean() // 3).round(0) * 3 - }) - - polygon_num_vertices[0].append(len(cell_convex_hull.vertices)) + cell_summary.append( + { + "cell_centroid_x": seg_cell["x_location"].mean(), + "cell_centroid_y": seg_cell["y_location"].mean(), + "cell_area": cell_convex_hull.area, + "nucleus_centroid_x": seg_cell["x_location"].mean(), + "nucleus_centroid_y": seg_cell["y_location"].mean(), + "nucleus_area": cell_convex_hull.area, + "z_level": (seg_cell.z_location.mean() // 3).round(0) * 3, + } + ) + + polygon_num_vertices[0].append(len(cell_convex_hull.vertices)) polygon_num_vertices[1].append(len(nucleus_convex_hull.vertices) if len(seg_nucleous) >= 3 else 0) - polygon_vertices[0].append(seg_cell[['x_location', 'y_location']].values[cell_convex_hull.vertices]) - polygon_vertices[1].append(seg_nucleous[['x_location', 'y_location']].values[nucleus_convex_hull.vertices] if len(seg_nucleous) >= 3 else np.array([[], []]).T) + polygon_vertices[0].append(seg_cell[["x_location", "y_location"]].values[cell_convex_hull.vertices]) + polygon_vertices[1].append( + seg_nucleous[["x_location", "y_location"]].values[nucleus_convex_hull.vertices] + if len(seg_nucleous) >= 3 + else np.array([[], []]).T + ) seg_mask_value.append(cell_incremental_id + 1) cell_polygon_vertices = get_flatten_version(polygon_vertices[0], max_value=21) @@ -251,66 +291,80 @@ def seg2explorer( cells = { "cell_id": np.array([np.array(cell_id), np.ones(len(cell_id))], dtype=np.uint32).T, "cell_summary": pd.DataFrame(cell_summary).values.astype(np.float64), - "polygon_num_vertices": np.array([ - [min(x+1, x+1) for x in polygon_num_vertices[1]], - [min(x+1, x+1) for x in polygon_num_vertices[0]] - ], dtype=np.int32), + "polygon_num_vertices": np.array( + [ + [min(x + 1, x + 1) for x in polygon_num_vertices[1]], + [min(x + 1, x + 1) for x in polygon_num_vertices[0]], + ], + dtype=np.int32, + ), "polygon_vertices": np.array([nucl_polygon_vertices, cell_polygon_vertices]).astype(np.float32), - "seg_mask_value": np.array(seg_mask_value, dtype=np.int32) + "seg_mask_value": np.array(seg_mask_value, dtype=np.int32), } - - existing_store = zarr.open(source_path / 'cells.zarr.zip', mode='r') - new_store = zarr.open(storage / f'{cells_filename}.zarr.zip', mode='w') - - new_store['cell_id'] = cells['cell_id'] - new_store['polygon_num_vertices'] = 
cells['polygon_num_vertices'] - new_store['polygon_vertices'] = cells['polygon_vertices'] - new_store['seg_mask_value'] = cells['seg_mask_value'] - + + existing_store = zarr.open(source_path / "cells.zarr.zip", mode="r") + new_store = zarr.open(storage / f"{cells_filename}.zarr.zip", mode="w") + + new_store["cell_id"] = cells["cell_id"] + new_store["polygon_num_vertices"] = cells["polygon_num_vertices"] + new_store["polygon_vertices"] = cells["polygon_vertices"] + new_store["seg_mask_value"] = cells["seg_mask_value"] + new_store.attrs.update(existing_store.attrs) - new_store.attrs['number_cells'] = len(cells['cell_id']) + new_store.attrs["number_cells"] = len(cells["cell_id"]) new_store.store.close() - + if analysis_df is None: analysis_df = pd.DataFrame([cell_id2old_id[i] for i in cell_id], columns=[cell_id_columns]) - analysis_df['default'] = 'seg' - + analysis_df["default"] = "seg" + zarr_df = pd.DataFrame([cell_id2old_id[i] for i in cell_id], columns=[cell_id_columns]) - clustering_df = pd.merge(zarr_df, analysis_df, how='left', on=cell_id_columns) + clustering_df = pd.merge(zarr_df, analysis_df, how="left", on=cell_id_columns) clusters_names = [i for i in analysis_df.columns if i != cell_id_columns] - clusters_dict = {cluster: {j: i for i, j in zip(range(1, len(sorted(np.unique(clustering_df[cluster].dropna()))) + 1), sorted(np.unique(clustering_df[cluster].dropna())))} for cluster in clusters_names} + clusters_dict = { + cluster: { + j: i + for i, j in zip( + range(1, len(sorted(np.unique(clustering_df[cluster].dropna()))) + 1), + sorted(np.unique(clustering_df[cluster].dropna())), + ) + } + for cluster in clusters_names + } - new_zarr = zarr.open(storage / (analysis_filename + ".zarr.zip"), mode='w') - new_zarr.create_group('/cell_groups') + new_zarr = zarr.open(storage / (analysis_filename + ".zarr.zip"), mode="w") + new_zarr.create_group("/cell_groups") clusters = [[clusters_dict[cluster].get(x, 0) for x in list(clustering_df[cluster])] for cluster in clusters_names] for i in range(len(clusters)): - new_zarr['cell_groups'].create_group(i) + new_zarr["cell_groups"].create_group(i) indices, indptr = get_indices_indptr(np.array(clusters[i])) - new_zarr['cell_groups'][i].create_dataset('indices', data=indices) - new_zarr['cell_groups'][i].create_dataset('indptr', data=indptr) - - new_zarr['cell_groups'].attrs.update({ - "major_version": 1, - "minor_version": 0, - "number_groupings": len(clusters_names), - "grouping_names": clusters_names, - "group_names": [ - [x[0] for x in sorted(clusters_dict[cluster].items(), key=lambda x: x[1])] - for cluster in clusters_names - ] - }) + new_zarr["cell_groups"][i].create_dataset("indices", data=indices) + new_zarr["cell_groups"][i].create_dataset("indptr", data=indptr) + + new_zarr["cell_groups"].attrs.update( + { + "major_version": 1, + "minor_version": 0, + "number_groupings": len(clusters_names), + "grouping_names": clusters_names, + "group_names": [ + [x[0] for x in sorted(clusters_dict[cluster].items(), key=lambda x: x[1])] for cluster in clusters_names + ], + } + ) new_zarr.store.close() generate_experiment_file( - template_path=source_path / 'experiment.xenium', + template_path=source_path / "experiment.xenium", output_path=storage / xenium_filename, cells_name=cells_filename, - analysis_name=analysis_filename + analysis_name=analysis_filename, ) + def get_flatten_version(polygons: List[np.ndarray], max_value: int = 21) -> np.ndarray: """Get the flattened version of polygon vertices. 
@@ -326,23 +380,21 @@ def get_flatten_version(polygons: List[np.ndarray], max_value: int = 21) -> np.n for i, polygon in tqdm(enumerate(polygons), total=len(polygons)): num_points = len(polygon) if num_points == 0: - result[i] = np.zeros(n*2) + result[i] = np.zeros(n * 2) continue elif num_points < max_value: repeated_points = np.tile(polygon[0], (n - num_points, 1)) padded_polygon = np.concatenate((polygon, repeated_points), axis=0) else: padded_polygon = np.zeros((n, 2)) - padded_polygon[:min(num_points, n)] = polygon[:min(num_points, n)] + padded_polygon[: min(num_points, n)] = polygon[: min(num_points, n)] padded_polygon[-1] = polygon[0] result[i] = padded_polygon.flatten() return result + def generate_experiment_file( - template_path: str, - output_path: str, - cells_name: str = "seg_cells", - analysis_name: str = 'seg_analysis' + template_path: str, output_path: str, cells_name: str = "seg_cells", analysis_name: str = "seg_analysis" ) -> None: """Generate the experiment file for Xenium. @@ -357,12 +409,12 @@ def generate_experiment_file( with open(template_path) as f: experiment = json.load(f) - experiment['images'].pop('morphology_filepath') - experiment['images'].pop('morphology_focus_filepath') + experiment["images"].pop("morphology_filepath") + experiment["images"].pop("morphology_focus_filepath") - experiment['xenium_explorer_files']['cells_zarr_filepath'] = f"{cells_name}.zarr.zip" - experiment['xenium_explorer_files'].pop('cell_features_zarr_filepath') - experiment['xenium_explorer_files']['analysis_zarr_filepath'] = f"{analysis_name}.zarr.zip" + experiment["xenium_explorer_files"]["cells_zarr_filepath"] = f"{cells_name}.zarr.zip" + experiment["xenium_explorer_files"].pop("cell_features_zarr_filepath") + experiment["xenium_explorer_files"]["analysis_zarr_filepath"] = f"{analysis_name}.zarr.zip" - with open(output_path, 'w') as f: + with open(output_path, "w") as f: json.dump(experiment, f, indent=2) diff --git a/tests/test_data.py b/tests/test_data.py index 3f3c5b8..7712067 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -7,46 +7,44 @@ import unittest import pandas as pd + class TestDataUtils(unittest.TestCase): def test_filter_transcripts(self): - data = { - 'qv': [30, 10, 25], - 'feature_name': ['gene1', 'NegControlProbe_gene2', 'gene3'] - } + data = {"qv": [30, 10, 25], "feature_name": ["gene1", "NegControlProbe_gene2", "gene3"]} df = pd.DataFrame(data) filtered_df = filter_transcripts(df, min_qv=20) self.assertEqual(len(filtered_df), 2) - self.assertTrue('gene1' in filtered_df['feature_name'].values) - self.assertTrue('gene3' in filtered_df['feature_name'].values) + self.assertTrue("gene1" in filtered_df["feature_name"].values) + self.assertTrue("gene3" in filtered_df["feature_name"].values) def test_compute_transcript_metrics(self): data = { - 'qv': [40, 40, 25, 25], - 'feature_name': ['gene1', 'gene2', 'gene1', 'gene2'], - 'cell_id': [1, 1, -1, 2], - 'overlaps_nucleus': [1, 0, 0, 1] + "qv": [40, 40, 25, 25], + "feature_name": ["gene1", "gene2", "gene1", "gene2"], + "cell_id": [1, 1, -1, 2], + "overlaps_nucleus": [1, 0, 0, 1], } df = pd.DataFrame(data) metrics = compute_transcript_metrics(df, qv_threshold=30) - self.assertAlmostEqual(metrics['percent_assigned'], 50.0) - self.assertAlmostEqual(metrics['percent_cytoplasmic'], 50.0) - self.assertAlmostEqual(metrics['percent_nucleus'], 50.0) - self.assertAlmostEqual(metrics['percent_non_assigned_cytoplasmic'], 100.0) - self.assertEqual(len(metrics['gene_metrics']), 2) - self.assertTrue('gene1' in 
metrics['gene_metrics']['feature_name'].values) - self.assertTrue('gene2' in metrics['gene_metrics']['feature_name'].values) - + self.assertAlmostEqual(metrics["percent_assigned"], 50.0) + self.assertAlmostEqual(metrics["percent_cytoplasmic"], 50.0) + self.assertAlmostEqual(metrics["percent_nucleus"], 50.0) + self.assertAlmostEqual(metrics["percent_non_assigned_cytoplasmic"], 100.0) + self.assertEqual(len(metrics["gene_metrics"]), 2) + self.assertTrue("gene1" in metrics["gene_metrics"]["feature_name"].values) + self.assertTrue("gene2" in metrics["gene_metrics"]["feature_name"].values) + def setUp(self): data = { - 'x_location': [100, 200, 300], - 'y_location': [100, 200, 300], - 'z_location': [0, 0, 0], - 'qv': [40, 40, 25], - 'feature_name': ['gene1', 'gene2', 'gene3'], - 'transcript_id': [1, 2, 3], - 'overlaps_nucleus': [1, 0, 1], - 'cell_id': [1, -1, 2] + "x_location": [100, 200, 300], + "y_location": [100, 200, 300], + "z_location": [0, 0, 0], + "qv": [40, 40, 25], + "feature_name": ["gene1", "gene2", "gene3"], + "transcript_id": [1, 2, 3], + "overlaps_nucleus": [1, 0, 1], + "cell_id": [1, -1, 2], } self.df = pd.DataFrame(data) self.sample = XeniumSample(self.df) @@ -54,21 +52,18 @@ def setUp(self): def test_crop_transcripts(self): cropped_sample = self.sample.crop_transcripts(50, 50, 200, 200) self.assertEqual(len(cropped_sample.transcripts_df), 1) - self.assertEqual(cropped_sample.transcripts_df.iloc[0]['feature_name'], 'gene1') + self.assertEqual(cropped_sample.transcripts_df.iloc[0]["feature_name"], "gene1") def test_filter_transcripts(self): filtered_df = XeniumSample.filter_transcripts(self.df, min_qv=30) self.assertEqual(len(filtered_df), 2) - self.assertTrue('gene1' in filtered_df['feature_name'].values) - self.assertTrue('gene2' in filtered_df['feature_name'].values) + self.assertTrue("gene1" in filtered_df["feature_name"].values) + self.assertTrue("gene2" in filtered_df["feature_name"].values) def test_unassign_all_except_nucleus(self): unassigned_df = XeniumSample.unassign_all_except_nucleus(self.df) - self.assertEqual(unassigned_df.loc[unassigned_df['overlaps_nucleus'] == 0, 'cell_id'].values[0], 'UNASSIGNED') + self.assertEqual(unassigned_df.loc[unassigned_df["overlaps_nucleus"] == 0, "cell_id"].values[0], "UNASSIGNED") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() - - - diff --git a/tests/test_model.py b/tests/test_model.py index 802c17b..b6dfdf0 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -4,21 +4,18 @@ from torch_geometric.nn import to_hetero from torch_geometric.data import HeteroData + class TestSeggerModel(unittest.TestCase): def setUp(self): - model = Segger( - init_emb=16, hidden_channels=32, out_channels=32, heads=3 - ) - metadata = ( - ["tx", "nc"], [("tx", "belongs", "nc"), ("tx", "neighbors", "tx")] - ) - self.model = to_hetero(model, metadata=metadata, aggr='sum') + model = Segger(init_emb=16, hidden_channels=32, out_channels=32, heads=3) + metadata = (["tx", "nc"], [("tx", "belongs", "nc"), ("tx", "neighbors", "tx")]) + self.model = to_hetero(model, metadata=metadata, aggr="sum") self.data = HeteroData() - self.data['tx'].x = torch.randn(10, 16) - self.data['nc'].x = torch.randn(5, 16) - self.data['tx', 'belongs', 'nc'].edge_index = torch.tensor([[0, 1, 2], [0, 1, 2]], dtype=torch.long) - self.data['tx', 'neighbors', 'tx'].edge_index = torch.tensor([[0, 1], [1, 2]], dtype=torch.long) + self.data["tx"].x = torch.randn(10, 16) + self.data["nc"].x = torch.randn(5, 16) + self.data["tx", "belongs", "nc"].edge_index = 
torch.tensor([[0, 1, 2], [0, 1, 2]], dtype=torch.long) + self.data["tx", "neighbors", "tx"].edge_index = torch.tensor([[0, 1], [1, 2]], dtype=torch.long) def test_forward(self): out = self.model(self.data.x_dict, self.data.edge_index_dict) @@ -26,13 +23,15 @@ def test_forward(self): self.assertTrue("nc" in out) self.assertEqual(out["tx"].shape[1], 32 * 3) self.assertEqual(out["nc"].shape[1], 32 * 3) - ''' + + """ def test_decode(self): z = {'tx': torch.randn(10, 16), 'nc': torch.randn(5, 16)} edge_label_index = torch.tensor([[0, 1, 2], [0, 1, 2]], dtype=torch.long) out = self.model.decode(z, edge_label_index) self.assertEqual(out.shape[0], 3) - ''' + """ + if __name__ == "__main__": unittest.main() diff --git a/tests/test_prediction.py b/tests/test_prediction.py index 9d90316..fd77227 100644 --- a/tests/test_prediction.py +++ b/tests/test_prediction.py @@ -4,21 +4,23 @@ from segger.models.segger_model import Segger from torch_geometric.data import HeteroData + class TestPrediction(unittest.TestCase): def setUp(self): self.model = Segger(init_emb=16, hidden_channels=32, out_channels=32, heads=3) - self.lit_model = load_model("path/to/checkpoint", 16, 32, 32, 3, 'sum') + self.lit_model = load_model("path/to/checkpoint", 16, 32, 32, 3, "sum") self.data = HeteroData() - self.data['tx'].x = torch.randn(10, 16) - self.data['nc'].x = torch.randn(5, 16) - self.data['tx', 'belongs', 'nc'].edge_label_index = torch.tensor([[0, 1, 2], [0, 1, 2]], dtype=torch.long) - self.data['tx', 'neighbors', 'tx'].edge_index = torch.tensor([[0, 1], [1, 2]], dtype=torch.long) + self.data["tx"].x = torch.randn(10, 16) + self.data["nc"].x = torch.randn(5, 16) + self.data["tx", "belongs", "nc"].edge_label_index = torch.tensor([[0, 1, 2], [0, 1, 2]], dtype=torch.long) + self.data["tx", "neighbors", "tx"].edge_index = torch.tensor([[0, 1], [1, 2]], dtype=torch.long) def test_predict(self): output_path = "path/to/output.csv.gz" predict(self.lit_model, "path/to/dataset", output_path, 0.5, 4, 20, 5, 10) self.assertTrue(os.path.exists(output_path)) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/tests/test_training.py b/tests/test_training.py index 5154fef..11615f8 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -4,40 +4,32 @@ from torch_geometric.data import HeteroData import torch + class TestTraining(unittest.TestCase): def setUp(self): # Setup model and data - metadata = ( - ["tx", "nc"], [("tx", "belongs", "nc"), ("tx", "neighbors", "tx")] - ) + metadata = (["tx", "nc"], [("tx", "belongs", "nc"), ("tx", "neighbors", "tx")]) self.lit_segger = LitSegger( init_emb=16, hidden_channels=32, out_channels=32, heads=3, metadata=metadata, - aggr='sum', + aggr="sum", ) self.data = HeteroData() self.data["tx"].x = torch.randn(10, 16) self.data["nc"].x = torch.randn(5, 16) - self.data["tx", "belongs", "nc"].edge_label_index = torch.tensor( - [[0, 1, 2], [0, 1, 2]], dtype=torch.long - ) - self.data["tx", "belongs", "nc"].edge_label = torch.tensor( - [1.0, 0.0, 1.0], dtype=torch.float - ) - self.data["tx", "neighbors", "tx"].edge_index = torch.tensor( - [[0, 1], [1, 2]], dtype=torch.long - ) - + self.data["tx", "belongs", "nc"].edge_label_index = torch.tensor([[0, 1, 2], [0, 1, 2]], dtype=torch.long) + self.data["tx", "belongs", "nc"].edge_label = torch.tensor([1.0, 0.0, 1.0], dtype=torch.float) + self.data["tx", "neighbors", "tx"].edge_index = torch.tensor([[0, 1], [1, 2]], dtype=torch.long) + # Move model and data to GPU self.lit_segger.cuda() self.data.to("cuda") - def 
test_training_step(self):
         optimizer = self.lit_segger.configure_optimizers()
         self.lit_segger.train()
@@ -47,5 +39,6 @@ def test_training_step(self):
         optimizer.step()
         self.assertGreater(loss.item(), 0)

-if __name__ == '__main__':
+
+if __name__ == "__main__":
     unittest.main()

From 9b84911bfddfd4939b6ef2c403c69ca735df15af Mon Sep 17 00:00:00 2001
From: Luca Marconato
Date: Sun, 13 Oct 2024 20:26:13 +0200
Subject: [PATCH 087/156] added example script spatialdata

---
 scripts/spatialdata_example.py | 49 ++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 scripts/spatialdata_example.py

diff --git a/scripts/spatialdata_example.py b/scripts/spatialdata_example.py
new file mode 100644
index 0000000..0e26f7e
--- /dev/null
+++ b/scripts/spatialdata_example.py
@@ -0,0 +1,49 @@
+from segger.data.io import XeniumSample, SpatialDataSample
+from pathlib import Path
+
+# Paths to Xenium sample data and where to store Segger data
+segger_data_dir = Path("data/segger_data/")
+# raw xenium data
+# xenium_data_dir = Path('data/xenium_2.0.0_io/data')
+# xenium_data_dir = Path('data/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny_outs')
+
+# spatialdata zarr data
+xenium_data_dir = Path("data/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny_outs.zarr")
+
+# Setup Xenium sample to create dataset
+# xs = XeniumSample(verbose=False)
+# xs.set_file_paths(
+#     # raw xenium data
+#     transcripts_path=xenium_data_dir / 'transcripts.parquet',
+#     boundaries_path=xenium_data_dir / 'nucleus_boundaries.parquet',
+# )
+xs = SpatialDataSample(verbose=False, feature_name="feature_name")
+xs.set_file_paths(
+    # spatialdata zarr data
+    transcripts_path=xenium_data_dir / "points/transcripts/points.parquet",
+    boundaries_path=xenium_data_dir / "shapes/nucleus_boundaries/shapes.parquet",
+)
+xs.set_metadata()
+
+import shutil
+
+if segger_data_dir.exists() and "segger_data" in str(segger_data_dir):
+    shutil.rmtree(str(segger_data_dir))
+
+try:
+    xs.save_dataset_for_segger(
+        processed_dir=segger_data_dir,
+        r_tx=5,
+        k_tx=15,
+        x_size=120,
+        y_size=120,
+        d_x=100,
+        d_y=100,
+        margin_x=10,
+        margin_y=10,
+        scale_boundaries=1,
+        num_workers=1,  # change to your number of CPUs
+        # val_prob=0.5,
+    )
+except AssertionError as err:
+    print(f"Dataset already exists at {segger_data_dir}")

From b261e6d2cdd532804094f3cba67dd9f841b748c6 Mon Sep 17 00:00:00 2001
From: Unyi
Date: Sun, 13 Oct 2024 22:03:35 +0200
Subject: [PATCH 088/156] CLI for prediction

---
 src/segger/cli/configs/predict/default.yaml | 68 +++++++++++++++++++++
 src/segger/cli/predict.py                   | 46 +++++++-------
 src/segger/prediction/predict.py            | 13 ++--
 3 files changed, 99 insertions(+), 28 deletions(-)
 create mode 100644 src/segger/cli/configs/predict/default.yaml
 mode change 100644 => 100755 src/segger/cli/predict.py

diff --git a/src/segger/cli/configs/predict/default.yaml b/src/segger/cli/configs/predict/default.yaml
new file mode 100644
index 0000000..6021865
--- /dev/null
+++ b/src/segger/cli/configs/predict/default.yaml
@@ -0,0 +1,68 @@
+segger_data_dir:
+  type: Path
+  required: true
+  help: Directory containing the processed Segger dataset.
+models_dir:
+  type: Path
+  required: true
+  help: Directory containing the trained models.
+benchmarks_dir:
+  type: Path
+  required: true
+  help: Directory to save the segmentation results.
+transcripts_file:
+  type: str
+  required: true
+  help: Path to the transcripts file.
+batch_size:
+  type: int
+  default: 1
+  help: Batch size for processing.
+num_workers: + type: int + default: 1 + help: Number of workers for data loading. +model_version: + type: int + default: 0 + help: Model version to load. +save_tag: + type: str + default: 'segger_embedding_1001_0.5' + help: Tag for saving segmentation results. +min_transcripts: + type: int + default: 5 + help: Minimum number of transcripts for segmentation. +cell_id_col: + type: str + default: 'segger_cell_id' + help: Column name for cell IDs. +use_cc: + is_flag: true + default: false + help: Use connected components if specified. +knn_method: + type: str + default: 'cuda' + help: Method for KNN computation. +file_format: + type: str + default: 'anndata' + help: File format for output data. +k_bd: + type: int + default: 4 + help: K value for boundary computation. +dist_bd: + type: int + default: 12 + help: Distance for boundary computation. +k_tx: + type: int + default: 5 + help: K value for transcript computation. +dist_tx: + type: int + default: 5 + help: Distance for transcript computation. diff --git a/src/segger/cli/predict.py b/src/segger/cli/predict.py old mode 100644 new mode 100755 index eca5a4b..e16c1bf --- a/src/segger/cli/predict.py +++ b/src/segger/cli/predict.py @@ -1,13 +1,20 @@ import click from segger.training.segger_data_module import SeggerDataModule from segger.prediction.predict import segment, load_model +from segger.cli.utils import add_options, CustomFormatter from pathlib import Path import logging +from argparse import Namespace import os -os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' +# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' -@click.command(name="run_segmentation", help="Run the Segger segmentation model.") +# Path to default YAML configuration file +predict_yml = Path(__file__).parent / 'configs' / 'predict' / 'default.yaml' + +help_msg = "Run the Segger segmentation model." 
+@click.command(name="run_segmentation", help=help_msg) +@add_options(config_path=predict_yml) @click.option('--segger_data_dir', type=Path, required=True, help='Directory containing the processed Segger dataset.') @click.option('--models_dir', type=Path, required=True, help='Directory containing the trained models.') @click.option('--benchmarks_dir', type=Path, required=True, help='Directory to save the segmentation results.') @@ -25,13 +32,7 @@ @click.option('--dist_bd', type=int, default=12, help='Distance for boundary computation.') @click.option('--k_tx', type=int, default=5, help='K value for transcript computation.') @click.option('--dist_tx', type=int, default=5, help='Distance for transcript computation.') -def run_segmentation(segger_data_dir: Path, models_dir: Path, benchmarks_dir: Path, - transcripts_file: str, batch_size: int = 1, num_workers: int = 1, - model_version: int = 0, save_tag: str = 'segger_embedding_1001_0.5', - min_transcripts: int = 5, cell_id_col: str = 'segger_cell_id', - use_cc: bool = False, knn_method: str = 'cuda', - file_format: str = 'anndata', k_bd: int = 4, dist_bd: int = 12, - k_tx: int = 5, dist_tx: int = 5): +def run_segmentation(args: Namespace): # Setup logging logging.basicConfig(level=logging.INFO) @@ -40,31 +41,32 @@ def run_segmentation(segger_data_dir: Path, models_dir: Path, benchmarks_dir: Pa logger.info("Initializing Segger data module...") # Initialize the Lightning data module dm = SeggerDataModule( - data_dir=segger_data_dir, - batch_size=batch_size, - num_workers=num_workers, + data_dir=args.segger_data_dir, + batch_size=args.batch_size, + num_workers=args.num_workers, ) dm.setup() logger.info("Loading the model...") # Load in the latest checkpoint - model_path = models_dir / 'lightning_logs' / f'version_{model_version}' + model_path = Path(args.models_dir) / 'lightning_logs' / f'version_{args.model_version}' model = load_model(model_path / 'checkpoints') logger.info("Running segmentation...") segment( model, dm, - save_dir=benchmarks_dir, - seg_tag=save_tag, - transcript_file=transcripts_file, - file_format=file_format, - receptive_field={'k_bd': k_bd, 'dist_bd': dist_bd, 'k_tx': k_tx, 'dist_tx': dist_tx}, - min_transcripts=min_transcripts, - cell_id_col=cell_id_col, - use_cc=use_cc, - knn_method=knn_method, + save_dir=args.benchmarks_dir, + seg_tag=args.save_tag, + transcript_file=args.transcripts_file, + file_format=args.file_format, + receptive_field={'k_bd': args.k_bd, 'dist_bd': args.dist_bd, 'k_tx': args.k_tx, 'dist_tx': args.dist_tx}, + min_transcripts=args.min_transcripts, + cell_id_col=args.cell_id_col, + use_cc=args.use_cc, + knn_method=args.knn_method, + verbose=True, ) logger.info("Segmentation completed.") diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index cf73116..e046ad8 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -100,7 +100,8 @@ def get_similarity_scores( batch: Batch, from_type: str, to_type: str, - receptive_field: dict + receptive_field: dict, + knn_method: str, ) -> coo_matrix: """ Compute similarity scores between embeddings for 'from_type' and 'to_type' nodes @@ -124,7 +125,7 @@ def get_similarity_scores( batch[from_type].pos[:, :2], # 'bd' positions k=receptive_field[f'k_{to_type}'], dist=receptive_field[f'dist_{to_type}'], - method='cuda' + method=knn_method, ) edge_index = coo_to_dense_adj( edge_index.T, @@ -219,7 +220,7 @@ def _get_id(): if len(batch['bd'].pos) >= 10: # Compute similarity scores between 'tx' and 'bd' - scores = 
get_similarity_scores(lit_segger.model, batch, "tx", "bd", receptive_field) + scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd", receptive_field, knn_method) torch.cuda.empty_cache() # Convert sparse matrix to dense format dense_scores = scores.toarray() # Convert to dense NumPy array @@ -233,14 +234,14 @@ def _get_id(): all_ids = np.concatenate(batch['bd'].id) # Keep IDs as NumPy array assignments['segger_cell_id'] = None # Initialize as None max_indices = cp.argmax(dense_scores, axis=1).get() - assignments['segger_cell_id'][mask] = all_ids[max_indices[mask]] # Assign IDs + assignments.loc[mask, 'segger_cell_id'] = all_ids[max_indices[mask]] # Assign IDs del dense_scores # Remove from memory cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory torch.cuda.empty_cache() -# Move back to CPU + # Move back to CPU assignments['bound'] = 0 - assignments['bound'][mask] = 1 + assignments.loc[mask, 'bound'] = 1 if use_cc: From 57ff20dd321e438fe7c472290006d305d142f140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Sun, 13 Oct 2024 22:13:33 +0200 Subject: [PATCH 089/156] Forward pass to initialize the model --- src/segger/cli/train_model.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/segger/cli/train_model.py b/src/segger/cli/train_model.py index 78bd5d7..3a90a53 100644 --- a/src/segger/cli/train_model.py +++ b/src/segger/cli/train_model.py @@ -70,6 +70,11 @@ def train_model(args: Namespace): metadata=metadata, ) + # Forward pass to initialize the model + if args.devices > 1: + batch = dm.train[0] + ls.forward(batch) + # Initialize the Lightning trainer trainer = Trainer( accelerator=args.accelerator, # Directly use the specified accelerator From e50bc30a3450eef4aa59c6d7677d8baf1da79ec6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 13 Oct 2024 20:23:45 +0000 Subject: [PATCH 090/156] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/segger/cli/configs/predict/default.yaml | 8 ++-- src/segger/cli/predict.py | 52 +++++++++++---------- src/segger/prediction/predict.py | 17 ++++--- 3 files changed, 39 insertions(+), 38 deletions(-) diff --git a/src/segger/cli/configs/predict/default.yaml b/src/segger/cli/configs/predict/default.yaml index 6021865..4ff49d0 100644 --- a/src/segger/cli/configs/predict/default.yaml +++ b/src/segger/cli/configs/predict/default.yaml @@ -28,7 +28,7 @@ model_version: help: Model version to load. save_tag: type: str - default: 'segger_embedding_1001_0.5' + default: "segger_embedding_1001_0.5" help: Tag for saving segmentation results. min_transcripts: type: int @@ -36,7 +36,7 @@ min_transcripts: help: Minimum number of transcripts for segmentation. cell_id_col: type: str - default: 'segger_cell_id' + default: "segger_cell_id" help: Column name for cell IDs. use_cc: is_flag: true @@ -44,11 +44,11 @@ use_cc: help: Use connected components if specified. knn_method: type: str - default: 'cuda' + default: "cuda" help: Method for KNN computation. file_format: type: str - default: 'anndata' + default: "anndata" help: File format for output data. 
k_bd: type: int diff --git a/src/segger/cli/predict.py b/src/segger/cli/predict.py index 5130feb..6d16bd1 100755 --- a/src/segger/cli/predict.py +++ b/src/segger/cli/predict.py @@ -10,30 +10,32 @@ # os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' # Path to default YAML configuration file -predict_yml = Path(__file__).parent / 'configs' / 'predict' / 'default.yaml' +predict_yml = Path(__file__).parent / "configs" / "predict" / "default.yaml" help_msg = "Run the Segger segmentation model." + + @click.command(name="run_segmentation", help=help_msg) @add_options(config_path=predict_yml) -@click.option('--segger_data_dir', type=Path, required=True, help='Directory containing the processed Segger dataset.') -@click.option('--models_dir', type=Path, required=True, help='Directory containing the trained models.') -@click.option('--benchmarks_dir', type=Path, required=True, help='Directory to save the segmentation results.') -@click.option('--transcripts_file', type=str, required=True, help='Path to the transcripts file.') -@click.option('--batch_size', type=int, default=1, help='Batch size for processing.') -@click.option('--num_workers', type=int, default=1, help='Number of workers for data loading.') -@click.option('--model_version', type=int, default=0, help='Model version to load.') -@click.option('--save_tag', type=str, default='segger_embedding_1001_0.5', help='Tag for saving segmentation results.') -@click.option('--min_transcripts', type=int, default=5, help='Minimum number of transcripts for segmentation.') -@click.option('--cell_id_col', type=str, default='segger_cell_id', help='Column name for cell IDs.') -@click.option('--use_cc', is_flag=True, default=False, help='Use connected components if specified.') -@click.option('--knn_method', type=str, default='cuda', help='Method for KNN computation.') -@click.option('--file_format', type=str, default='anndata', help='File format for output data.') -@click.option('--k_bd', type=int, default=4, help='K value for boundary computation.') -@click.option('--dist_bd', type=int, default=12, help='Distance for boundary computation.') -@click.option('--k_tx', type=int, default=5, help='K value for transcript computation.') -@click.option('--dist_tx', type=int, default=5, help='Distance for transcript computation.') +@click.option("--segger_data_dir", type=Path, required=True, help="Directory containing the processed Segger dataset.") +@click.option("--models_dir", type=Path, required=True, help="Directory containing the trained models.") +@click.option("--benchmarks_dir", type=Path, required=True, help="Directory to save the segmentation results.") +@click.option("--transcripts_file", type=str, required=True, help="Path to the transcripts file.") +@click.option("--batch_size", type=int, default=1, help="Batch size for processing.") +@click.option("--num_workers", type=int, default=1, help="Number of workers for data loading.") +@click.option("--model_version", type=int, default=0, help="Model version to load.") +@click.option("--save_tag", type=str, default="segger_embedding_1001_0.5", help="Tag for saving segmentation results.") +@click.option("--min_transcripts", type=int, default=5, help="Minimum number of transcripts for segmentation.") +@click.option("--cell_id_col", type=str, default="segger_cell_id", help="Column name for cell IDs.") +@click.option("--use_cc", is_flag=True, default=False, help="Use connected components if specified.") +@click.option("--knn_method", type=str, default="cuda", help="Method for KNN 
computation.") +@click.option("--file_format", type=str, default="anndata", help="File format for output data.") +@click.option("--k_bd", type=int, default=4, help="K value for boundary computation.") +@click.option("--dist_bd", type=int, default=12, help="Distance for boundary computation.") +@click.option("--k_tx", type=int, default=5, help="K value for transcript computation.") +@click.option("--dist_tx", type=int, default=5, help="Distance for transcript computation.") def run_segmentation(args: Namespace): - + # Setup logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -42,16 +44,16 @@ def run_segmentation(args: Namespace): # Initialize the Lightning data module dm = SeggerDataModule( data_dir=args.segger_data_dir, - batch_size=args.batch_size, - num_workers=args.num_workers, + batch_size=args.batch_size, + num_workers=args.num_workers, ) dm.setup() logger.info("Loading the model...") # Load in the latest checkpoint - model_path = Path(args.models_dir) / 'lightning_logs' / f'version_{args.model_version}' - model = load_model(model_path / 'checkpoints') + model_path = Path(args.models_dir) / "lightning_logs" / f"version_{args.model_version}" + model = load_model(model_path / "checkpoints") logger.info("Running segmentation...") segment( @@ -60,8 +62,8 @@ def run_segmentation(args: Namespace): save_dir=args.benchmarks_dir, seg_tag=args.save_tag, transcript_file=args.transcripts_file, - file_format=args.file_format, - receptive_field={'k_bd': args.k_bd, 'dist_bd': args.dist_bd, 'k_tx': args.k_tx, 'dist_tx': args.dist_tx}, + file_format=args.file_format, + receptive_field={"k_bd": args.k_bd, "dist_bd": args.dist_bd, "k_tx": args.k_tx, "dist_tx": args.dist_tx}, min_transcripts=args.min_transcripts, cell_id_col=args.cell_id_col, use_cc=args.use_cc, diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index ac81daa..0f44b46 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -97,7 +97,7 @@ def sort_order(c): def get_similarity_scores( - model: torch.nn.Module, + model: torch.nn.Module, batch: Batch, from_type: str, to_type: str, @@ -124,8 +124,8 @@ def get_similarity_scores( edge_index = get_edge_index( batch[to_type].pos[:, :2], # 'tx' positions batch[from_type].pos[:, :2], # 'bd' positions - k=receptive_field[f'k_{to_type}'], - dist=receptive_field[f'dist_{to_type}'], + k=receptive_field[f"k_{to_type}"], + dist=receptive_field[f"dist_{to_type}"], method=knn_method, ) edge_index = coo_to_dense_adj( @@ -233,16 +233,15 @@ def _get_id(): all_ids = np.concatenate(batch["bd"].id) # Keep IDs as NumPy array assignments["segger_cell_id"] = None # Initialize as None max_indices = cp.argmax(dense_scores, axis=1).get() - assignments.loc[mask, 'segger_cell_id'] = all_ids[max_indices[mask]] # Assign IDs - + assignments.loc[mask, "segger_cell_id"] = all_ids[max_indices[mask]] # Assign IDs + del dense_scores # Remove from memory cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory torch.cuda.empty_cache() # Move back to CPU - assignments['bound'] = 0 - assignments.loc[mask, 'bound'] = 1 - - + assignments["bound"] = 0 + assignments.loc[mask, "bound"] = 1 + if use_cc: # Compute similarity scores between 'tx' and 'tx' scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx", receptive_field) From d20c87f7f90f323f812b516674a0de40051599c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Sun, 13 Oct 2024 23:28:10 +0200 Subject: [PATCH 091/156] Script for submitting a 
job to cluster
---
 submit_job.sh | 101 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 submit_job.sh

diff --git a/submit_job.sh b/submit_job.sh
new file mode 100644
index 0000000..d2f0180
--- /dev/null
+++ b/submit_job.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+# To acquire the Singularity image, run:
+# singularity pull docker://danielunyi42/segger_dev
+
+# Pipeline 1: Data Preprocessing Parameters
+OUTPUT_LOG_PREPROCESS="preprocess_output.log"  # Path to the output log file for data preprocessing
+BASE_DIR="data_xenium"  # Base directory for input data
+DATA_DIR="data_segger"  # Directory for output data
+SAMPLE_TYPE="xenium"  # Type of sample being processed
+TILE_WIDTH=120  # Width of each data tile
+TILE_HEIGHT=120  # Height of each data tile
+N_WORKERS_PREPROCESS=16  # Number of workers for parallel processing
+RAM_PREPROCESS="16G"  # Total memory requested for the job
+
+# Pipeline 2: Training Parameters
+OUTPUT_LOG_TRAIN="train_output.log"  # Path to the output log file for training
+DATASET_DIR="data_segger"  # Directory for dataset
+MODELS_DIR="model_dir"  # Directory to save models
+SAMPLE_TAG="first_training"  # Tag for the training sample
+N_WORKERS_TRAIN=16  # Number of CPUs to request
+RAM_TRAIN="16G"  # Amount of memory to request
+GPUS=8  # Number of GPUs to request
+GPU_MEM_TRAIN="8G"  # Amount of memory per GPU
+
+# Pipeline 3: Prediction Parameters
+OUTPUT_LOG_PREDICT="predict_output.log"  # Path to the output log file for prediction
+SEGGER_DATA_DIR="data_segger"  # Directory containing the segger data
+MODELS_DIR="model_dir"  # Directory containing the trained models
+BENCHMARKS_DIR="benchmark_dir"  # Directory for saving the benchmark results
+TRANSCRIPTS_FILE="data_xenium"  # Path to the transcripts file
+KNN_METHOD="cuda"  # Method for KNN search
+N_WORKERS_PREDICT=16  # Number of CPUs to request
+RAM_PREDICT="16G"  # Amount of memory to request
+GPU_MEM_PREDICT="8G"  # Amount of memory for GPU
+
+# Paths and common variables
+LOCAL_REPO_DIR="/omics/groups/OE0540/internal_temp/users/danielu/segger_dev"  # Where the segger_dev repository is located on the local machine
+CONTAINER_DIR="/workspace/segger_dev"  # Where the segger_dev repository is located in the container
+SINGULARITY_IMAGE="segger_dev_latest.sif"  # Path to the Singularity image
+
+# Functions to run different pipelines
+run_data_processing() {
+    bsub -o "$OUTPUT_LOG_PREPROCESS" -n "$N_WORKERS_PREPROCESS" -R "rusage[mem=$RAM_PREPROCESS]" -q long \
+        "singularity exec --bind $LOCAL_REPO_DIR:$CONTAINER_DIR \
+        $SINGULARITY_IMAGE python3 src/segger/cli/create_dataset_fast.py \
+        --base_dir '$BASE_DIR' \
+        --data_dir '$DATA_DIR' \
+        --sample_type '$SAMPLE_TYPE' \
+        --tile_width $TILE_WIDTH \
+        --tile_height $TILE_HEIGHT \
+        --n_workers $N_WORKERS_PREPROCESS"
+}
+
+run_training() {
+    bsub -o "$OUTPUT_LOG_TRAIN" -n "$N_WORKERS_TRAIN" -R "rusage[mem=$RAM_TRAIN]" -R "tensorcore" -gpu "num=$GPUS:j_exclusive=no:gmem=$GPU_MEM_TRAIN" -q gpu \
+        "singularity exec --nv --bind $LOCAL_REPO_DIR:$CONTAINER_DIR \
+        $SINGULARITY_IMAGE python3 src/segger/cli/train_model.py \
+        --dataset_dir '$DATASET_DIR' \
+        --models_dir '$MODELS_DIR' \
+        --sample_tag '$SAMPLE_TAG' \
+        --num_workers $N_WORKERS_TRAIN \
+        --devices $GPUS"
+}
+
+run_prediction() {
+    bsub -o "$OUTPUT_LOG_PREDICT" -n "$N_WORKERS_PREDICT" -R "rusage[mem=$RAM_PREDICT]" -R "tensorcore" -gpu "num=1:j_exclusive=no:gmem=$GPU_MEM_PREDICT" -q gpu \
+        "singularity exec --nv --bind $LOCAL_REPO_DIR:$CONTAINER_DIR \
+        $SINGULARITY_IMAGE python3
src/segger/cli/predict.py \ + --segger_data_dir '$SEGGER_DATA_DIR' \ + --models_dir '$MODELS_DIR' \ + --benchmarks_dir '$BENCHMARKS_DIR' \ + --transcripts_file '$TRANSCRIPTS_FILE' \ + --knn_method '$KNN_METHOD' \ + --num_workers $N_WORKERS_PREDICT" +} + +# Main script logic +echo "Which pipelines would you like to run? (1: Data Processing, 2: Training, 3: Prediction)" +echo "Enter the pipeline numbers you want to run (e.g., '1 2 3' for all, or '1' for only data processing):" +read -r pipelines + +for pipeline in $pipelines; do + case $pipeline in + 1) + echo "Running Data Processing..." + run_data_processing + ;; + 2) + echo "Running Training..." + run_training + ;; + 3) + echo "Running Prediction..." + run_prediction + ;; + *) + echo "Invalid choice: $pipeline" + ;; + esac +done From f07b33e15ceafc73816e61aa1d0831690686b988 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Sun, 13 Oct 2024 23:33:25 +0200 Subject: [PATCH 092/156] Merge branch 'main' of https://github.com/daniel-unyi-42/segger_dev into submit_script --- src/segger/cli/configs/predict/default.yaml | 8 ++-- src/segger/cli/predict.py | 52 +++++++++++---------- src/segger/prediction/predict.py | 17 ++++--- 3 files changed, 39 insertions(+), 38 deletions(-) diff --git a/src/segger/cli/configs/predict/default.yaml b/src/segger/cli/configs/predict/default.yaml index 6021865..4ff49d0 100644 --- a/src/segger/cli/configs/predict/default.yaml +++ b/src/segger/cli/configs/predict/default.yaml @@ -28,7 +28,7 @@ model_version: help: Model version to load. save_tag: type: str - default: 'segger_embedding_1001_0.5' + default: "segger_embedding_1001_0.5" help: Tag for saving segmentation results. min_transcripts: type: int @@ -36,7 +36,7 @@ min_transcripts: help: Minimum number of transcripts for segmentation. cell_id_col: type: str - default: 'segger_cell_id' + default: "segger_cell_id" help: Column name for cell IDs. use_cc: is_flag: true @@ -44,11 +44,11 @@ use_cc: help: Use connected components if specified. knn_method: type: str - default: 'cuda' + default: "cuda" help: Method for KNN computation. file_format: type: str - default: 'anndata' + default: "anndata" help: File format for output data. k_bd: type: int diff --git a/src/segger/cli/predict.py b/src/segger/cli/predict.py index 5130feb..6d16bd1 100755 --- a/src/segger/cli/predict.py +++ b/src/segger/cli/predict.py @@ -10,30 +10,32 @@ # os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' # Path to default YAML configuration file -predict_yml = Path(__file__).parent / 'configs' / 'predict' / 'default.yaml' +predict_yml = Path(__file__).parent / "configs" / "predict" / "default.yaml" help_msg = "Run the Segger segmentation model." 
+ + @click.command(name="run_segmentation", help=help_msg) @add_options(config_path=predict_yml) -@click.option('--segger_data_dir', type=Path, required=True, help='Directory containing the processed Segger dataset.') -@click.option('--models_dir', type=Path, required=True, help='Directory containing the trained models.') -@click.option('--benchmarks_dir', type=Path, required=True, help='Directory to save the segmentation results.') -@click.option('--transcripts_file', type=str, required=True, help='Path to the transcripts file.') -@click.option('--batch_size', type=int, default=1, help='Batch size for processing.') -@click.option('--num_workers', type=int, default=1, help='Number of workers for data loading.') -@click.option('--model_version', type=int, default=0, help='Model version to load.') -@click.option('--save_tag', type=str, default='segger_embedding_1001_0.5', help='Tag for saving segmentation results.') -@click.option('--min_transcripts', type=int, default=5, help='Minimum number of transcripts for segmentation.') -@click.option('--cell_id_col', type=str, default='segger_cell_id', help='Column name for cell IDs.') -@click.option('--use_cc', is_flag=True, default=False, help='Use connected components if specified.') -@click.option('--knn_method', type=str, default='cuda', help='Method for KNN computation.') -@click.option('--file_format', type=str, default='anndata', help='File format for output data.') -@click.option('--k_bd', type=int, default=4, help='K value for boundary computation.') -@click.option('--dist_bd', type=int, default=12, help='Distance for boundary computation.') -@click.option('--k_tx', type=int, default=5, help='K value for transcript computation.') -@click.option('--dist_tx', type=int, default=5, help='Distance for transcript computation.') +@click.option("--segger_data_dir", type=Path, required=True, help="Directory containing the processed Segger dataset.") +@click.option("--models_dir", type=Path, required=True, help="Directory containing the trained models.") +@click.option("--benchmarks_dir", type=Path, required=True, help="Directory to save the segmentation results.") +@click.option("--transcripts_file", type=str, required=True, help="Path to the transcripts file.") +@click.option("--batch_size", type=int, default=1, help="Batch size for processing.") +@click.option("--num_workers", type=int, default=1, help="Number of workers for data loading.") +@click.option("--model_version", type=int, default=0, help="Model version to load.") +@click.option("--save_tag", type=str, default="segger_embedding_1001_0.5", help="Tag for saving segmentation results.") +@click.option("--min_transcripts", type=int, default=5, help="Minimum number of transcripts for segmentation.") +@click.option("--cell_id_col", type=str, default="segger_cell_id", help="Column name for cell IDs.") +@click.option("--use_cc", is_flag=True, default=False, help="Use connected components if specified.") +@click.option("--knn_method", type=str, default="cuda", help="Method for KNN computation.") +@click.option("--file_format", type=str, default="anndata", help="File format for output data.") +@click.option("--k_bd", type=int, default=4, help="K value for boundary computation.") +@click.option("--dist_bd", type=int, default=12, help="Distance for boundary computation.") +@click.option("--k_tx", type=int, default=5, help="K value for transcript computation.") +@click.option("--dist_tx", type=int, default=5, help="Distance for transcript computation.") def run_segmentation(args: Namespace): - + # 
Setup logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -42,16 +44,16 @@ def run_segmentation(args: Namespace): # Initialize the Lightning data module dm = SeggerDataModule( data_dir=args.segger_data_dir, - batch_size=args.batch_size, - num_workers=args.num_workers, + batch_size=args.batch_size, + num_workers=args.num_workers, ) dm.setup() logger.info("Loading the model...") # Load in the latest checkpoint - model_path = Path(args.models_dir) / 'lightning_logs' / f'version_{args.model_version}' - model = load_model(model_path / 'checkpoints') + model_path = Path(args.models_dir) / "lightning_logs" / f"version_{args.model_version}" + model = load_model(model_path / "checkpoints") logger.info("Running segmentation...") segment( @@ -60,8 +62,8 @@ def run_segmentation(args: Namespace): save_dir=args.benchmarks_dir, seg_tag=args.save_tag, transcript_file=args.transcripts_file, - file_format=args.file_format, - receptive_field={'k_bd': args.k_bd, 'dist_bd': args.dist_bd, 'k_tx': args.k_tx, 'dist_tx': args.dist_tx}, + file_format=args.file_format, + receptive_field={"k_bd": args.k_bd, "dist_bd": args.dist_bd, "k_tx": args.k_tx, "dist_tx": args.dist_tx}, min_transcripts=args.min_transcripts, cell_id_col=args.cell_id_col, use_cc=args.use_cc, diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index ac81daa..0f44b46 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -97,7 +97,7 @@ def sort_order(c): def get_similarity_scores( - model: torch.nn.Module, + model: torch.nn.Module, batch: Batch, from_type: str, to_type: str, @@ -124,8 +124,8 @@ def get_similarity_scores( edge_index = get_edge_index( batch[to_type].pos[:, :2], # 'tx' positions batch[from_type].pos[:, :2], # 'bd' positions - k=receptive_field[f'k_{to_type}'], - dist=receptive_field[f'dist_{to_type}'], + k=receptive_field[f"k_{to_type}"], + dist=receptive_field[f"dist_{to_type}"], method=knn_method, ) edge_index = coo_to_dense_adj( @@ -233,16 +233,15 @@ def _get_id(): all_ids = np.concatenate(batch["bd"].id) # Keep IDs as NumPy array assignments["segger_cell_id"] = None # Initialize as None max_indices = cp.argmax(dense_scores, axis=1).get() - assignments.loc[mask, 'segger_cell_id'] = all_ids[max_indices[mask]] # Assign IDs - + assignments.loc[mask, "segger_cell_id"] = all_ids[max_indices[mask]] # Assign IDs + del dense_scores # Remove from memory cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory torch.cuda.empty_cache() # Move back to CPU - assignments['bound'] = 0 - assignments.loc[mask, 'bound'] = 1 - - + assignments["bound"] = 0 + assignments.loc[mask, "bound"] = 1 + if use_cc: # Compute similarity scores between 'tx' and 'tx' scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx", receptive_field) From 61d5f75e7843ad6f46e82b0d253e2b48c9efb43f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Sun, 13 Oct 2024 23:51:02 +0200 Subject: [PATCH 093/156] Add Dockerfile --- Dockerfile | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..9fe2617 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,27 @@ +FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 + +RUN apt-get update -y && apt-get install -y --no-install-recommends \ + git \ + wget \ + tmux \ + vim \ + htop \ + zip \ + unzip \ + build-essential \ + python3 \ + python3-pip && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + 
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir debugpy
+
+WORKDIR /workspace
+
+RUN git clone https://github.com/EliHei2/segger_dev.git /workspace/segger_dev && \
+    pip install -e "/workspace/segger_dev[cuda12,rapids12,cupy12,faiss]"
+
+EXPOSE 5678
+
+ENV PYTHONPATH=/home/developer/segger_dev/src:$PYTHONPATH

From dd5e3d5a284ce88f00fb6f3877ee378936247a45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?D=C3=81NIEL=20UNYI?=
Date: Mon, 14 Oct 2024 12:48:25 +0200
Subject: [PATCH 094/156] Small fix in Dockerfile

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 9fe2617..def71aa 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,4 +24,4 @@ RUN git clone https://github.com/EliHei2/segger_dev.git /workspace/segger_dev &&

 EXPOSE 5678

-ENV PYTHONPATH=/home/developer/segger_dev/src:$PYTHONPATH
+ENV PYTHONPATH=/workspace/segger_dev/src:$PYTHONPATH

From 589fafe046e554480e4f3d1c7b09f19fffb1b511 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?D=C3=81NIEL=20UNYI?=
Date: Mon, 14 Oct 2024 13:58:02 +0200
Subject: [PATCH 095/156] Fix in PyG dataset

---
 src/segger/data/parquet/pyg_dataset.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/segger/data/parquet/pyg_dataset.py b/src/segger/data/parquet/pyg_dataset.py
index d64b9e6..4626ac1 100644
--- a/src/segger/data/parquet/pyg_dataset.py
+++ b/src/segger/data/parquet/pyg_dataset.py
@@ -70,8 +70,9 @@ def get(self, idx: int) -> Data:
             data["tx"].x = data["tx"].x.unsqueeze(1)
         assert data["tx"].x.dim() == 2
         # this is an issue in PyG's RandomLinkSplit, dimensions are not consistent if there is only one edge in the graph
-        if data["tx", "belongs", "bd"].edge_label_index.dim() == 1:
-            data["tx", "belongs", "bd"].edge_label_index = data["tx", "belongs", "bd"].edge_label_index.unsqueeze(1)
-            data["tx", "belongs", "bd"].edge_label = data["tx", "belongs", "bd"].edge_label.unsqueeze(0)
-        assert data["tx", "belongs", "bd"].edge_label_index.dim() == 2
+        if hasattr(data, "edge_label_index"):
+            if data["tx", "belongs", "bd"].edge_label_index.dim() == 1:
+                data["tx", "belongs", "bd"].edge_label_index = data["tx", "belongs", "bd"].edge_label_index.unsqueeze(1)
+                data["tx", "belongs", "bd"].edge_label = data["tx", "belongs", "bd"].edge_label.unsqueeze(0)
+            assert data["tx", "belongs", "bd"].edge_label_index.dim() == 2
         return data

From 4a9a41766b362159a299e860dba2e17335bda888 Mon Sep 17 00:00:00 2001
From: Elihei2
Date: Mon, 14 Oct 2024 19:43:38 +0200
Subject: [PATCH 096/156] added predict.parquet for way faster prediction including cc

---
 docs/notebooks/benchmark_bc.py           |  52 +-
 scripts/predict_model_sample.py          |  53 +-
 scripts/sandbox.py                       | 240 ++++++++
 src/segger/prediction/predict.py         | 555 ++++++------------
 src/segger/prediction/predict_parquet.py | 680 +++++++++++++++++++++++
 src/segger/validation/utils.py           |   9 +-
 6 files changed, 1182 insertions(+), 407 deletions(-)
 create mode 100644 scripts/sandbox.py
 create mode 100644 src/segger/prediction/predict_parquet.py

diff --git a/docs/notebooks/benchmark_bc.py b/docs/notebooks/benchmark_bc.py
index 31ac0bd..614083d 100644
--- a/docs/notebooks/benchmark_bc.py
+++ b/docs/notebooks/benchmark_bc.py
@@ -18,10 +18,13 @@
     'segger': '#D55E00',
     'segger_n0': '#E69F00',
     'segger_n1': '#F0E442',
-    'Baysor': '#0072B2',
-    '10X': '#009E73',
+    'segger_embedding': '#C72228',
+    'Baysor': '#000075',
+    'Baysor_n0': '#0F4A9C',
+    'Baysor_n1': '#0072B2',
+    '10X': '#8B008B',
     '10X-nucleus': '#CC79A7',
-    'BIDCell': '#8B008B'
+    # 'BIDCell':
'#009E73' } # Define colors for cell types @@ -37,34 +40,47 @@ 'Plasmablasts': '#000075' } + # Define segmentation file paths segmentation_paths = { - 'segger': benchmarks_path / 'adata_segger.h5ad', + # 'segger': benchmarks_path / 'Xenium_FFPE_Human_Breast_Cancer_Rep1_v9_segger.h5ad', + 'segger': 'data_tidy/Xenium_FFPE_Human_Breast_Cancer_Rep1_v9_segger.h5ad', + # 'segger_embedding': benchmarks_path / 'segger_embedding_1001_0.5_cc_segmentation.h5ad', 'Baysor': benchmarks_path / 'adata_baysor.h5ad', '10X': benchmarks_path / 'adata_10X.h5ad', '10X-nucleus': benchmarks_path / 'adata_10X_nuc.h5ad', - 'BIDCell': benchmarks_path / 'adata_BIDCell.h5ad' + # 'BIDCell': benchmarks_path / 'adata_BIDCell.h5ad' } # Load the segmentations and the scRNAseq data segmentations_dict = load_segmentations(segmentation_paths) segmentations_dict = {k: segmentations_dict[k] for k in method_colors.keys() if k in segmentations_dict} scRNAseq_adata = sc.read(benchmarks_path / 'scRNAseq.h5ad') +segmentations_dict['segger'] = segmentations_dict['segger'][segmentations_dict['segger'].obs.cell_area < 150] +# scRNAseq_adata = set(scRNAseq_adata.var_names).intersect([set(segmentations_dict[seg].var_names) for seg in segmentations_dict.keys()]) # Generate general statistics plots -plot_general_statistics_plots(segmentations_dict, figures_path, method_colors) +# plot_general_statistics_plots(segmentations_dict, figures_path, method_colors) + + +plot_cell_counts(segmentations_dict, figures_path, palette=method_colors) +plot_cell_area(segmentations_dict, figures_path, palette=method_colors) # Find markers for scRNAseq data markers = find_markers(scRNAseq_adata, cell_type_column='celltype_major', pos_percentile=30, neg_percentile=5) # Annotate spatial segmentations with scRNAseq reference data -for method in segmentation_paths.keys(): - # segmentations_dict[method] = annotate_query_with_reference( - # reference_adata=scRNAseq_adata, - # query_adata=segmentations_dict[method], - # transfer_column='celltype_major' - # ) - segmentations_dict[method].write(segmentation_paths[method]) +# for method in segmentation_paths.keys(): +# segmentations_dict[method] = annotate_query_with_reference( +# reference_adata=scRNAseq_adata, +# query_adata=segmentations_dict[method], +# transfer_column='celltype_major' +# ) +# segmentations_dict[method].write(segmentation_paths[method]) + +sc._settings.ScanpyConfig.figdir = figures_path +segmentations_dict['segger_embedding'].obsm['spatial'] = segmentations_dict['segger_embedding'].obs[['cell_centroid_x', 'cell_centroid_y']].values +sc.pl.spatial(segmentations_dict['segger_embedding'], spot_size=10, save= 'embedding.pdf', color='celltype_major', palette=major_colors) # Find mutually exclusive genes based on scRNAseq data exclusive_gene_pairs = find_mutually_exclusive_genes( @@ -88,10 +104,10 @@ adata=segmentations_dict[method], gene_pairs=exclusive_gene_pairs ) - quantized_mecr_counts[method] = compute_quantized_mecr_counts( - adata=segmentations_dict[method], - gene_pairs=exclusive_gene_pairs - ) + # quantized_mecr_counts[method] = compute_quantized_mecr_counts( + # adata=segmentations_dict[method], + # gene_pairs=exclusive_gene_pairs + # ) # Plot MECR results plot_mecr_results(mecr_results, output_path=figures_path, palette=method_colors) @@ -166,7 +182,7 @@ plot_entropy_boxplots(entropy_boxplot_data, figures_path, palette=method_colors) # Find markers for sensitivity calculation -purified_markers = find_markers(scRNAseq_adata, 'celltype_major', pos_percentile=20, percentage=75) 
+purified_markers = find_markers(scRNAseq_adata, 'celltype_major', pos_percentile=30, percentage=70) # Calculate sensitivity for each segmentation method sensitivity_results_per_method = {} diff --git a/scripts/predict_model_sample.py b/scripts/predict_model_sample.py index 11c5e89..1896020 100644 --- a/scripts/predict_model_sample.py +++ b/scripts/predict_model_sample.py @@ -1,5 +1,5 @@ from segger.training.segger_data_module import SeggerDataModule -from segger.prediction.predict import segment, get_similarity_scores, load_model, predict_batch, predict +from segger.prediction.predict_parquet import segment, load_model from pathlib import Path from matplotlib import pyplot as plt import seaborn as sns @@ -14,6 +14,7 @@ from dask_cuda import LocalCUDACluster import dask.dataframe as dd + segger_data_dir = Path('./data_tidy/pyg_datasets/bc_embedding_1001') models_dir = Path('./models/bc_embedding_1001_small') benchmarks_dir = Path('/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc') @@ -22,7 +23,7 @@ dm = SeggerDataModule( data_dir=segger_data_dir, batch_size=1, - num_workers=1, + num_workers=0, ) dm.setup() @@ -34,19 +35,57 @@ model_path = models_dir / 'lightning_logs' / f'version_{model_version}' model = load_model(model_path / 'checkpoints') -receptive_field = {'k_bd': 4, 'dist_bd': 12,'k_tx': 5, 'dist_tx': 5} +receptive_field = {'k_bd': 4, 'dist_bd': 20,'k_tx': 15, 'dist_tx': 1} + + segment( model, dm, save_dir=benchmarks_dir, - seg_tag='segger_embedding_1001_0.5_cc', + seg_tag='parquet_test_big', transcript_file=transcripts_file, - file_format='anndata', + # file_format='anndata', receptive_field = receptive_field, min_transcripts=5, + score_cut=0.5, # max_transcripts=1500, cell_id_col='segger_cell_id', use_cc=True, - knn_method='cuda' -) \ No newline at end of file + knn_method='cuda', + verbose=True + # client=client +) + +# if __name__ == "__main__": +# cluster = LocalCUDACluster( +# # CUDA_VISIBLE_DEVICES="0", +# n_workers=1, +# dashboard_address=":8080", +# memory_limit='30GB', # Adjust based on system memory +# lifetime="2 hours", # Increase worker lifetime +# lifetime_stagger="75 minutes", +# local_directory='.', # Stagger worker restarts +# lifetime_restart=True # Automatically restart workers +# ) +# client = Client(cluster) + +# segment( +# model, +# dm, +# save_dir=benchmarks_dir, +# seg_tag='segger_embedding_0926_mega_0.5_20', +# transcript_file=transcripts_file, +# file_format='anndata', +# receptive_field = receptive_field, +# min_transcripts=5, +# score_cut=0.5, +# # max_transcripts=1500, +# cell_id_col='segger_cell_id', +# use_cc=False, +# knn_method='cuda', +# # client=client +# ) + +# client.close() +# cluster.close() \ No newline at end of file diff --git a/scripts/sandbox.py b/scripts/sandbox.py new file mode 100644 index 0000000..8d4a82f --- /dev/null +++ b/scripts/sandbox.py @@ -0,0 +1,240 @@ +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +from pathlib import Path + +# Define method colors +method_colors = { + 'segger': '#D55E00', + # 'segger_n0': '#E69F00', + # 'segger_n1': '#F0E442', + # 'segger_embedding': '#C72228', + 'Baysor': '#000075', + # 'Baysor_n0': '#0F4A9C', + # 'Baysor_n1': '#0072B2', + '10X': '#8B008B', + '10X-nucleus': '#CC79A7', + # 'BIDCell': '#009E73' +} + +# Define the path to your figures and data +# figures_path = Path("/path/to/your/figures") # Update with the actual path +cell_counts_path = figures_path / 'cell_counts_data.csv' +cell_area_log2_path = figures_path / 
'cell_area_log2_data.csv' +mcer_box_path = figures_path / 'mcer_box.csv' +sensitivity_boxplot_data = figures_path / 'sensitivity_results.csv' + +# Load the data +cell_counts_data = pd.read_csv(cell_counts_path) +cell_area_log2_data = pd.read_csv(cell_area_log2_path) +sensitivity_boxplot_data = pd.read_csv(sensitivity_boxplot_data) + +# Cell counts barplot +cell_counts_data.rename(columns={'Unnamed: 0': 'Method'}, inplace=True) +cell_counts_data = cell_counts_data[~cell_counts_data['Method'].isin(['segger_n0', 'segger_n1'])] + + + + + + +sensitivity_results.csv + + +# Finally, the MCER plot + +mcer_methods_final = method_colors.keys() +mcer_data_filtered = mcer_box_data[mcer_box_data['Segmentation Method'].isin(mcer_methods_final)] + + +import seaborn as sns +import matplotlib.pyplot as plt + + + +sns.set_style("white") +sns.set_context("paper", font_scale=1.2) + + +mcer_methods_final = method_colors.keys() +cell_area_log2_data = cell_area_log2_data[cell_area_log2_data['Segmentation Method'].isin(mcer_methods_final)] + +# Create the boxplot with the size 4x6 inches and show only the outliers +plt.figure(figsize=(2, 4)) +sns.boxplot(data=cell_area_log2_data, + x='Segmentation Method', + y='Cell Area (log2)', + palette=method_colors, + showfliers=False, + showcaps=False, + flierprops={"marker": "x"}) # Hide the default outliers in the boxplot + +# Add a stripplot to show only the outliers +# sns.stripplot(data=mcer_data_filtered, +# x='Segmentation Method', +# y='MECR', +# jitter=False, # Avoid jittering to keep points in place +# dodge=True, # Keep points aligned with the boxplot categories +# marker="D", # Use diamond-shaped marker +# edgecolor='black', # Set black edge color for the points +# linewidth=1, # Define the thickness of the point borders +# color='black', # Set the color of the outlier points +# size=6) # Set the size of the outliers + +# Rotate the x-axis labels +plt.xticks(rotation=45, ha='right') + +# Setting solid black borders on all four sides +ax = plt.gca() +for spine in ax.spines.values(): + spine.set_visible(True) + spine.set_color('black') # Set border color to black + +# plt.ylim(0, 0.2) + +# Show horizontal grid lines +ax.yaxis.grid(False) +ax.xaxis.grid(False) + +# Adding the y-axis label with "Mutually Exclusive Co-expression Rate (MECR)" +plt.ylabel(r"Cell Area ($\mu m^2$)") + +# Remove the x-axis label +ax.set_xlabel("") + +# Create a clean layout for publication-level +plt.tight_layout() + +# Save the updated plot as both PNG and PDF +cell_areas_boxplot_pdf_path = figures_path / 'cell_areas.pdf' +cell_areas_boxplot_png_path = figures_path / 'cell_areas.png' + +plt.savefig(cell_areas_boxplot_pdf_path, format='pdf', bbox_inches='tight', dpi=300) +plt.savefig(cell_areas_boxplot_png_path, format='png', bbox_inches='tight', dpi=300) + +# Close the figure +plt.close() + + +mcer_methods_final = method_colors.keys() +sensitivity_boxplot_data = sensitivity_boxplot_data[sensitivity_boxplot_data['Segmentation Method'].isin(mcer_methods_final)] + + +plt.figure(figsize=(2.5, 4)) +sns.boxplot(data=sensitivity_boxplot_data, + x='Segmentation Method', + y='Sensitivity', + palette=method_colors, + showfliers=False, + showcaps=False, + flierprops={"marker": "x"}) # Hide the default outliers in the boxplot + +# Add a stripplot to show only the outliers +# sns.stripplot(data=mcer_data_filtered, +# x='Segmentation Method', +# y='MECR', +# jitter=False, # Avoid jittering to keep points in place +# dodge=True, # Keep points aligned with the boxplot categories +# marker="D", # Use 
diamond-shaped marker +# edgecolor='black', # Set black edge color for the points +# linewidth=1, # Define the thickness of the point borders +# color='black', # Set the color of the outlier points +# size=6) # Set the size of the outliers + +# Rotate the x-axis labels +plt.xticks(rotation=45, ha='right') + +# Setting solid black borders on all four sides +ax = plt.gca() +for spine in ax.spines.values(): + spine.set_visible(True) + spine.set_color('black') # Set border color to black + +# plt.ylim(0, 0.2) + +# Show horizontal grid lines +ax.yaxis.grid(False) +ax.xaxis.grid(False) + +# Adding the y-axis label with "Mutually Exclusive Co-expression Rate (MECR)" +plt.ylabel(r"Positive Marker Expression Rate (PMER)") + +# Remove the x-axis label +ax.set_xlabel("") + +# Create a clean layout for publication-level +plt.tight_layout() + +# Save the updated plot as both PNG and PDF +sensitivity_boxplot_data_boxplot_pdf_path = figures_path / 'sensitivity_boxplot_data.pdf' +sensitivity_boxplot_data_boxplot_png_path = figures_path / 'sensitivity_boxplot_data.png' + +plt.savefig(sensitivity_boxplot_data_boxplot_pdf_path, format='pdf', bbox_inches='tight', dpi=300) +plt.savefig(sensitivity_boxplot_data_boxplot_png_path, format='png', bbox_inches='tight', dpi=300) + +# Close the figure +plt.close() + + + + + + +sns.set_style("white") +sns.set_context("paper") + +# Create the boxplot with the size 4x6 inches and show only the outliers +plt.figure(figsize=(2.5, 4)) +sns.boxplot(data=mcer_data_filtered, + x='Segmentation Method', + y='MECR', + palette=method_colors, + showfliers=False, + showcaps=False, + flierprops={"marker": "x"}) # Hide the default outliers in the boxplot + +# Add a stripplot to show only the outliers +# sns.stripplot(data=mcer_data_filtered, +# x='Segmentation Method', +# y='MECR', +# jitter=False, # Avoid jittering to keep points in place +# dodge=True, # Keep points aligned with the boxplot categories +# marker="D", # Use diamond-shaped marker +# edgecolor='black', # Set black edge color for the points +# linewidth=1, # Define the thickness of the point borders +# color='black', # Set the color of the outlier points +# size=6) # Set the size of the outliers + +# Rotate the x-axis labels +plt.xticks(rotation=45, ha='right') + +# Setting solid black borders on all four sides +ax = plt.gca() +for spine in ax.spines.values(): + spine.set_visible(True) + spine.set_color('black') # Set border color to black + +plt.ylim(0, 0.2) + +# Show horizontal grid lines +ax.yaxis.grid(False) +ax.xaxis.grid(False) + +# Adding the y-axis label with "Mutually Exclusive Co-expression Rate (MECR)" +plt.ylabel("Mutually Exclusive Co-expression Rate") + +# Remove the x-axis label +ax.set_xlabel("") + +# Create a clean layout for publication-level +plt.tight_layout() + +# Save the updated plot as both PNG and PDF +final_mcer_boxplot_pdf_path = figures_path / 'mcer_boxplot_with_outliers.pdf' +final_mcer_boxplot_png_path = figures_path / 'mcer_boxplot_with_outliers.png' + +plt.savefig(final_mcer_boxplot_pdf_path, format='pdf', bbox_inches='tight', dpi=300) +plt.savefig(final_mcer_boxplot_png_path, format='png', bbox_inches='tight', dpi=300) + +# Close the figure +plt.close() \ No newline at end of file diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index cf73116..2844ce3 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -40,6 +40,10 @@ from cupyx.scipy.sparse import find # To find non-zero elements in sparse matrix from scipy.sparse.csgraph 
import connected_components as cc from scipy.sparse import coo_matrix as scipy_coo_matrix +from dask.distributed import get_client +from pqdm.processes import pqdm +from tqdm import tqdm + # Setup Dask cluster with 3 workers @@ -100,7 +104,8 @@ def get_similarity_scores( batch: Batch, from_type: str, to_type: str, - receptive_field: dict + receptive_field: dict, + knn_method: str = 'cuda' ) -> coo_matrix: """ Compute similarity scores between embeddings for 'from_type' and 'to_type' nodes @@ -111,35 +116,38 @@ def get_similarity_scores( batch (Batch): A batch of data containing input features and edge indices. from_type (str): The type of node from which the similarity is computed. to_type (str): The type of node to which the similarity is computed. + knn_method (str, optional): The method to use for nearest neighbors. Defaults to 'cuda'. Returns: coo_matrix: A sparse matrix containing the similarity scores between 'from_type' and 'to_type' nodes. """ - # Step 1: Get embeddings from the model - batch = batch.to("cuda") + + # Keep everything on GPU until final results + batch = batch.to('cuda') + + # Step 1: Get embeddings from the model (on GPU) shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] + + # Compute edge indices using knn method (still on GPU) edge_index = get_edge_index( batch[to_type].pos[:, :2], # 'tx' positions batch[from_type].pos[:, :2], # 'bd' positions k=receptive_field[f'k_{to_type}'], dist=receptive_field[f'dist_{to_type}'], - method='cuda' + method=knn_method ) + + # Convert to dense adjacency matrix (on GPU) edge_index = coo_to_dense_adj( - edge_index.T, - num_nodes=shape[0], - num_nbrs=receptive_field[f'k_{to_type}'], + edge_index.T, + num_nodes=shape[0], + num_nbrs=receptive_field[f'k_{to_type}'] ) with torch.no_grad(): embeddings = model(batch.x_dict, batch.edge_index_dict) - - del batch - - # print(edge_index) - # print(embeddings) - + def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: m = torch.nn.ZeroPad2d((0, 0, 0, 1)) # pad bottom with zeros @@ -164,14 +172,9 @@ def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: return sparse_result # Free GPU memory after computation - # Call the sparse multiply function sparse_similarity = sparse_multiply(embeddings, edge_index, shape) - gc.collect() - cp.cuda.Stream.null.synchronize() - cp.get_default_memory_pool().free_all_blocks() - torch.cuda.empty_cache() - # No need to convert to PyTorch tensor; return the CuPy sparse matrix + return sparse_similarity @@ -188,7 +191,8 @@ def predict_batch( """ Predict cell assignments for a batch of transcript data using a segmentation model. Adds a 'bound' column to indicate if the transcript is assigned to a cell (bound=1) - or unassigned (bound=0). + or unassigned (bound=0). Unassigned transcripts are handled with connected components + if use_cc is True. Args: lit_segger (torch.nn.Module): The lightning module wrapping the segmentation model. 
@@ -214,17 +218,18 @@ def _get_id(): batch = batch.to("cuda") # Extract transcript IDs and initialize assignments DataFrame - transcript_id = cp.asnumpy(batch['tx'].id) + transcript_id = batch['tx'].id.cpu() assignments = pd.DataFrame({'transcript_id': transcript_id}) if len(batch['bd'].pos) >= 10: # Compute similarity scores between 'tx' and 'bd' - scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd", receptive_field) + scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd", receptive_field, knn_method=knn_method) torch.cuda.empty_cache() # Convert sparse matrix to dense format dense_scores = scores.toarray() # Convert to dense NumPy array del scores # Remove from memory cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory + # Get direct assignments from similarity matrix belongs = cp.max(dense_scores, axis=1) # Max score per transcript assignments['score'] = cp.asnumpy(belongs) # Move back to CPU @@ -233,75 +238,114 @@ def _get_id(): all_ids = np.concatenate(batch['bd'].id) # Keep IDs as NumPy array assignments['segger_cell_id'] = None # Initialize as None max_indices = cp.argmax(dense_scores, axis=1).get() - assignments['segger_cell_id'][mask] = all_ids[max_indices[mask]] # Assign IDs - + assignments.loc[mask, 'segger_cell_id'] = all_ids[max_indices[mask]] + del dense_scores # Remove from memory cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory torch.cuda.empty_cache() -# Move back to CPU + assignments['bound'] = 0 - assignments['bound'][mask] = 1 - - + assignments.loc[mask, 'bound'] = 1 + + + # Handle unassigned transcripts with connected components if use_cc: # Compute similarity scores between 'tx' and 'tx' - scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx", receptive_field) - # Convert to dense NumPy array + scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx", receptive_field, knn_method=knn_method) + + # Convert to dense NumPy array data_cpu = scores_tx.data.get() # Transfer data to CPU (NumPy) row_cpu = scores_tx.row.get() # Transfer row indices to CPU (NumPy) col_cpu = scores_tx.col.get() # Transfer column indices to CPU (NumPy) - + # Remove from memory # dense_scores_tx = scores_tx.toarray().astype(cp.float16) # Rebuild the matrix on CPU using SciPy dense_scores_tx = scipy_coo_matrix((data_cpu, (row_cpu, col_cpu)), shape=scores_tx.shape).toarray() - + del scores_tx np.fill_diagonal(dense_scores_tx, 0) # Ignore self-similarity - - del scores_tx # Remove from memory - cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory - # Assign unassigned transcripts using connected components - no_id = assignments['segger_cell_id'].isna() + # Get the indices of unassigned transcripts + no_id = assignments['segger_cell_id'].isna().values # Convert to numpy for indexing + cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory + torch.cuda.empty_cache() if np.any(no_id): # Only compute if there are unassigned transcripts + # Transfer the relevant parts of the sparse matrix (unassigned transcripts) no_id_scores = dense_scores_tx[no_id][:, no_id] - del dense_scores_tx # Remove from memory - no_id_scores[no_id_scores < score_cut] = 0 - n, comps = cc(no_id_scores, connection="weak", directed=False) - new_ids = np.array([_get_id() for _ in range(n)]) - assignments['segger_cell_id'][no_id] = new_ids[comps] - # Perform memory cleanup to avoid OOM issues - cp.get_default_memory_pool().free_all_blocks() - torch.cuda.empty_cache() + # Apply score cut-off + # no_id_scores = 
no_id_scores.toarray() # Move to dense temporarily + no_id_scores[no_id_scores < score_cut] = 0 # Apply threshold + no_id_scores = scipy_coo_matrix(no_id_scores) # Convert back to sparse + + # Find the non-zero entries in the no_id_scores to construct edge_index + non_zero_rows, non_zero_cols = no_id_scores.nonzero() + + # Map these indices back to the actual transcript IDs (no_id_mask gives us their original position) + unassigned_ids = batch['tx'].id[no_id] # Unassigned transcript IDs + + # Construct edge index (source, target) based on non-zero connections in the no_id_scores matrix + source_nodes = unassigned_ids[non_zero_rows].cpu() + target_nodes = unassigned_ids[non_zero_cols].cpu() + + # Convert to Dask array for later concatenation + edge_index = dd.from_array(np.stack([source_nodes, target_nodes], axis=0)) + + # Append this batch's edges to a global list to be used later in segment + global_edge_index_list.append(edge_index) + + # Free memory after computation + del no_id_scores + cp.get_default_memory_pool().free_all_blocks() + torch.cuda.empty_cache() return assignments + + def predict( lit_segger: LitSegger, data_loader: DataLoader, score_cut: float, receptive_field: dict, use_cc: bool = True, - knn_method: str = 'cuda' -) -> pd.DataFrame: # Change return type to Dask DataFrame if applicable + knn_method: str = 'cuda', + n_jobs: int = 2 # Number of parallel jobs +) -> pd.DataFrame: """ - Optimized prediction for multiple batches of transcript data. + Optimized prediction for multiple batches of transcript data, using parallel execution + and displaying a progress bar. + + Args: + lit_segger (LitSegger): The trained segmentation model. + data_loader (DataLoader): PyTorch DataLoader to iterate over transcript data batches. + score_cut (float): Threshold for assigning transcripts to cells based on similarity scores. + receptive_field (dict): Defines the receptive field for transcript-cell and transcript-transcript relations. + use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. Defaults to True. + knn_method (str, optional): Method to use for nearest neighbors ('cuda', 'kd_tree', etc.). Defaults to 'cuda'. + n_jobs (int, optional): Number of parallel jobs to run. Defaults to 4. + + Returns: + pd.DataFrame: Final DataFrame containing the transcript IDs, similarity scores, and assigned cell IDs. 
""" + all_assignments = [] + # Use Dask delayed to parallelize the predict_batch execution and create assignments + delayed_batches = [delayed(predict_batch)(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) + for batch in data_loader] - for batch in data_loader: - assignments = predict_batch(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) - all_assignments.append(dd.from_pandas(assignments, npartitions=1)) - - cp.get_default_memory_pool().free_all_blocks() - torch.cuda.empty_cache() + # Compute all tasks in parallel with progress bar + with ProgressBar(): + all_assignments = delayed(delayed_batches).compute() + + # Convert the results to Dask DataFrames and combine them + all_assignments_dd = [dd.from_pandas(assignments, npartitions=1) for assignments in all_assignments] # Concatenate all assignments into a single Dask DataFrame - final_assignments = dd.concat(all_assignments, ignore_index=True) + final_assignments = dd.concat(all_assignments_dd, ignore_index=True) # Sort the Dask DataFrame by 'transcript_id' before setting it as an index final_assignments = final_assignments.sort_values(by='transcript_id') @@ -318,23 +362,13 @@ def predict( # Now use the computed final_idx for indexing result = final_assignments.loc[final_idx].compute().reset_index(names=['transcript_id']) - - # result = results.reset_index() - # Handle cases where there's only one entry per 'segger_cell_id' - # single_entry_mask = result.groupby('segger_cell_id').size() == 1 -# Handle cases where there's only one entry per 'segger_cell_id' - # single_entry_counts = result['segger_cell_id'].value_counts() # Count occurrences of each ID - # single_entry_mask = single_entry_counts[single_entry_counts == 1].index # Get IDs with a count of 1 - - # # Update 'segger_cell_id' for single entries - # for segger_id in single_entry_mask: - # result.loc[result['segger_cell_id'] == segger_id, 'segger_cell_id'] = 'floating' + return result - return result + def segment( model: LitSegger, dm: SeggerDataModule, @@ -350,35 +384,41 @@ def segment( **anndata_kwargs ) -> None: """ - Perform segmentation using the model, merge segmentation results with transcripts_df, and save in the specified format. - - Parameters: - ---------- - model : LitSegger - The trained segmentation model. - dm : SeggerDataModule - The SeggerDataModule instance for data loading. - save_dir : Union[str, Path] - Directory to save the final segmentation results. - seg_tag : str - Tag to include in the saved filename. - transcript_file : Union[str, Path] - Path to the transcripts parquet file. - file_format : str, optional - File format to save the results ('csv', 'parquet', or 'anndata'). Defaults to 'anndata'. - score_cut : float, optional - The threshold for assigning transcripts to cells based on similarity scores. - use_cc : bool, optional - If to further re-group transcripts that have not been assigned to any nucleus. - knn_method : str, optional - The method to use for nearest neighbors ('cuda' by default). - **anndata_kwargs : dict, optional - Additional keyword arguments passed to the create_anndata function. + Perform segmentation using the model, merge segmentation results with transcripts data, + and save the results in the specified format. The function also handles unassigned + transcripts through connected components analysis if `use_cc` is True. + + Args: + model (LitSegger): The trained segmentation model. + dm (SeggerDataModule): The SeggerDataModule instance for data loading. 
+ save_dir (Union[str, Path]): Directory to save the final segmentation results. + seg_tag (str): Tag to include in the saved filename. + transcript_file (Union[str, Path]): Path to the transcripts Parquet file. + score_cut (float, optional): The threshold for assigning transcripts to cells based on + similarity scores. Defaults to 0.5. + use_cc (bool, optional): If True, perform connected components analysis for unassigned + transcripts. Defaults to True. + file_format (str, optional): The file format to save the results ('csv', 'parquet', or 'anndata'). + Defaults to 'anndata'. + receptive_field (dict, optional): Defines the receptive field for transcript-cell and + transcript-transcript relations. Defaults to + {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}. + knn_method (str, optional): The method to use for nearest neighbors ('cuda' or 'kd_tree'). + Defaults to 'kd_tree'. + verbose (bool, optional): Whether to print verbose status updates. Defaults to False. + **anndata_kwargs: Additional keyword arguments passed to the `create_anndata` function. Returns: None """ + + # Initialize global_edge_index_list to store edge indices across batches + global global_edge_index_list + global_edge_index_list = [] + + # Start the timer start_time = time.time() + # Ensure the save directory exists save_dir = Path(save_dir) save_dir.mkdir(parents=True, exist_ok=True) @@ -389,10 +429,12 @@ def segment( # Step 1: Prediction step_start_time = time.time() + # Get dataloaders for training, validation, and test train_dataloader = dm.train_dataloader() test_dataloader = dm.test_dataloader() val_dataloader = dm.val_dataloader() - + + # Perform predictions for each dataset segmentation_train = predict(model, train_dataloader, score_cut, receptive_field, use_cc, knn_method) torch.cuda.empty_cache() cp.get_default_memory_pool().free_all_blocks() @@ -402,7 +444,7 @@ def segment( torch.cuda.empty_cache() cp.get_default_memory_pool().free_all_blocks() gc.collect() - + segmentation_test = predict(model, test_dataloader, score_cut, receptive_field, use_cc, knn_method) torch.cuda.empty_cache() cp.get_default_memory_pool().free_all_blocks() @@ -418,9 +460,6 @@ def segment( # Combine the segmentation data seg_combined = pd.concat([segmentation_train, segmentation_val, segmentation_test], ignore_index=True) - # seg_combined = segmentation_test - # print(seg_combined.columns) - # print(transcripts_df.id) # Drop any unassigned rows seg_final = seg_combined.dropna(subset=['segger_cell_id']).reset_index(drop=True) @@ -439,16 +478,51 @@ def segment( # Convert the segmentation results to a Dask DataFrame, keeping npartitions consistent seg_final_dd = dd.from_pandas(seg_final, npartitions=transcripts_df.npartitions) - # Merge the segmentation results with the transcript data (still as Dask DataFrame) - transcripts_df_filtered = transcripts_df.merge(seg_final_dd, on='transcript_id', how='inner') + # Outer merge to include all transcripts, even those without assigned cell ids + transcripts_df_filtered = transcripts_df.merge(seg_final_dd, on='transcript_id', how='outer') if verbose: elapsed_time = format_time(time.time() - step_start_time) print(f"Transcripts merged in {elapsed_time}.") - # Step 4: Save the merged result + # Step 4: Handle unassigned transcripts using connected components (if use_cc is True) + if use_cc: + if verbose: + print(f"Computing connected components for unassigned transcripts...") + + # Concatenate all edge indices stored across batches + concatenated_edge_index = dd.concat(global_edge_index_list) + 
+ # Compute connected components using the concatenated edge_index + edge_index_computed = concatenated_edge_index.compute() # Get the full edge_index (source, target) + + # Map transcript_ids to their index positions in the DataFrame + transcript_idx_map = pd.Series(transcripts_df_filtered.index, index=transcripts_df_filtered['transcript_id']).to_dict() + + # Convert the transcript_ids in edge_index_computed to positional indices + source_indices = [transcript_idx_map[tid] for tid in edge_index_computed[0]] + target_indices = [transcript_idx_map[tid] for tid in edge_index_computed[1]] + + # Use SciPy's connected components algorithm + n, comps = cc(scipy_coo_matrix((np.ones(len(source_indices)), + (source_indices, target_indices)), + shape=(transcripts_df_filtered.shape[0], transcripts_df_filtered.shape[0])), + connection="weak", directed=False) + + # Generate new cell IDs based on connected components + new_ids = np.array([_get_id() for _ in range(n)]) + + # Assign new cell IDs to the unassigned transcripts in the final assignments + unassigned_mask = transcripts_df_filtered['segger_cell_id'].isna() + transcripts_df_filtered.loc[unassigned_mask, 'segger_cell_id'] = new_ids[comps] + + if verbose: + elapsed_time = format_time(time.time() - step_start_time) + print(f"Connected components computed in {elapsed_time}.") + + # Step 5: Save the merged result step_start_time = time.time() - + if verbose: print(f"Saving results in {file_format} format...") @@ -464,7 +538,6 @@ def segment( segger_adata.write(save_path) else: raise ValueError(f"Unsupported file format: {file_format}") - # raise ValueError(f"Unsupported file format: {file_format}") if verbose: elapsed_time = format_time(time.time() - step_start_time) @@ -475,284 +548,6 @@ def segment( total_time = format_time(time.time() - start_time) print(f"Total segmentation process completed in {total_time}.") - # Step 5: Garbage collection and memory cleanup + # Step 6: Garbage collection and memory cleanup torch.cuda.empty_cache() gc.collect() - - - - - -# def predict( -# lit_segger: LitSegger, -# data_loader: DataLoader, -# score_cut: float, -# receptive_field: dict, -# use_cc: bool = True, -# knn_method: str = 'cuda' -# ) -> dd.DataFrame: -# """ -# Optimized prediction for multiple batches of transcript data using Dask and delayed processing with progress bar. - -# Args: -# lit_segger (LitSegger): The lightning module wrapping the segmentation model. -# data_loader (DataLoader): A data loader providing batches of transcript and cell data. -# score_cut (float): The threshold for assigning transcripts to cells based on similarity scores. -# receptive_field (dict): Dictionary defining the receptive field for transcript-cell and transcript-transcript relations. -# use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. Defaults to True. -# knn_method (str, optional): The method to use for nearest neighbors ('cuda' by default). Defaults to 'cuda'. - -# Returns: -# dd.DataFrame: A Dask DataFrame containing the transcript IDs, similarity scores, and assigned cell IDs. 
-# """ - - -# if len(data_loader) == 0: -# return None - -# # Create a meta DataFrame for the Dask DataFrame -# meta = pd.DataFrame({ -# 'transcript_id': pd.Series(dtype='int64'), -# 'score': pd.Series(dtype='float32'), -# 'segger_cell_id': pd.Series(dtype='object'), -# 'bound': pd.Series(dtype='int64') -# }) - -# # Convert the entire data loader to delayed predictions -# delayed_assignments = [ -# delayed(predict_batch)(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) -# for batch in data_loader -# ] - -# # Build the Dask DataFrame from the delayed assignments -# assignments_dd = dd.from_delayed(delayed_assignments, meta=meta) - -# # Max score selection logic, with fallback to unbound scores if no bound=1 -# def select_max_score_partition(df): -# max_bound_idx = df[df['bound'] == 1].groupby('transcript_id')['score'].idxmax() -# max_unbound_idx = df[df['bound'] == 0].groupby('transcript_id')['score'].idxmax() - -# # Combine indices, prioritizing bound=1 scores -# final_idx = max_bound_idx.combine_first(max_unbound_idx) -# result = df.loc[final_idx].reset_index(drop=True) - -# # Handle cases where there's only one entry per 'segger_cell_id' -# single_entry_mask = result.groupby('segger_cell_id').size() == 1 -# result.loc[single_entry_mask, 'segger_cell_id'] = 'floating' - -# return result - -# # Map the logic over each partition using Dask -# final_assignments = assignments_dd.map_partitions(select_max_score_partition, meta=meta) - -# # Trigger garbage collection and free GPU memory -# torch.cuda.empty_cache() -# gc.collect() - -# final_assignments = final_assignments.compute() - - - -# return final_assignments - - - - -# # def predict( -# # lit_segger: LitSegger, -# # data_loader: DataLoader, -# # score_cut: float, -# # receptive_field: dict, -# # use_cc: bool = True, -# # knn_method: str = 'cuda' -# # ) -> dd.DataFrame: -# # """ -# # Optimized prediction for multiple batches of transcript data using Dask and delayed processing with progress bar. - -# # Args: -# # lit_segger (LitSegger): The lightning module wrapping the segmentation model. -# # data_loader (DataLoader): A data loader providing batches of transcript and cell data. -# # score_cut (float): The threshold for assigning transcripts to cells based on similarity scores. -# # receptive_field (dict): Dictionary defining the receptive field for transcript-cell and transcript-transcript relations. -# # use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. Defaults to True. -# # knn_method (str, optional): The method to use for nearest neighbors ('cuda' by default). Defaults to 'cuda'. - -# # Returns: -# # dd.DataFrame: A Dask DataFrame containing the transcript IDs, similarity scores, and assigned cell IDs. 
-# # """ -# # if len(data_loader) == 0: -# # return None - -# # # Create a meta DataFrame for the Dask DataFrame -# # meta = pd.DataFrame({ -# # 'transcript_id': pd.Series(dtype='int64'), -# # 'score': pd.Series(dtype='float32'), -# # 'segger_cell_id': pd.Series(dtype='object'), -# # 'bound': pd.Series(dtype='int64') -# # }) - -# # # Convert the entire data loader to delayed predictions -# # delayed_assignments = [ -# # delayed(predict_batch)(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) -# # for batch in data_loader -# # ] - -# # # Build the Dask DataFrame from the delayed assignments -# # assignments_dd = dd.from_delayed(delayed_assignments, meta=meta) - -# # # Max score selection logic, with fallback to unbound scores if no bound=1 -# # def select_max_score_partition(df): -# # max_bound_idx = df[df['bound'] == 1].groupby('transcript_id')['score'].idxmax() -# # max_unbound_idx = df[df['bound'] == 0].groupby('transcript_id')['score'].idxmax() - -# # # Combine indices, prioritizing bound=1 scores -# # final_idx = max_bound_idx.combine_first(max_unbound_idx) -# # result = df.loc[final_idx].reset_index(drop=True) - -# # # Handle cases where there's only one entry per 'segger_cell_id' -# # single_entry_mask = result.groupby('segger_cell_id').size() == 1 -# # result.loc[single_entry_mask, 'segger_cell_id'] = 'floating' - -# # return result - -# # # Map the logic over each partition using Dask -# # final_assignments = assignments_dd.map_partitions(select_max_score_partition, meta=meta) - -# # # Trigger garbage collection and free GPU memory -# # # rmm.reinitialize(pool_allocator=True) -# # torch.cuda.empty_cache() -# # gc.collect() - -# # return final_assignments - - -# def segment( -# model: LitSegger, -# dm: SeggerDataModule, -# save_dir: Union[str, Path], -# seg_tag: str, -# transcript_file: Union[str, Path], -# score_cut: float = .25, -# use_cc: bool = True, -# file_format: str = 'anndata', -# receptive_field: dict = {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}, -# knn_method: str = 'kd_tree', -# verbose: bool = False, -# **anndata_kwargs -# ) -> None: -# """ -# Perform segmentation using the model, merge segmentation results with transcripts_df, -# and save in the specified format. Memory is managed efficiently using Dask and GPU -# memory optimizations. - -# Args: -# model (LitSegger): The trained segmentation model. -# dm (SeggerDataModule): The SeggerDataModule instance for data loading. -# save_dir (Union[str, Path]): Directory to save the final segmentation results. -# seg_tag (str): Tag to include in the saved filename. -# transcript_file (Union[str, Path]): Path to the transcripts parquet file. -# score_cut (float, optional): The threshold for assigning transcripts to cells based on similarity scores. Defaults to 0.25. -# use_cc (bool, optional): If True, re-group transcripts that have not been assigned to any nucleus. Defaults to True. -# file_format (str, optional): File format to save the results ('csv', 'parquet', or 'anndata'). Defaults to 'anndata'. -# receptive_field (dict, optional): Defines the receptive field for transcript-cell and transcript-transcript relations. -# knn_method (str, optional): The method to use for nearest neighbors ('kd_tree' by default). -# **anndata_kwargs: Additional keyword arguments passed to the create_anndata function. 
- -# Returns: -# None -# """ -# start_time = time.time() -# # rmm.reinitialize(pool_allocator=True, initial_pool_size=2**26, maximum_pool_size=2**30) -# # cp.cuda.set_allocator(rmm_cupy_allocator) - -# # Ensure the save directory exists -# save_dir = Path(save_dir) -# save_dir.mkdir(parents=True, exist_ok=True) - -# if verbose: -# print(f"Starting segmentation for {seg_tag}...") - -# # Step 1: Prediction -# step_start_time = time.time() - -# train_dataloader = dm.train_dataloader() -# test_dataloader = dm.test_dataloader() -# val_dataloader = dm.val_dataloader() - -# # delayed_train = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) -# # delayed_val = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) -# delayed_test = predict(model, test_dataloader, score_cut=score_cut, receptive_field=receptive_field, use_cc=use_cc, knn_method=knn_method) - -# delayed_test = delayed_test.compute() -# # Compute all predictions at once using Dask -# # with ProgressBar(): -# # segmentation_train, segmentation_val, segmentation_test = dask.compute(delayed_train, delayed_val, delayed_test) - -# if verbose: -# elapsed_time = format_time(time.time() - step_start_time) -# print(f"Predictions completed in {elapsed_time}.") - -# # Step 2: Combine and group by transcript_id -# step_start_time = time.time() - -# # Combine the segmentation data -# seg_combined = dd.concat([segmentation_train, segmentation_val, segmentation_test]) - -# # No need to handle max score logic here, as it's done inside the `predict` function -# seg_final = seg_combined.compute() - -# # Drop any unassigned rows -# seg_final = seg_final.dropna(subset=['segger_cell_id']).reset_index(drop=True) - -# if verbose: -# elapsed_time = format_time(time.time() - step_start_time) -# print(f"Segmentation results processed in {elapsed_time}.") - -# # Step 3: Load transcripts and merge -# step_start_time = time.time() - -# transcripts_df = dd.read_parquet(transcript_file) - -# if verbose: -# print("Merging segmentation results with transcripts...") - -# # Merge the segmentation results with the transcript data -# seg_final_dd = dd.from_pandas(seg_final, npartitions=transcripts_df.npartitions) -# transcripts_df_filtered = transcripts_df.merge(seg_final_dd, on='transcript_id', how='inner').compute() - -# if verbose: -# elapsed_time = format_time(time.time() - step_start_time) -# print(f"Transcripts merged in {elapsed_time}.") - -# # Step 4: Save the merged result -# step_start_time = time.time() - -# if verbose: -# print(f"Saving results in {file_format} format...") - -# if file_format == 'csv': -# save_path = save_dir / f'{seg_tag}_segmentation.csv' -# transcripts_df_filtered.to_csv(save_path, index=False) -# elif file_format == 'parquet': -# save_path = save_dir / f'{seg_tag}_segmentation.parquet' -# transcripts_df_filtered.to_parquet(save_path, index=False) -# elif file_format == 'anndata': -# save_path = save_dir / f'{seg_tag}_segmentation.h5ad' -# segger_adata = create_anndata(transcripts_df_filtered, **anndata_kwargs) -# segger_adata.write(save_path) -# else: -# raise ValueError(f"Unsupported file format: {file_format}") - -# if verbose: -# elapsed_time = format_time(time.time() - step_start_time) -# print(f"Results saved in {elapsed_time} at {save_path}.") - -# # Total time -# if verbose: -# total_time = format_time(time.time() - start_time) -# print(f"Total segmentation process completed in 
{total_time}.") - -# # Step 5: Garbage collection and memory cleanup -# # rmm.reinitialize(pool_allocator=True) -# # torch.cuda.empty_cache() -# gc.collect() diff --git a/src/segger/prediction/predict_parquet.py b/src/segger/prediction/predict_parquet.py new file mode 100644 index 0000000..8d607ad --- /dev/null +++ b/src/segger/prediction/predict_parquet.py @@ -0,0 +1,680 @@ +import os +import torch +import cupy as cp +import pandas as pd +import numpy as np +import torch.nn.functional as F +import torch._dynamo +import gc +import rmm +import re +import glob +from pathlib import Path +from torch_geometric.loader import DataLoader +from torch_geometric.data import Batch +from segger.data.utils import ( + get_edge_index_cuda, + get_edge_index, + format_time, + create_anndata, + coo_to_dense_adj, +) +from segger.training.train import LitSegger +from segger.training.segger_data_module import SeggerDataModule +from scipy.sparse.csgraph import connected_components as cc +from typing import Union, Dict +import dask.dataframe as dd +from dask import delayed +from dask.diagnostics import ProgressBar +import time +import dask +from rmm.allocators.cupy import rmm_cupy_allocator +from cupyx.scipy.sparse import coo_matrix +from torch.utils.dlpack import to_dlpack, from_dlpack + +from dask.distributed import Client, LocalCluster +import cupy as cp +import numpy as np +import pandas as pd +from cupyx.scipy.sparse import coo_matrix +from cupyx.scipy.sparse import find # To find non-zero elements in sparse matrix +from scipy.sparse.csgraph import connected_components as cc +from scipy.sparse import coo_matrix as scipy_coo_matrix +from dask.distributed import get_client +from pqdm.processes import pqdm +from tqdm import tqdm +import json +from datetime import datetime +import dask_geopandas as dgpd # Assuming dask-geopandas is installed +import cudf +import dask_cudf +import cupy as cp +import cupyx +import warnings +import shutil +from time import time +from cupyx.scipy.sparse import coo_matrix as cp_coo_matrix +from cupyx.scipy.sparse.csgraph import connected_components as cp_cc +# Setup Dask cluster with 3 workers + + + +# CONFIG +torch._dynamo.config.suppress_errors = True +os.environ["PYTORCH_USE_CUDA_DSA"] = "1" +os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + + +# Function to zero out diagonal of sparse COO matrix +def zero_out_diagonal_gpu(sparse_matrix): + """ + Zero out the diagonal elements of a sparse CuPy COO matrix while keeping it sparse on the GPU. + + Args: + sparse_matrix (cupyx.scipy.sparse.coo_matrix): Input sparse matrix. + + Returns: + cupyx.scipy.sparse.coo_matrix: Matrix with diagonal elements zeroed out. + """ + # Filter out the diagonal (where row == col) + non_diagonal_mask = sparse_matrix.row != sparse_matrix.col + + # Create a new sparse matrix without diagonal elements + sparse_matrix_no_diag = cupyx.scipy.sparse.coo_matrix( + (sparse_matrix.data[non_diagonal_mask], + (sparse_matrix.row[non_diagonal_mask], sparse_matrix.col[non_diagonal_mask])), + shape=sparse_matrix.shape + ) + + return sparse_matrix_no_diag + + +# Function to subset rows and columns of a sparse matrix +def subset_sparse_matrix(sparse_matrix, row_idx, col_idx): + """ + Subsets a COO sparse matrix by selecting rows and columns corresponding to given indices. + + Args: + sparse_matrix (cupyx.scipy.sparse.coo_matrix): The input sparse matrix. + row_idx (cupy.ndarray): The row indices to subset. + col_idx (cupy.ndarray): The column indices to subset. + + Returns: + cupyx.scipy.sparse.coo_matrix: Subset sparse matrix. 
+ """ + # Filter out the elements where both the row and column match the given indices + row_mask = cp.isin(sparse_matrix.row, row_idx) + col_mask = cp.isin(sparse_matrix.col, col_idx) + combined_mask = row_mask & col_mask + + # Create the subset sparse matrix + new_data = sparse_matrix.data[combined_mask] + new_row = sparse_matrix.row[combined_mask] + new_col = sparse_matrix.col[combined_mask] + + # Map the new row and col indices to the range of the subset + row_map = cp.searchsorted(row_idx, new_row) + col_map = cp.searchsorted(col_idx, new_col) + + # Return the new subset sparse matrix + return coo_matrix((new_data, (row_map, col_map)), shape=(len(row_idx), len(col_idx))) + + + + +def load_model(checkpoint_path: str) -> LitSegger: + """ + Load a LitSegger model from a checkpoint. + + Parameters + ---------- + checkpoint_path : str + Specific checkpoint file to load, or directory where the model checkpoints are stored. + If directory, the latest checkpoint is loaded. + + Returns + ------- + LitSegger + The loaded LitSegger model. + + Raises + ------ + FileNotFoundError + If the specified checkpoint file does not exist. + """ + checkpoint_path = Path(checkpoint_path) + msg = f"No checkpoint found at {checkpoint_path}. Please make sure you've provided the correct path." + + # Get last checkpoint if directory is provided + if os.path.isdir(checkpoint_path): + checkpoints = glob.glob(str(checkpoint_path / '*.ckpt')) + if len(checkpoints) == 0: + raise FileNotFoundError(msg) + # Sort checkpoints by epoch and step + def sort_order(c): + match = re.match(r'.*epoch=(\d+)-step=(\d+).ckpt', c) + return int(match[1]), int(match[2]) + checkpoint_path = Path(sorted(checkpoints, key=sort_order)[-1]) + elif not checkpoint_path.exists(): + raise FileExistsError(msg) + + # Load model from checkpoint + lit_segger = LitSegger.load_from_checkpoint( + checkpoint_path=checkpoint_path, + ) + + return lit_segger + + + +def get_similarity_scores( + model: torch.nn.Module, + batch: Batch, + from_type: str, + to_type: str, + receptive_field: dict, + knn_method: str = 'cuda' +) -> coo_matrix: + """ + Compute similarity scores between embeddings for 'from_type' and 'to_type' nodes + using sparse matrix multiplication with CuPy and the 'sees' edge relation. + + Args: + model (torch.nn.Module): The segmentation model used to generate embeddings. + batch (Batch): A batch of data containing input features and edge indices. + from_type (str): The type of node from which the similarity is computed. + to_type (str): The type of node to which the similarity is computed. + knn_method (str, optional): The method to use for nearest neighbors. Defaults to 'cuda'. + + Returns: + coo_matrix: A sparse matrix containing the similarity scores between + 'from_type' and 'to_type' nodes. 
+ """ + + # Keep everything on GPU until final results + batch = batch.to('cuda') + + # Step 1: Get embeddings from the model (on GPU) + shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] + + # Compute edge indices using knn method (still on GPU) + edge_index = get_edge_index( + batch[to_type].pos[:, :2], # 'tx' positions + batch[from_type].pos[:, :2], # 'bd' positions + k=receptive_field[f'k_{to_type}'], + dist=receptive_field[f'dist_{to_type}'], + method=knn_method + ) + + # Convert to dense adjacency matrix (on GPU) + edge_index = coo_to_dense_adj( + edge_index.T, + num_nodes=shape[0], + num_nbrs=receptive_field[f'k_{to_type}'] + ) + + with torch.no_grad(): + embeddings = model(batch.x_dict, batch.edge_index_dict) + + def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: + m = torch.nn.ZeroPad2d((0, 0, 0, 1)) # pad bottom with zeros + + similarity = torch.bmm( + m(embeddings[to_type])[edge_index], # 'to' x 'from' neighbors x embed + embeddings[from_type].unsqueeze(-1) # 'to' x embed x 1 + ) # -> 'to' x 'from' neighbors x 1 + del embeddings + # Sigmoid to get most similar 'to_type' neighbor + similarity[similarity == 0] = -torch.inf # ensure zero stays zero + similarity = F.sigmoid(similarity) + # Neighbor-filtered similarity scores + # shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] + indices = torch.argwhere(edge_index != -1).T + indices[1] = edge_index[edge_index != -1] + rows = cp.fromDlpack(to_dlpack(indices[0,:].to('cuda'))) + columns = cp.fromDlpack(to_dlpack(indices[1,:].to('cuda'))) + # print(rows) + del indices + values = similarity[edge_index != -1].flatten() + sparse_result = coo_matrix((cp.fromDlpack(to_dlpack(values)), (rows, columns)), shape=shape) + return sparse_result + # Free GPU memory after computation + + # Call the sparse multiply function + sparse_similarity = sparse_multiply(embeddings, edge_index, shape) + + return sparse_similarity + + + + + +def predict_batch( + lit_segger: torch.nn.Module, + batch: Batch, + score_cut: float, + receptive_field: Dict[str, float], + use_cc: bool = True, + knn_method: str = 'cuda', + output_ddf: dask_cudf.DataFrame = None, + edge_index_save_path: Union[str, Path] = None, + output_ddf_save_path: Union[str, Path] = None +) -> dask_cudf.DataFrame: + """ + Predict cell assignments for a batch of transcript data using a segmentation model. + Writes both the assignments and edge_index directly into Parquet files incrementally. + + Args: + lit_segger (torch.nn.Module): The lightning module wrapping the segmentation model. + batch (Batch): A batch of transcript and cell data. + score_cut (float): The threshold for assigning transcripts to cells based on similarity scores. + receptive_field (Dict[str, float]): Dictionary defining the receptive field for transcript-cell + and transcript-transcript relations. + use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. + Defaults to True. + knn_method (str, optional): The method to use for nearest neighbors. Defaults to 'cuda'. + output_ddf (dask_cudf.DataFrame, optional): Dask-CuDF DataFrame to accumulate and store transcript assignments. + edge_index_save_path (str, optional): Path to the Parquet file where edge indices are saved incrementally. + output_ddf_save_path (str, optional): Path to the Parquet file where transcript assignments (`output_ddf`) + are saved incrementally. + + Returns: + dask_cudf.DataFrame: Updated Dask-CuDF DataFrame for assignments. 
+ """ + + def _get_id(): + """Generate a random Xenium-style ID.""" + return ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), 8)) + '-nx' + + with cp.cuda.Device(0): + # Move batch to GPU + batch = batch.to("cuda") + + # Extract transcript IDs and initialize a dictionary for assignments + transcript_id = batch['tx'].id.cpu().numpy().astype('str') + assignments = {'transcript_id': transcript_id} + + if len(batch['bd'].pos) >= 10: + # Step 1: Compute similarity scores between 'tx' (transcripts) and 'bd' (boundaries) + scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd", receptive_field, knn_method=knn_method) + torch.cuda.empty_cache() + + # Convert sparse matrix to dense format (on GPU) + dense_scores = scores.toarray() # Convert to dense NumPy array + del scores # Remove from memory + cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory + + # Step 2: Maximize score and assign transcripts based on score threshold + belongs = cp.max(dense_scores, axis=1) # Max score per transcript + assignments['score'] = cp.asnumpy(belongs) # Move back to CPU + + mask = assignments['score'] >= score_cut # Mask for assigned transcripts + all_ids = np.concatenate(batch['bd'].id) # Boundary IDs as NumPy array + assignments['segger_cell_id'] = np.where(mask, all_ids[cp.argmax(dense_scores, axis=1).get()], None) + + # Clear memory after score processing + del dense_scores + cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory + torch.cuda.empty_cache() + + assignments['bound'] = np.where(mask, 1, 0) # Mark as 'bound' (1 if assigned, 0 if unassigned) + + + # Step 3: Handle unassigned transcripts with connected components (if use_cc=True) + if use_cc: + scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx", receptive_field, knn_method=knn_method) + + # Stay on GPU and use CuPy sparse matrices + no_id_scores = cupyx.scipy.sparse.coo_matrix((scores_tx.data, (scores_tx.row, scores_tx.col)), shape=scores_tx.shape) + + # Apply threshold on GPU + no_id_scores.data[no_id_scores.data <= score_cut] = 0 # Apply threshold + no_id_scores.eliminate_zeros() # Remove zero entries to keep the matrix sparse + + # Zero out the diagonal on GPU + no_id_scores = zero_out_diagonal_gpu(no_id_scores) + + # Find unassigned transcripts (those with no segger_cell_id) + no_id = cp.asarray(assignments['segger_cell_id'] == None) # Using CuPy to handle None values + + if cp.any(no_id): # Only compute if there are unassigned transcripts + # Apply score cut-off to unassigned transcripts + no_id_scores = subset_sparse_matrix(no_id_scores, no_id, no_id) + no_id_scores.data[no_id_scores.data <= score_cut] = 0 # Apply threshold + no_id_scores.eliminate_zeros() # Clean up zeros + + # Find the non-zero entries in the no_id_scores to construct edge_index + non_zero_rows, non_zero_cols, _ = find(no_id_scores) + unassigned_ids = transcript_id[no_id.get()] # Unassigned transcript IDs + + # Construct edge index (source, target) based on non-zero connections in no_id_scores + source_nodes = unassigned_ids[non_zero_rows.get()] + target_nodes = unassigned_ids[non_zero_cols.get()] + + # Save edge_index using CuDF and Dask-CuDF for GPU acceleration + edge_index_df = cudf.DataFrame({'source': source_nodes, 'target': target_nodes}) + edge_index_ddf = dask_cudf.from_cudf(edge_index_df, npartitions=1) + + # Use delayed for asynchronous disk writing of edge_index + delayed_write_edge_index = delayed(edge_index_ddf.to_parquet)(edge_index_save_path, append=True) + delayed_write_edge_index.persist() # 
Schedule writing + + assignments = { + 'transcript_id': assignments['transcript_id'].astype('str'), + 'score': assignments['score'].astype('float32'), + 'segger_cell_id': assignments['segger_cell_id'].astype('str'), # Ensure 'string' dtype + 'bound': assignments['bound'].astype('int8') # Ensure 'int64' dtype + } + # Step 4: Convert assignments to Dask-CuDF DataFrame for this batch + batch_ddf = dask_cudf.from_cudf(cudf.DataFrame(assignments), npartitions=1) + + # # Append batch to output_ddf, adding it as a new partition + # if output_ddf is None: + # output_ddf = batch_ddf # Initialize if empty + # else: + # output_ddf = dask_cudf.concat([output_ddf, batch_ddf], interleave_partitions=True) + + # Save the updated `output_ddf` asynchronously using Dask delayed + delayed_write_output_ddf = delayed(batch_ddf.to_parquet)( + output_ddf_save_path, append=True, ignore_divisions=True + ) + delayed_write_output_ddf.persist() # Schedule writing + + # Free memory after computation + cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory + torch.cuda.empty_cache() + + return output_ddf + + +def segment( + model: LitSegger, + dm: SeggerDataModule, + save_dir: Union[str, Path], + seg_tag: str, + transcript_file: Union[str, Path], + score_cut: float = 0.5, + use_cc: bool = True, + save_transcripts: bool = True, + save_anndata: bool = True, + save_cell_masks: bool = False, # Placeholder for future implementation + receptive_field: dict = {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}, + knn_method: str = 'cuda', + verbose: bool = False, + **anndata_kwargs +) -> None: + """ + Perform segmentation using the model, save transcripts, AnnData, and cell masks as needed, + and log the parameters used during segmentation. + + Args: + model (LitSegger): The trained segmentation model. + dm (SeggerDataModule): The SeggerDataModule instance for data loading. + save_dir (Union[str, Path]): Directory to save the final segmentation results. + seg_tag (str): Tag to include in the saved filename. + transcript_file (Union[str, Path]): Path to the transcripts Parquet file. + score_cut (float, optional): The threshold for assigning transcripts to cells based on + similarity scores. Defaults to 0.5. + use_cc (bool, optional): If True, perform connected components analysis for unassigned + transcripts. Defaults to True. + save_transcripts (bool, optional): Whether to save the transcripts as Parquet. Defaults to True. + save_anndata (bool, optional): Whether to save the results in AnnData format. Defaults to True. + save_cell_masks (bool, optional): Save cell masks as Dask Geopandas Parquet. Defaults to False. + receptive_field (dict, optional): Defines the receptive field for transcript-cell and + transcript-transcript relations. Defaults to + {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}. + knn_method (str, optional): The method to use for nearest neighbors ('cuda' or 'kd_tree'). + Defaults to 'cuda'. + verbose (bool, optional): Whether to print verbose status updates. Defaults to False. + **anndata_kwargs: Additional keyword arguments passed to the `create_anndata` function. + + Returns: + None. Saves the result to disk in various formats and logs the parameter choices. 
+ """ + + start_time = time() + + # Create a subdirectory with important parameter info (receptive field values) + sub_dir_name = f"{seg_tag}_{score_cut}_{use_cc}_{receptive_field['k_bd']}_{receptive_field['dist_bd']}_{receptive_field['k_tx']}_{receptive_field['dist_tx']}_{datetime.now().strftime('%Y%m%d')}" + save_dir = Path(save_dir) / sub_dir_name + save_dir.mkdir(parents=True, exist_ok=True) + + # Paths for saving the output_ddf and edge_index Parquet files + output_ddf_save_path = save_dir / "transcripts_df.parquet" + edge_index_save_path = save_dir / "edge_index.parquet" + + if output_ddf_save_path.exists(): + warnings.warn(f"Removing existing file: {output_ddf_save_path}") + shutil.rmtree(output_ddf_save_path) + + if use_cc: + if edge_index_save_path.exists(): + warnings.warn(f"Removing existing file: {edge_index_save_path}") + shutil.rmtree(edge_index_save_path) + + + if verbose: + print(f"Starting segmentation for {seg_tag}...") + + # Step 1: Load the data loaders from the SeggerDataModule + step_start_time = time() + train_dataloader = dm.train_dataloader() + val_dataloader = dm.val_dataloader() + test_dataloader = dm.test_dataloader() + + # Initialize Dask DataFrame for assignments + output_ddf = None + + # Loop through the data loaders (train, val, and test) + for loader_name, loader in zip(['Train', 'Validation', 'Test'], [train_dataloader, val_dataloader, test_dataloader]): + # for loader_name, loader in zip(['Test'], [test_dataloader]): + if verbose: + print(f"Processing {loader_name} data...") + + for batch in tqdm(loader, desc=f'Processing {loader_name} batches'): + # Call predict_batch for each batch + output_ddf = predict_batch( + model, + batch, + score_cut, + receptive_field, + use_cc=use_cc, + knn_method=knn_method, + output_ddf=output_ddf, + edge_index_save_path=edge_index_save_path, + output_ddf_save_path=output_ddf_save_path + ) + + + if verbose: + elapsed_time = time() - step_start_time + print(f"Batch processing completed in {elapsed_time:.2f} seconds.") + + # Load the full saved segmentation results + seg_final_dd = dd.read_parquet(output_ddf_save_path) + seg_final_dd = seg_final_dd.set_index('transcript_id', sorted=False) + + step_start_time = time() + if verbose: + print(f"Applying max score selection logic...") + + + # Step 1: Find max bound indices (bound == 1) and max unbound indices (bound == 0) + max_bound_idx = seg_final_dd[seg_final_dd['bound'] == 1].groupby('transcript_id')['score'].idxmax() + max_unbound_idx = seg_final_dd[seg_final_dd['bound'] == 0].groupby('transcript_id')['score'].idxmax() + + # Step 2: Combine indices, prioritizing bound=1 scores + final_idx = max_bound_idx.combine_first(max_unbound_idx).compute() + print(final_idx) + + # Step 3: Use the computed final_idx to select the best assignments + # Make sure you are using the divisions and set the index correctly before loc + # seg_final_dd = seg_final_dd.set_index('transcript_id', sorted=True) + seg_final_filtered = seg_final_dd.loc[final_idx].compute() + + if verbose: + elapsed_time = time() - step_start_time + print(f"Max score selection completed in {elapsed_time:.2f} seconds.") + + # Step 3: Load the transcripts DataFrame and merge results + + if verbose: + print(f"Loading transcripts from {transcript_file}...") + + transcripts_df = dd.read_parquet(transcript_file) + transcripts_df['transcript_id'] = transcripts_df['transcript_id'].astype(str) + + step_start_time = time() + if verbose: + print(f"Merging segmentation results with transcripts...") + + # Outer merge to include all 
transcripts, even those without assigned cell ids
+    transcripts_df_filtered = transcripts_df.merge(seg_final_filtered, on='transcript_id', how='outer')
+
+    if verbose:
+        elapsed_time = time() - step_start_time
+        print(f"Merging segmentation results with transcripts completed in {elapsed_time:.2f} seconds.")
+
+    # Step 4: Handle unassigned transcripts using connected components (if use_cc=True)
+    if use_cc:
+
+        step_start_time = time()
+        if verbose:
+            print(f"Computing connected components for unassigned transcripts...")
+        # Load edge indices from saved Parquet
+        edge_index_dd = dd.read_parquet(edge_index_save_path)
+
+        # Step 2: Get unique transcript_ids from edge_index_dd and their positional indices
+        transcript_ids_in_edges = dd.concat([edge_index_dd['source'], edge_index_dd['target']]).unique().compute()
+
+        # Create a lookup table with unique indices
+        lookup_table = pd.Series(data=range(len(transcript_ids_in_edges)), index=transcript_ids_in_edges).to_dict()
+
+        # Map source and target to positional indices
+        edge_index_dd['index_source'] = edge_index_dd['source'].map(lookup_table)
+        edge_index_dd['index_target'] = edge_index_dd['target'].map(lookup_table)
+        # Step 3: Compute connected components for transcripts involved in edges
+        source_indices = np.asarray(edge_index_dd['index_source'].compute())
+        target_indices = np.asarray(edge_index_dd['index_target'].compute())
+        data_cp = np.ones(len(source_indices), dtype=np.float32)
+
+        # Create the sparse COO matrix
+        coo_cp_matrix = scipy_coo_matrix((data_cp, (source_indices, target_indices)),
+                                        shape=(len(transcript_ids_in_edges), len(transcript_ids_in_edges)))
+
+        # Use SciPy's connected components algorithm to compute components
+        n, comps = cc(coo_cp_matrix, directed=True, connection='weak')
+
+        # Step 4: Map back the component labels to the original transcript_ids
+        comp_labels = pd.Series(comps, index=transcript_ids_in_edges)
+        # Step 5: Handle only unassigned transcripts in transcripts_df_filtered
+        unassigned_mask = transcripts_df_filtered['segger_cell_id'].isna()
+
+        unassigned_transcripts_df = transcripts_df_filtered.loc[unassigned_mask, ['transcript_id']]
+
+        # Step 6: Map component labels only to unassigned transcript_ids
+        new_segger_cell_ids = unassigned_transcripts_df['transcript_id'].map(comp_labels)
+
+        # Step 7: Create a DataFrame with updated 'segger_cell_id' for unassigned transcripts
+        unassigned_transcripts_df = unassigned_transcripts_df.assign(segger_cell_id=new_segger_cell_ids)
+
+        # Step 8: Merge this DataFrame back into the original to update only the unassigned segger_cell_id
+        # We perform a left join so that only the rows in unassigned_transcripts_df are updated
+        # transcripts_df_filtered = transcripts_df_filtered.drop(columns='segger_cell_id')
+
+        # Merging the updates back to the original DataFrame
+        transcripts_df_filtered = transcripts_df_filtered.merge(
+            unassigned_transcripts_df[['transcript_id', 'segger_cell_id']],
+            on='transcript_id',
+            how='left',  # Perform a left join to only update the unassigned rows
+            suffixes=('', '_new')  # Suffix for new column to avoid overwriting
+        )
+
+        # Step 9: Fill missing segger_cell_id values with the updated values from the merge
+        transcripts_df_filtered['segger_cell_id'] = transcripts_df_filtered['segger_cell_id'].fillna(
+            transcripts_df_filtered['segger_cell_id_new']
+        )
+
+        # Step 10: Clean up by dropping the temporary 'segger_cell_id_new' column
+        transcripts_df_filtered = transcripts_df_filtered.drop(columns=['segger_cell_id_new'])
+
+        # Fill the NaN values in segger_cell_id with the already existing (assigned) values
+        # transcripts_df_filtered['segger_cell_id'] = transcripts_df_filtered['segger_cell_id'].fillna(transcripts_df_filtered['segger_cell_id_target'])
+
+        # Drop any temporary columns used during the merge
+        # transcripts_df_filtered = transcripts_df_filtered.drop(columns=['segger_cell_id_target'])
+
+        if verbose:
+            elapsed_time = time() - step_start_time
+            print(f"Connected components computed in {elapsed_time:.2f} seconds.")
+
+    # Step 5: Save the merged results based on options
+
+    if save_transcripts:
+        if verbose:
+            step_start_time = time()
+            print(f"Saving transcripts.parquet...")
+        transcripts_save_path = save_dir / "segger_transcripts.parquet"
+        transcripts_df_filtered = transcripts_df_filtered.repartition(npartitions=100)
+        transcripts_df_filtered.to_parquet(
+            transcripts_save_path,
+            engine="pyarrow",  # PyArrow is faster and recommended
+            compression="snappy",  # Use snappy compression for speed
+            write_index=False,  # Skip writing index if not needed
+            append=False,  # Set to True if you're appending to an existing Parquet file
+            overwrite=True
+        )  # Dask handles Parquet well
+        if verbose:
+            elapsed_time = time() - step_start_time
+            print(f"Saved transcripts.parquet in {elapsed_time:.2f} seconds.")
+
+    if save_anndata:
+        if verbose:
+            step_start_time = time()
+            print(f"Saving anndata object...")
+        anndata_save_path = save_dir / "segger_adata.h5ad"
+        segger_adata = create_anndata(transcripts_df_filtered.compute(), **anndata_kwargs)  # Compute for AnnData
+        segger_adata.write(anndata_save_path)
+        if verbose:
+            elapsed_time = time() - step_start_time
+            print(f"Saved anndata object in {elapsed_time:.2f} seconds.")
+
+    if save_cell_masks:
+        if verbose:
+            step_start_time = time()
+            print(f"Computing and saving cell masks...")
+        # Placeholder for future cell masks implementation as Dask Geopandas Parquet
+        cell_masks_save_path = save_dir / "segger_cell_boundaries.parquet"
+        if verbose:
+            elapsed_time = time() - step_start_time
+            print(f"Saved cell masks in {elapsed_time:.2f} seconds.")
+
+    if verbose:
+        elapsed_time = time() - step_start_time
+        print(f"Results saved in {elapsed_time:.2f} seconds at {save_dir}.")
+
+    # Step 6: Save segmentation parameters as a JSON log
+    log_data = {
+        "seg_tag": seg_tag,
+        "score_cut": score_cut,
+        "use_cc": use_cc,
+        "receptive_field": receptive_field,
+        "knn_method": knn_method,
+        "save_transcripts": save_transcripts,
+        "save_anndata": save_anndata,
+        "save_cell_masks": save_cell_masks,
+        "timestamp": datetime.now().isoformat()
+    }
+
+    log_path = save_dir / "segmentation_log.json"
+    with open(log_path, 'w') as log_file:
+        json.dump(log_data, log_file, indent=4)
+
+    # Step 7: Garbage collection and memory cleanup
+    torch.cuda.empty_cache()
+    gc.collect()
+
+    # Total time taken for the segmentation process
+    if verbose:
+        total_time = time() - start_time
+        print(f"Total segmentation process completed in {total_time:.2f} seconds.")
diff --git a/src/segger/validation/utils.py b/src/segger/validation/utils.py
index b283b00..63470a3 100644
--- a/src/segger/validation/utils.py
+++ b/src/segger/validation/utils.py
@@ -586,6 +586,11 @@ def load_segmentations(segmentation_paths: Dict[str, Path]) -> Dict[str, sc.AnnD
     cells_n0 = [i for i in adata.obs_names if i.endswith('-nx')]
     segmentations_dict['segger_n1'] = adata[cells_n1, :]
     segmentations_dict['segger_n0'] = adata[cells_n0, :]
+    if method == 'Baysor':
+        # cells_n1 = [i for i in adata.obs_names if not i.endswith('-nx')]
+        # cells_n0 = [i for 
i in adata.obs_names if i.endswith('-nx')] + segmentations_dict['Baysor_n1'] = adata[adata.obs.has_nucleus, :] + segmentations_dict['Baysor_n0'] = adata[~adata.obs.has_nucleus, :] segmentations_dict[method] = adata return segmentations_dict @@ -927,7 +932,7 @@ def plot_quantized_mecr_counts(quantized_mecr_counts: Dict[str, pd.DataFrame], o output_path (Path): Path to the directory where the plot will be saved. palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. """ - quantized_mecr_counts.to_csv(output_path / 'quantized_mecr_counts.csv', index=True) + # quantized_mecr_counts.to_csv(output_path / 'quantized_mecr_counts.csv', index=True) plt.figure(figsize=(9, 6)) for method, df in quantized_mecr_counts.items(): plt.plot( @@ -966,7 +971,7 @@ def plot_quantized_mecr_area(quantized_mecr_area: Dict[str, pd.DataFrame], outpu output_path (Path): Path to the directory where the plot will be saved. palette (Dict[str, str]): Dictionary mapping segmentation method names to color codes. """ - quantized_mecr_area.to_csv(output_path / 'quantized_mecr_area.csv', index=True) + # quantized_mecr_area.to_csv(output_path / 'quantized_mecr_area.csv', index=True) plt.figure(figsize=(6, 4)) for method, df in quantized_mecr_area.items(): plt.plot( From 40d146910bb5dbbe20fdee6fdff66ff340c3cde6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 14 Oct 2024 23:24:58 +0000 Subject: [PATCH 097/156] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 24.8.0 → 24.10.0](https://github.com/psf/black/compare/24.8.0...24.10.0) - [github.com/asottile/blacken-docs: 1.18.0 → 1.19.0](https://github.com/asottile/blacken-docs/compare/1.18.0...1.19.0) --- .pre-commit-config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a1e1760..598ff15 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,14 +2,14 @@ fail_fast: false default_language_version: python: python3 default_stages: - - commit - - push + - pre-commit + - pre-push minimum_pre_commit_version: 2.16.0 ci: skip: [] repos: - repo: https://github.com/psf/black - rev: 24.8.0 + rev: 24.10.0 hooks: - id: black - repo: https://github.com/pre-commit/mirrors-prettier @@ -17,6 +17,6 @@ repos: hooks: - id: prettier - repo: https://github.com/asottile/blacken-docs - rev: 1.18.0 + rev: 1.19.0 hooks: - id: blacken-docs From 65a5ef7516c9965c999e21537086e4698acb51fe Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 14 Oct 2024 23:25:07 +0000 Subject: [PATCH 098/156] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/notebooks/benchmark_bc.py | 32 +-- scripts/predict_model_sample.py | 17 +- scripts/sandbox.py | 141 +++++++------ src/segger/prediction/predict.py | 99 +++++----- src/segger/prediction/predict_parquet.py | 241 +++++++++++------------ src/segger/validation/utils.py | 10 +- 6 files changed, 270 insertions(+), 270 deletions(-) diff --git a/docs/notebooks/benchmark_bc.py b/docs/notebooks/benchmark_bc.py index ee27776..6d91c57 100644 --- a/docs/notebooks/benchmark_bc.py +++ b/docs/notebooks/benchmark_bc.py @@ -15,15 +15,15 @@ # Define colors for segmentation methods method_colors = { - 'segger': '#D55E00', - 'segger_n0': '#E69F00', - 'segger_n1': 
'#F0E442', - 'segger_embedding': '#C72228', - 'Baysor': '#000075', - 'Baysor_n0': '#0F4A9C', - 'Baysor_n1': '#0072B2', - '10X': '#8B008B', - '10X-nucleus': '#CC79A7', + "segger": "#D55E00", + "segger_n0": "#E69F00", + "segger_n1": "#F0E442", + "segger_embedding": "#C72228", + "Baysor": "#000075", + "Baysor_n0": "#0F4A9C", + "Baysor_n1": "#0072B2", + "10X": "#8B008B", + "10X-nucleus": "#CC79A7", # 'BIDCell': '#009E73' } @@ -73,10 +73,18 @@ # transfer_column='celltype_major' # ) # segmentations_dict[method].write(segmentation_paths[method]) - + sc._settings.ScanpyConfig.figdir = figures_path -segmentations_dict['segger_embedding'].obsm['spatial'] = segmentations_dict['segger_embedding'].obs[['cell_centroid_x', 'cell_centroid_y']].values -sc.pl.spatial(segmentations_dict['segger_embedding'], spot_size=10, save= 'embedding.pdf', color='celltype_major', palette=major_colors) +segmentations_dict["segger_embedding"].obsm["spatial"] = ( + segmentations_dict["segger_embedding"].obs[["cell_centroid_x", "cell_centroid_y"]].values +) +sc.pl.spatial( + segmentations_dict["segger_embedding"], + spot_size=10, + save="embedding.pdf", + color="celltype_major", + palette=major_colors, +) # Find mutually exclusive genes based on scRNAseq data exclusive_gene_pairs = find_mutually_exclusive_genes( diff --git a/scripts/predict_model_sample.py b/scripts/predict_model_sample.py index cdcef98..872b536 100644 --- a/scripts/predict_model_sample.py +++ b/scripts/predict_model_sample.py @@ -16,7 +16,6 @@ import dask.dataframe as dd - segger_data_dir = Path("./data_tidy/pyg_datasets/bc_embedding_1001") models_dir = Path("./models/bc_embedding_1001_small") benchmarks_dir = Path("/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc") @@ -24,8 +23,8 @@ # Initialize the Lightning data module dm = SeggerDataModule( data_dir=segger_data_dir, - batch_size=1, - num_workers=0, + batch_size=1, + num_workers=0, ) dm.setup() @@ -43,17 +42,17 @@ model, dm, save_dir=benchmarks_dir, - seg_tag='parquet_test_big', + seg_tag="parquet_test_big", transcript_file=transcripts_file, # file_format='anndata', - receptive_field = receptive_field, + receptive_field=receptive_field, min_transcripts=5, score_cut=0.5, # max_transcripts=1500, cell_id_col="segger_cell_id", use_cc=True, - knn_method='cuda', - verbose=True + knn_method="cuda", + verbose=True, # client=client ) @@ -70,7 +69,7 @@ # lifetime_restart=True # Automatically restart workers # ) # client = Client(cluster) - + # segment( # model, # dm, @@ -89,4 +88,4 @@ # ) # client.close() -# cluster.close() \ No newline at end of file +# cluster.close() diff --git a/scripts/sandbox.py b/scripts/sandbox.py index 8d4a82f..05418a3 100644 --- a/scripts/sandbox.py +++ b/scripts/sandbox.py @@ -5,24 +5,24 @@ # Define method colors method_colors = { - 'segger': '#D55E00', + "segger": "#D55E00", # 'segger_n0': '#E69F00', # 'segger_n1': '#F0E442', # 'segger_embedding': '#C72228', - 'Baysor': '#000075', + "Baysor": "#000075", # 'Baysor_n0': '#0F4A9C', # 'Baysor_n1': '#0072B2', - '10X': '#8B008B', - '10X-nucleus': '#CC79A7', + "10X": "#8B008B", + "10X-nucleus": "#CC79A7", # 'BIDCell': '#009E73' } # Define the path to your figures and data # figures_path = Path("/path/to/your/figures") # Update with the actual path -cell_counts_path = figures_path / 'cell_counts_data.csv' -cell_area_log2_path = figures_path / 'cell_area_log2_data.csv' -mcer_box_path = figures_path / 'mcer_box.csv' -sensitivity_boxplot_data = figures_path / 'sensitivity_results.csv' +cell_counts_path = 
figures_path / "cell_counts_data.csv" +cell_area_log2_path = figures_path / "cell_area_log2_data.csv" +mcer_box_path = figures_path / "mcer_box.csv" +sensitivity_boxplot_data = figures_path / "sensitivity_results.csv" # Load the data cell_counts_data = pd.read_csv(cell_counts_path) @@ -30,12 +30,8 @@ sensitivity_boxplot_data = pd.read_csv(sensitivity_boxplot_data) # Cell counts barplot -cell_counts_data.rename(columns={'Unnamed: 0': 'Method'}, inplace=True) -cell_counts_data = cell_counts_data[~cell_counts_data['Method'].isin(['segger_n0', 'segger_n1'])] - - - - +cell_counts_data.rename(columns={"Unnamed: 0": "Method"}, inplace=True) +cell_counts_data = cell_counts_data[~cell_counts_data["Method"].isin(["segger_n0", "segger_n1"])] sensitivity_results.csv @@ -44,35 +40,36 @@ # Finally, the MCER plot mcer_methods_final = method_colors.keys() -mcer_data_filtered = mcer_box_data[mcer_box_data['Segmentation Method'].isin(mcer_methods_final)] +mcer_data_filtered = mcer_box_data[mcer_box_data["Segmentation Method"].isin(mcer_methods_final)] import seaborn as sns import matplotlib.pyplot as plt - sns.set_style("white") sns.set_context("paper", font_scale=1.2) mcer_methods_final = method_colors.keys() -cell_area_log2_data = cell_area_log2_data[cell_area_log2_data['Segmentation Method'].isin(mcer_methods_final)] +cell_area_log2_data = cell_area_log2_data[cell_area_log2_data["Segmentation Method"].isin(mcer_methods_final)] # Create the boxplot with the size 4x6 inches and show only the outliers plt.figure(figsize=(2, 4)) -sns.boxplot(data=cell_area_log2_data, - x='Segmentation Method', - y='Cell Area (log2)', - palette=method_colors, - showfliers=False, - showcaps=False, - flierprops={"marker": "x"}) # Hide the default outliers in the boxplot +sns.boxplot( + data=cell_area_log2_data, + x="Segmentation Method", + y="Cell Area (log2)", + palette=method_colors, + showfliers=False, + showcaps=False, + flierprops={"marker": "x"}, +) # Hide the default outliers in the boxplot # Add a stripplot to show only the outliers -# sns.stripplot(data=mcer_data_filtered, -# x='Segmentation Method', -# y='MECR', +# sns.stripplot(data=mcer_data_filtered, +# x='Segmentation Method', +# y='MECR', # jitter=False, # Avoid jittering to keep points in place # dodge=True, # Keep points aligned with the boxplot categories # marker="D", # Use diamond-shaped marker @@ -82,13 +79,13 @@ # size=6) # Set the size of the outliers # Rotate the x-axis labels -plt.xticks(rotation=45, ha='right') +plt.xticks(rotation=45, ha="right") # Setting solid black borders on all four sides ax = plt.gca() for spine in ax.spines.values(): spine.set_visible(True) - spine.set_color('black') # Set border color to black + spine.set_color("black") # Set border color to black # plt.ylim(0, 0.2) @@ -106,33 +103,37 @@ plt.tight_layout() # Save the updated plot as both PNG and PDF -cell_areas_boxplot_pdf_path = figures_path / 'cell_areas.pdf' -cell_areas_boxplot_png_path = figures_path / 'cell_areas.png' +cell_areas_boxplot_pdf_path = figures_path / "cell_areas.pdf" +cell_areas_boxplot_png_path = figures_path / "cell_areas.png" -plt.savefig(cell_areas_boxplot_pdf_path, format='pdf', bbox_inches='tight', dpi=300) -plt.savefig(cell_areas_boxplot_png_path, format='png', bbox_inches='tight', dpi=300) +plt.savefig(cell_areas_boxplot_pdf_path, format="pdf", bbox_inches="tight", dpi=300) +plt.savefig(cell_areas_boxplot_png_path, format="png", bbox_inches="tight", dpi=300) # Close the figure plt.close() mcer_methods_final = method_colors.keys() 
-sensitivity_boxplot_data = sensitivity_boxplot_data[sensitivity_boxplot_data['Segmentation Method'].isin(mcer_methods_final)] +sensitivity_boxplot_data = sensitivity_boxplot_data[ + sensitivity_boxplot_data["Segmentation Method"].isin(mcer_methods_final) +] plt.figure(figsize=(2.5, 4)) -sns.boxplot(data=sensitivity_boxplot_data, - x='Segmentation Method', - y='Sensitivity', - palette=method_colors, - showfliers=False, - showcaps=False, - flierprops={"marker": "x"}) # Hide the default outliers in the boxplot +sns.boxplot( + data=sensitivity_boxplot_data, + x="Segmentation Method", + y="Sensitivity", + palette=method_colors, + showfliers=False, + showcaps=False, + flierprops={"marker": "x"}, +) # Hide the default outliers in the boxplot # Add a stripplot to show only the outliers -# sns.stripplot(data=mcer_data_filtered, -# x='Segmentation Method', -# y='MECR', +# sns.stripplot(data=mcer_data_filtered, +# x='Segmentation Method', +# y='MECR', # jitter=False, # Avoid jittering to keep points in place # dodge=True, # Keep points aligned with the boxplot categories # marker="D", # Use diamond-shaped marker @@ -142,13 +143,13 @@ # size=6) # Set the size of the outliers # Rotate the x-axis labels -plt.xticks(rotation=45, ha='right') +plt.xticks(rotation=45, ha="right") # Setting solid black borders on all four sides ax = plt.gca() for spine in ax.spines.values(): spine.set_visible(True) - spine.set_color('black') # Set border color to black + spine.set_color("black") # Set border color to black # plt.ylim(0, 0.2) @@ -166,37 +167,35 @@ plt.tight_layout() # Save the updated plot as both PNG and PDF -sensitivity_boxplot_data_boxplot_pdf_path = figures_path / 'sensitivity_boxplot_data.pdf' -sensitivity_boxplot_data_boxplot_png_path = figures_path / 'sensitivity_boxplot_data.png' +sensitivity_boxplot_data_boxplot_pdf_path = figures_path / "sensitivity_boxplot_data.pdf" +sensitivity_boxplot_data_boxplot_png_path = figures_path / "sensitivity_boxplot_data.png" -plt.savefig(sensitivity_boxplot_data_boxplot_pdf_path, format='pdf', bbox_inches='tight', dpi=300) -plt.savefig(sensitivity_boxplot_data_boxplot_png_path, format='png', bbox_inches='tight', dpi=300) +plt.savefig(sensitivity_boxplot_data_boxplot_pdf_path, format="pdf", bbox_inches="tight", dpi=300) +plt.savefig(sensitivity_boxplot_data_boxplot_png_path, format="png", bbox_inches="tight", dpi=300) # Close the figure plt.close() - - - - sns.set_style("white") sns.set_context("paper") # Create the boxplot with the size 4x6 inches and show only the outliers plt.figure(figsize=(2.5, 4)) -sns.boxplot(data=mcer_data_filtered, - x='Segmentation Method', - y='MECR', - palette=method_colors, - showfliers=False, - showcaps=False, - flierprops={"marker": "x"}) # Hide the default outliers in the boxplot +sns.boxplot( + data=mcer_data_filtered, + x="Segmentation Method", + y="MECR", + palette=method_colors, + showfliers=False, + showcaps=False, + flierprops={"marker": "x"}, +) # Hide the default outliers in the boxplot # Add a stripplot to show only the outliers -# sns.stripplot(data=mcer_data_filtered, -# x='Segmentation Method', -# y='MECR', +# sns.stripplot(data=mcer_data_filtered, +# x='Segmentation Method', +# y='MECR', # jitter=False, # Avoid jittering to keep points in place # dodge=True, # Keep points aligned with the boxplot categories # marker="D", # Use diamond-shaped marker @@ -206,13 +205,13 @@ # size=6) # Set the size of the outliers # Rotate the x-axis labels -plt.xticks(rotation=45, ha='right') +plt.xticks(rotation=45, ha="right") # Setting 
solid black borders on all four sides ax = plt.gca() for spine in ax.spines.values(): spine.set_visible(True) - spine.set_color('black') # Set border color to black + spine.set_color("black") # Set border color to black plt.ylim(0, 0.2) @@ -230,11 +229,11 @@ plt.tight_layout() # Save the updated plot as both PNG and PDF -final_mcer_boxplot_pdf_path = figures_path / 'mcer_boxplot_with_outliers.pdf' -final_mcer_boxplot_png_path = figures_path / 'mcer_boxplot_with_outliers.png' +final_mcer_boxplot_pdf_path = figures_path / "mcer_boxplot_with_outliers.pdf" +final_mcer_boxplot_png_path = figures_path / "mcer_boxplot_with_outliers.png" -plt.savefig(final_mcer_boxplot_pdf_path, format='pdf', bbox_inches='tight', dpi=300) -plt.savefig(final_mcer_boxplot_png_path, format='png', bbox_inches='tight', dpi=300) +plt.savefig(final_mcer_boxplot_pdf_path, format="pdf", bbox_inches="tight", dpi=300) +plt.savefig(final_mcer_boxplot_png_path, format="png", bbox_inches="tight", dpi=300) # Close the figure -plt.close() \ No newline at end of file +plt.close() diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index 25dcaac..a2ab520 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -121,11 +121,11 @@ def get_similarity_scores( """ # Keep everything on GPU until final results - batch = batch.to('cuda') + batch = batch.to("cuda") # Step 1: Get embeddings from the model (on GPU) shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] - + # Compute edge indices using knn method (still on GPU) edge_index = get_edge_index( batch[to_type].pos[:, :2], # 'tx' positions @@ -134,7 +134,7 @@ def get_similarity_scores( dist=receptive_field[f"dist_{to_type}"], method=knn_method, ) - + # Convert to dense adjacency matrix (on GPU) edge_index = coo_to_dense_adj( edge_index.T, @@ -176,7 +176,7 @@ def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: # Call the sparse multiply function sparse_similarity = sparse_multiply(embeddings, edge_index, shape) - + return sparse_similarity @@ -190,8 +190,8 @@ def predict_batch( ) -> pd.DataFrame: """ Predict cell assignments for a batch of transcript data using a segmentation model. - Adds a 'bound' column to indicate if the transcript is assigned to a cell (bound=1) - or unassigned (bound=0). Unassigned transcripts are handled with connected components + Adds a 'bound' column to indicate if the transcript is assigned to a cell (bound=1) + or unassigned (bound=0). Unassigned transcripts are handled with connected components if use_cc is True. Adds a 'bound' column to indicate if the transcript is assigned to a cell (bound=1) or unassigned (bound=0). 
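# Editorial note (not part of the patch): a toy illustration of the scoring logic the
# docstring above describes -- max similarity per transcript, a score_cut threshold,
# and a 'bound' flag separating assigned from unassigned transcripts. The arrays and
# IDs below are made up; this is not the segger API.
import numpy as np
import pandas as pd

scores = np.array([[0.9, 0.1],    # transcript 0: best match cell_a, above the cut
                   [0.2, 0.3],    # transcript 1: best score below the cut
                   [0.1, 0.7]])   # transcript 2: best match cell_b, above the cut
cell_ids = np.array(["cell_a", "cell_b"])
score_cut = 0.5

best = scores.max(axis=1)                          # max similarity per transcript
mask = best >= score_cut                           # confidently assigned transcripts
assignments = pd.DataFrame({
    "transcript_id": ["tx0", "tx1", "tx2"],
    "score": best,
    "segger_cell_id": np.where(mask, cell_ids[scores.argmax(axis=1)], None),
    "bound": mask.astype(int),                     # 1 = assigned, 0 = handled later by connected components
})
print(assignments)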
@@ -221,8 +221,8 @@ def _get_id(): batch = batch.to("cuda") # Extract transcript IDs and initialize assignments DataFrame - transcript_id = batch['tx'].id.cpu() - assignments = pd.DataFrame({'transcript_id': transcript_id}) + transcript_id = batch["tx"].id.cpu() + assignments = pd.DataFrame({"transcript_id": transcript_id}) transcript_id = cp.asnumpy(batch["tx"].id) assignments = pd.DataFrame({"transcript_id": transcript_id}) @@ -244,7 +244,7 @@ def _get_id(): all_ids = np.concatenate(batch["bd"].id) # Keep IDs as NumPy array assignments["segger_cell_id"] = None # Initialize as None max_indices = cp.argmax(dense_scores, axis=1).get() - assignments.loc[mask, 'segger_cell_id'] = all_ids[max_indices[mask]] + assignments.loc[mask, "segger_cell_id"] = all_ids[max_indices[mask]] assignments.loc[mask, "segger_cell_id"] = all_ids[max_indices[mask]] # Assign IDs @@ -252,9 +252,8 @@ def _get_id(): cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory torch.cuda.empty_cache() - assignments['bound'] = 0 - assignments.loc[mask, 'bound'] = 1 - + assignments["bound"] = 0 + assignments.loc[mask, "bound"] = 1 # Handle unassigned transcripts with connected components # Move back to CPU @@ -263,13 +262,15 @@ def _get_id(): if use_cc: # Compute similarity scores between 'tx' and 'tx' - scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx", receptive_field, knn_method=knn_method) - - # Convert to dense NumPy array - data_cpu = scores_tx.data.get() # Transfer data to CPU (NumPy) - row_cpu = scores_tx.row.get() # Transfer row indices to CPU (NumPy) - col_cpu = scores_tx.col.get() # Transfer column indices to CPU (NumPy) - # Remove from memory + scores_tx = get_similarity_scores( + lit_segger.model, batch, "tx", "tx", receptive_field, knn_method=knn_method + ) + + # Convert to dense NumPy array + data_cpu = scores_tx.data.get() # Transfer data to CPU (NumPy) + row_cpu = scores_tx.row.get() # Transfer row indices to CPU (NumPy) + col_cpu = scores_tx.col.get() # Transfer column indices to CPU (NumPy) + # Remove from memory scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx", receptive_field) # Convert to dense NumPy array data_cpu = scores_tx.data.get() # Transfer data to CPU (NumPy) @@ -279,11 +280,11 @@ def _get_id(): # dense_scores_tx = scores_tx.toarray().astype(cp.float16) # Rebuild the matrix on CPU using SciPy dense_scores_tx = scipy_coo_matrix((data_cpu, (row_cpu, col_cpu)), shape=scores_tx.shape).toarray() - del scores_tx + del scores_tx np.fill_diagonal(dense_scores_tx, 0) # Ignore self-similarity # Get the indices of unassigned transcripts - no_id = assignments['segger_cell_id'].isna().values # Convert to numpy for indexing + no_id = assignments["segger_cell_id"].isna().values # Convert to numpy for indexing del scores_tx # Remove from memory cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory @@ -303,7 +304,7 @@ def _get_id(): non_zero_rows, non_zero_cols = no_id_scores.nonzero() # Map these indices back to the actual transcript IDs (no_id_mask gives us their original position) - unassigned_ids = batch['tx'].id[no_id] # Unassigned transcript IDs + unassigned_ids = batch["tx"].id[no_id] # Unassigned transcript IDs # Construct edge index (source, target) based on non-zero connections in the no_id_scores matrix source_nodes = unassigned_ids[non_zero_rows].cpu() @@ -327,11 +328,6 @@ def _get_id(): return assignments - - - - - def predict( lit_segger: LitSegger, @@ -357,11 +353,13 @@ def predict( Returns: pd.DataFrame: Final DataFrame containing the 
transcript IDs, similarity scores, and assigned cell IDs. """ - + all_assignments = [] # Use Dask delayed to parallelize the predict_batch execution and create assignments - delayed_batches = [delayed(predict_batch)(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) - for batch in data_loader] + delayed_batches = [ + delayed(predict_batch)(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) + for batch in data_loader + ] for batch in data_loader: assignments = predict_batch(lit_segger, batch, score_cut, receptive_field, use_cc, knn_method) @@ -404,9 +402,6 @@ def predict( return result - - - def segment( model: LitSegger, dm: SeggerDataModule, @@ -446,8 +441,8 @@ def segment( The method to use for nearest neighbors ('cuda' by default). **anndata_kwargs : dict, optional Additional keyword arguments passed to the create_anndata function. - Perform segmentation using the model, merge segmentation results with transcripts data, - and save the results in the specified format. The function also handles unassigned + Perform segmentation using the model, merge segmentation results with transcripts data, + and save the results in the specified format. The function also handles unassigned transcripts through connected components analysis if `use_cc` is True. Args: @@ -456,16 +451,16 @@ def segment( save_dir (Union[str, Path]): Directory to save the final segmentation results. seg_tag (str): Tag to include in the saved filename. transcript_file (Union[str, Path]): Path to the transcripts Parquet file. - score_cut (float, optional): The threshold for assigning transcripts to cells based on + score_cut (float, optional): The threshold for assigning transcripts to cells based on similarity scores. Defaults to 0.5. - use_cc (bool, optional): If True, perform connected components analysis for unassigned + use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. Defaults to True. - file_format (str, optional): The file format to save the results ('csv', 'parquet', or 'anndata'). + file_format (str, optional): The file format to save the results ('csv', 'parquet', or 'anndata'). Defaults to 'anndata'. - receptive_field (dict, optional): Defines the receptive field for transcript-cell and - transcript-transcript relations. Defaults to + receptive_field (dict, optional): Defines the receptive field for transcript-cell and + transcript-transcript relations. Defaults to {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}. - knn_method (str, optional): The method to use for nearest neighbors ('cuda' or 'kd_tree'). + knn_method (str, optional): The method to use for nearest neighbors ('cuda' or 'kd_tree'). Defaults to 'kd_tree'. verbose (bool, optional): Whether to print verbose status updates. Defaults to False. **anndata_kwargs: Additional keyword arguments passed to the `create_anndata` function. 
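# Editorial note (not part of the patch): a minimal SciPy sketch of the connected
# components step described above, which groups unassigned transcripts linked by
# above-threshold tx-tx similarity into new cells. The toy edge list and the
# placeholder IDs are assumptions; the real code draws random Xenium-style IDs
# ending in '-nx'.
import numpy as np
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components

# Edges between unassigned transcripts whose similarity passed score_cut
source = np.array([0, 1, 3])
target = np.array([1, 2, 4])
n_tx = 5  # number of unassigned transcripts

adj = coo_matrix((np.ones(len(source)), (source, target)), shape=(n_tx, n_tx))
n_comps, labels = connected_components(adj, directed=False, connection="weak")

new_ids = np.array([f"cc{i}-nx" for i in range(n_comps)])
print(new_ids[labels])  # transcripts 0-2 share one new cell, 3-4 another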
@@ -473,7 +468,7 @@ def segment( Returns: None """ - + # Initialize global_edge_index_list to store edge indices across batches global global_edge_index_list global_edge_index_list = [] @@ -557,24 +552,30 @@ def segment( edge_index_computed = concatenated_edge_index.compute() # Get the full edge_index (source, target) # Map transcript_ids to their index positions in the DataFrame - transcript_idx_map = pd.Series(transcripts_df_filtered.index, index=transcripts_df_filtered['transcript_id']).to_dict() + transcript_idx_map = pd.Series( + transcripts_df_filtered.index, index=transcripts_df_filtered["transcript_id"] + ).to_dict() # Convert the transcript_ids in edge_index_computed to positional indices source_indices = [transcript_idx_map[tid] for tid in edge_index_computed[0]] target_indices = [transcript_idx_map[tid] for tid in edge_index_computed[1]] # Use SciPy's connected components algorithm - n, comps = cc(scipy_coo_matrix((np.ones(len(source_indices)), - (source_indices, target_indices)), - shape=(transcripts_df_filtered.shape[0], transcripts_df_filtered.shape[0])), - connection="weak", directed=False) + n, comps = cc( + scipy_coo_matrix( + (np.ones(len(source_indices)), (source_indices, target_indices)), + shape=(transcripts_df_filtered.shape[0], transcripts_df_filtered.shape[0]), + ), + connection="weak", + directed=False, + ) # Generate new cell IDs based on connected components new_ids = np.array([_get_id() for _ in range(n)]) # Assign new cell IDs to the unassigned transcripts in the final assignments - unassigned_mask = transcripts_df_filtered['segger_cell_id'].isna() - transcripts_df_filtered.loc[unassigned_mask, 'segger_cell_id'] = new_ids[comps] + unassigned_mask = transcripts_df_filtered["segger_cell_id"].isna() + transcripts_df_filtered.loc[unassigned_mask, "segger_cell_id"] = new_ids[comps] if verbose: elapsed_time = format_time(time.time() - step_start_time) diff --git a/src/segger/prediction/predict_parquet.py b/src/segger/prediction/predict_parquet.py index 8d607ad..ea3cff1 100644 --- a/src/segger/prediction/predict_parquet.py +++ b/src/segger/prediction/predict_parquet.py @@ -50,13 +50,13 @@ import dask_cudf import cupy as cp import cupyx -import warnings +import warnings import shutil -from time import time +from time import time from cupyx.scipy.sparse import coo_matrix as cp_coo_matrix from cupyx.scipy.sparse.csgraph import connected_components as cp_cc -# Setup Dask cluster with 3 workers +# Setup Dask cluster with 3 workers # CONFIG @@ -78,14 +78,16 @@ def zero_out_diagonal_gpu(sparse_matrix): """ # Filter out the diagonal (where row == col) non_diagonal_mask = sparse_matrix.row != sparse_matrix.col - + # Create a new sparse matrix without diagonal elements sparse_matrix_no_diag = cupyx.scipy.sparse.coo_matrix( - (sparse_matrix.data[non_diagonal_mask], - (sparse_matrix.row[non_diagonal_mask], sparse_matrix.col[non_diagonal_mask])), - shape=sparse_matrix.shape + ( + sparse_matrix.data[non_diagonal_mask], + (sparse_matrix.row[non_diagonal_mask], sparse_matrix.col[non_diagonal_mask]), + ), + shape=sparse_matrix.shape, ) - + return sparse_matrix_no_diag @@ -120,8 +122,6 @@ def subset_sparse_matrix(sparse_matrix, row_idx, col_idx): return coo_matrix((new_data, (row_map, col_map)), shape=(len(row_idx), len(col_idx))) - - def load_model(checkpoint_path: str) -> LitSegger: """ Load a LitSegger model from a checkpoint. 
@@ -129,7 +129,7 @@ def load_model(checkpoint_path: str) -> LitSegger: Parameters ---------- checkpoint_path : str - Specific checkpoint file to load, or directory where the model checkpoints are stored. + Specific checkpoint file to load, or directory where the model checkpoints are stored. If directory, the latest checkpoint is loaded. Returns @@ -147,13 +147,15 @@ def load_model(checkpoint_path: str) -> LitSegger: # Get last checkpoint if directory is provided if os.path.isdir(checkpoint_path): - checkpoints = glob.glob(str(checkpoint_path / '*.ckpt')) + checkpoints = glob.glob(str(checkpoint_path / "*.ckpt")) if len(checkpoints) == 0: raise FileNotFoundError(msg) + # Sort checkpoints by epoch and step def sort_order(c): - match = re.match(r'.*epoch=(\d+)-step=(\d+).ckpt', c) + match = re.match(r".*epoch=(\d+)-step=(\d+).ckpt", c) return int(match[1]), int(match[2]) + checkpoint_path = Path(sorted(checkpoints, key=sort_order)[-1]) elif not checkpoint_path.exists(): raise FileExistsError(msg) @@ -166,17 +168,11 @@ def sort_order(c): return lit_segger - def get_similarity_scores( - model: torch.nn.Module, - batch: Batch, - from_type: str, - to_type: str, - receptive_field: dict, - knn_method: str = 'cuda' + model: torch.nn.Module, batch: Batch, from_type: str, to_type: str, receptive_field: dict, knn_method: str = "cuda" ) -> coo_matrix: """ - Compute similarity scores between embeddings for 'from_type' and 'to_type' nodes + Compute similarity scores between embeddings for 'from_type' and 'to_type' nodes using sparse matrix multiplication with CuPy and the 'sees' edge relation. Args: @@ -187,52 +183,48 @@ def get_similarity_scores( knn_method (str, optional): The method to use for nearest neighbors. Defaults to 'cuda'. Returns: - coo_matrix: A sparse matrix containing the similarity scores between + coo_matrix: A sparse matrix containing the similarity scores between 'from_type' and 'to_type' nodes. 
""" # Keep everything on GPU until final results - batch = batch.to('cuda') + batch = batch.to("cuda") # Step 1: Get embeddings from the model (on GPU) shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] - + # Compute edge indices using knn method (still on GPU) edge_index = get_edge_index( batch[to_type].pos[:, :2], # 'tx' positions batch[from_type].pos[:, :2], # 'bd' positions - k=receptive_field[f'k_{to_type}'], - dist=receptive_field[f'dist_{to_type}'], - method=knn_method + k=receptive_field[f"k_{to_type}"], + dist=receptive_field[f"dist_{to_type}"], + method=knn_method, ) - + # Convert to dense adjacency matrix (on GPU) - edge_index = coo_to_dense_adj( - edge_index.T, - num_nodes=shape[0], - num_nbrs=receptive_field[f'k_{to_type}'] - ) - + edge_index = coo_to_dense_adj(edge_index.T, num_nodes=shape[0], num_nbrs=receptive_field[f"k_{to_type}"]) + with torch.no_grad(): embeddings = model(batch.x_dict, batch.edge_index_dict) - + def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: m = torch.nn.ZeroPad2d((0, 0, 0, 1)) # pad bottom with zeros similarity = torch.bmm( - m(embeddings[to_type])[edge_index], # 'to' x 'from' neighbors x embed - embeddings[from_type].unsqueeze(-1) # 'to' x embed x 1 - ) # -> 'to' x 'from' neighbors x 1 + m(embeddings[to_type])[edge_index], # 'to' x 'from' neighbors x embed + embeddings[from_type].unsqueeze(-1), # 'to' x embed x 1 + ) # -> 'to' x 'from' neighbors x 1 del embeddings # Sigmoid to get most similar 'to_type' neighbor similarity[similarity == 0] = -torch.inf # ensure zero stays zero similarity = F.sigmoid(similarity) # Neighbor-filtered similarity scores # shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] - indices = torch.argwhere(edge_index != -1).T + indices = torch.argwhere(edge_index != -1).T indices[1] = edge_index[edge_index != -1] - rows = cp.fromDlpack(to_dlpack(indices[0,:].to('cuda'))) - columns = cp.fromDlpack(to_dlpack(indices[1,:].to('cuda'))) + rows = cp.fromDlpack(to_dlpack(indices[0, :].to("cuda"))) + columns = cp.fromDlpack(to_dlpack(indices[1, :].to("cuda"))) # print(rows) del indices values = similarity[edge_index != -1].flatten() @@ -242,11 +234,8 @@ def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: # Call the sparse multiply function sparse_similarity = sparse_multiply(embeddings, edge_index, shape) - - return sparse_similarity - - + return sparse_similarity def predict_batch( @@ -255,10 +244,10 @@ def predict_batch( score_cut: float, receptive_field: Dict[str, float], use_cc: bool = True, - knn_method: str = 'cuda', + knn_method: str = "cuda", output_ddf: dask_cudf.DataFrame = None, edge_index_save_path: Union[str, Path] = None, - output_ddf_save_path: Union[str, Path] = None + output_ddf_save_path: Union[str, Path] = None, ) -> dask_cudf.DataFrame: """ Predict cell assignments for a batch of transcript data using a segmentation model. @@ -268,37 +257,37 @@ def predict_batch( lit_segger (torch.nn.Module): The lightning module wrapping the segmentation model. batch (Batch): A batch of transcript and cell data. score_cut (float): The threshold for assigning transcripts to cells based on similarity scores. - receptive_field (Dict[str, float]): Dictionary defining the receptive field for transcript-cell + receptive_field (Dict[str, float]): Dictionary defining the receptive field for transcript-cell and transcript-transcript relations. - use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. 
+ use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. Defaults to True. knn_method (str, optional): The method to use for nearest neighbors. Defaults to 'cuda'. output_ddf (dask_cudf.DataFrame, optional): Dask-CuDF DataFrame to accumulate and store transcript assignments. edge_index_save_path (str, optional): Path to the Parquet file where edge indices are saved incrementally. output_ddf_save_path (str, optional): Path to the Parquet file where transcript assignments (`output_ddf`) are saved incrementally. - + Returns: dask_cudf.DataFrame: Updated Dask-CuDF DataFrame for assignments. """ - + def _get_id(): """Generate a random Xenium-style ID.""" - return ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), 8)) + '-nx' + return "".join(np.random.choice(list("abcdefghijklmnopqrstuvwxyz"), 8)) + "-nx" with cp.cuda.Device(0): # Move batch to GPU batch = batch.to("cuda") # Extract transcript IDs and initialize a dictionary for assignments - transcript_id = batch['tx'].id.cpu().numpy().astype('str') - assignments = {'transcript_id': transcript_id} + transcript_id = batch["tx"].id.cpu().numpy().astype("str") + assignments = {"transcript_id": transcript_id} - if len(batch['bd'].pos) >= 10: + if len(batch["bd"].pos) >= 10: # Step 1: Compute similarity scores between 'tx' (transcripts) and 'bd' (boundaries) scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd", receptive_field, knn_method=knn_method) torch.cuda.empty_cache() - + # Convert sparse matrix to dense format (on GPU) dense_scores = scores.toarray() # Convert to dense NumPy array del scores # Remove from memory @@ -306,27 +295,30 @@ def _get_id(): # Step 2: Maximize score and assign transcripts based on score threshold belongs = cp.max(dense_scores, axis=1) # Max score per transcript - assignments['score'] = cp.asnumpy(belongs) # Move back to CPU + assignments["score"] = cp.asnumpy(belongs) # Move back to CPU - mask = assignments['score'] >= score_cut # Mask for assigned transcripts - all_ids = np.concatenate(batch['bd'].id) # Boundary IDs as NumPy array - assignments['segger_cell_id'] = np.where(mask, all_ids[cp.argmax(dense_scores, axis=1).get()], None) + mask = assignments["score"] >= score_cut # Mask for assigned transcripts + all_ids = np.concatenate(batch["bd"].id) # Boundary IDs as NumPy array + assignments["segger_cell_id"] = np.where(mask, all_ids[cp.argmax(dense_scores, axis=1).get()], None) # Clear memory after score processing del dense_scores cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory torch.cuda.empty_cache() - assignments['bound'] = np.where(mask, 1, 0) # Mark as 'bound' (1 if assigned, 0 if unassigned) - + assignments["bound"] = np.where(mask, 1, 0) # Mark as 'bound' (1 if assigned, 0 if unassigned) # Step 3: Handle unassigned transcripts with connected components (if use_cc=True) if use_cc: - scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx", receptive_field, knn_method=knn_method) + scores_tx = get_similarity_scores( + lit_segger.model, batch, "tx", "tx", receptive_field, knn_method=knn_method + ) # Stay on GPU and use CuPy sparse matrices - no_id_scores = cupyx.scipy.sparse.coo_matrix((scores_tx.data, (scores_tx.row, scores_tx.col)), shape=scores_tx.shape) - + no_id_scores = cupyx.scipy.sparse.coo_matrix( + (scores_tx.data, (scores_tx.row, scores_tx.col)), shape=scores_tx.shape + ) + # Apply threshold on GPU no_id_scores.data[no_id_scores.data <= score_cut] = 0 # Apply threshold no_id_scores.eliminate_zeros() # 
Remove zero entries to keep the matrix sparse @@ -335,11 +327,11 @@ def _get_id(): no_id_scores = zero_out_diagonal_gpu(no_id_scores) # Find unassigned transcripts (those with no segger_cell_id) - no_id = cp.asarray(assignments['segger_cell_id'] == None) # Using CuPy to handle None values - + no_id = cp.asarray(assignments["segger_cell_id"] == None) # Using CuPy to handle None values + if cp.any(no_id): # Only compute if there are unassigned transcripts # Apply score cut-off to unassigned transcripts - no_id_scores = subset_sparse_matrix(no_id_scores, no_id, no_id) + no_id_scores = subset_sparse_matrix(no_id_scores, no_id, no_id) no_id_scores.data[no_id_scores.data <= score_cut] = 0 # Apply threshold no_id_scores.eliminate_zeros() # Clean up zeros @@ -352,18 +344,18 @@ def _get_id(): target_nodes = unassigned_ids[non_zero_cols.get()] # Save edge_index using CuDF and Dask-CuDF for GPU acceleration - edge_index_df = cudf.DataFrame({'source': source_nodes, 'target': target_nodes}) + edge_index_df = cudf.DataFrame({"source": source_nodes, "target": target_nodes}) edge_index_ddf = dask_cudf.from_cudf(edge_index_df, npartitions=1) - + # Use delayed for asynchronous disk writing of edge_index delayed_write_edge_index = delayed(edge_index_ddf.to_parquet)(edge_index_save_path, append=True) delayed_write_edge_index.persist() # Schedule writing assignments = { - 'transcript_id': assignments['transcript_id'].astype('str'), - 'score': assignments['score'].astype('float32'), - 'segger_cell_id': assignments['segger_cell_id'].astype('str'), # Ensure 'string' dtype - 'bound': assignments['bound'].astype('int8') # Ensure 'int64' dtype + "transcript_id": assignments["transcript_id"].astype("str"), + "score": assignments["score"].astype("float32"), + "segger_cell_id": assignments["segger_cell_id"].astype("str"), # Ensure 'string' dtype + "bound": assignments["bound"].astype("int8"), # Ensure 'int64' dtype } # Step 4: Convert assignments to Dask-CuDF DataFrame for this batch batch_ddf = dask_cudf.from_cudf(cudf.DataFrame(assignments), npartitions=1) @@ -398,13 +390,13 @@ def segment( save_transcripts: bool = True, save_anndata: bool = True, save_cell_masks: bool = False, # Placeholder for future implementation - receptive_field: dict = {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}, - knn_method: str = 'cuda', + receptive_field: dict = {"k_bd": 4, "dist_bd": 10, "k_tx": 5, "dist_tx": 3}, + knn_method: str = "cuda", verbose: bool = False, - **anndata_kwargs + **anndata_kwargs, ) -> None: """ - Perform segmentation using the model, save transcripts, AnnData, and cell masks as needed, + Perform segmentation using the model, save transcripts, AnnData, and cell masks as needed, and log the parameters used during segmentation. Args: @@ -413,25 +405,25 @@ def segment( save_dir (Union[str, Path]): Directory to save the final segmentation results. seg_tag (str): Tag to include in the saved filename. transcript_file (Union[str, Path]): Path to the transcripts Parquet file. - score_cut (float, optional): The threshold for assigning transcripts to cells based on + score_cut (float, optional): The threshold for assigning transcripts to cells based on similarity scores. Defaults to 0.5. - use_cc (bool, optional): If True, perform connected components analysis for unassigned + use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. Defaults to True. save_transcripts (bool, optional): Whether to save the transcripts as Parquet. Defaults to True. 
save_anndata (bool, optional): Whether to save the results in AnnData format. Defaults to True. save_cell_masks (bool, optional): Save cell masks as Dask Geopandas Parquet. Defaults to False. - receptive_field (dict, optional): Defines the receptive field for transcript-cell and - transcript-transcript relations. Defaults to + receptive_field (dict, optional): Defines the receptive field for transcript-cell and + transcript-transcript relations. Defaults to {'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}. - knn_method (str, optional): The method to use for nearest neighbors ('cuda' or 'kd_tree'). + knn_method (str, optional): The method to use for nearest neighbors ('cuda' or 'kd_tree'). Defaults to 'cuda'. verbose (bool, optional): Whether to print verbose status updates. Defaults to False. **anndata_kwargs: Additional keyword arguments passed to the `create_anndata` function. - + Returns: None. Saves the result to disk in various formats and logs the parameter choices. """ - + start_time = time() # Create a subdirectory with important parameter info (receptive field values) @@ -442,17 +434,16 @@ def segment( # Paths for saving the output_ddf and edge_index Parquet files output_ddf_save_path = save_dir / "transcripts_df.parquet" edge_index_save_path = save_dir / "edge_index.parquet" - + if output_ddf_save_path.exists(): warnings.warn(f"Removing existing file: {output_ddf_save_path}") shutil.rmtree(output_ddf_save_path) - + if use_cc: if edge_index_save_path.exists(): warnings.warn(f"Removing existing file: {edge_index_save_path}") shutil.rmtree(edge_index_save_path) - if verbose: print(f"Starting segmentation for {seg_tag}...") @@ -466,12 +457,14 @@ def segment( output_ddf = None # Loop through the data loaders (train, val, and test) - for loader_name, loader in zip(['Train', 'Validation', 'Test'], [train_dataloader, val_dataloader, test_dataloader]): - # for loader_name, loader in zip(['Test'], [test_dataloader]): + for loader_name, loader in zip( + ["Train", "Validation", "Test"], [train_dataloader, val_dataloader, test_dataloader] + ): + # for loader_name, loader in zip(['Test'], [test_dataloader]): if verbose: print(f"Processing {loader_name} data...") - for batch in tqdm(loader, desc=f'Processing {loader_name} batches'): + for batch in tqdm(loader, desc=f"Processing {loader_name} batches"): # Call predict_batch for each batch output_ddf = predict_batch( model, @@ -482,26 +475,24 @@ def segment( knn_method=knn_method, output_ddf=output_ddf, edge_index_save_path=edge_index_save_path, - output_ddf_save_path=output_ddf_save_path + output_ddf_save_path=output_ddf_save_path, ) - if verbose: elapsed_time = time() - step_start_time print(f"Batch processing completed in {elapsed_time:.2f} seconds.") # Load the full saved segmentation results seg_final_dd = dd.read_parquet(output_ddf_save_path) - seg_final_dd = seg_final_dd.set_index('transcript_id', sorted=False) + seg_final_dd = seg_final_dd.set_index("transcript_id", sorted=False) step_start_time = time() if verbose: print(f"Applying max score selection logic...") - # Step 1: Find max bound indices (bound == 1) and max unbound indices (bound == 0) - max_bound_idx = seg_final_dd[seg_final_dd['bound'] == 1].groupby('transcript_id')['score'].idxmax() - max_unbound_idx = seg_final_dd[seg_final_dd['bound'] == 0].groupby('transcript_id')['score'].idxmax() + max_bound_idx = seg_final_dd[seg_final_dd["bound"] == 1].groupby("transcript_id")["score"].idxmax() + max_unbound_idx = seg_final_dd[seg_final_dd["bound"] == 
0].groupby("transcript_id")["score"].idxmax() # Step 2: Combine indices, prioritizing bound=1 scores final_idx = max_bound_idx.combine_first(max_unbound_idx).compute() @@ -517,24 +508,24 @@ def segment( print(f"Max score selection completed in {elapsed_time:.2f} seconds.") # Step 3: Load the transcripts DataFrame and merge results - + if verbose: print(f"Loading transcripts from {transcript_file}...") transcripts_df = dd.read_parquet(transcript_file) - transcripts_df['transcript_id'] = transcripts_df['transcript_id'].astype(str) + transcripts_df["transcript_id"] = transcripts_df["transcript_id"].astype(str) step_start_time = time() if verbose: print(f"Merging segmentation results with transcripts...") # Outer merge to include all transcripts, even those without assigned cell ids - transcripts_df_filtered = transcripts_df.merge(seg_final_filtered, on='transcript_id', how='outer') + transcripts_df_filtered = transcripts_df.merge(seg_final_filtered, on="transcript_id", how="outer") if verbose: elapsed_time = time() - step_start_time print(f"Merging segmentation results with transcripts completed in {elapsed_time:.2f} seconds.") - + # Step 4: Handle unassigned transcripts using connected components (if use_cc=True) if use_cc: @@ -545,35 +536,37 @@ def segment( edge_index_dd = dd.read_parquet(edge_index_save_path) # Step 2: Get unique transcript_ids from edge_index_dd and their positional indices - transcript_ids_in_edges = dd.concat([edge_index_dd['source'], edge_index_dd['target']]).unique().compute() + transcript_ids_in_edges = dd.concat([edge_index_dd["source"], edge_index_dd["target"]]).unique().compute() # Create a lookup table with unique indices lookup_table = pd.Series(data=range(len(transcript_ids_in_edges)), index=transcript_ids_in_edges).to_dict() # Map source and target to positional indices - edge_index_dd['index_source'] = edge_index_dd['source'].map(lookup_table) - edge_index_dd['index_target'] = edge_index_dd['target'].map(lookup_table) + edge_index_dd["index_source"] = edge_index_dd["source"].map(lookup_table) + edge_index_dd["index_target"] = edge_index_dd["target"].map(lookup_table) # Step 3: Compute connected components for transcripts involved in edges - source_indices = np.asarray(edge_index_dd['index_source'].compute()) - target_indices = np.asarray(edge_index_dd['index_target'].compute()) + source_indices = np.asarray(edge_index_dd["index_source"].compute()) + target_indices = np.asarray(edge_index_dd["index_target"].compute()) data_cp = np.ones(len(source_indices), dtype=cp.float32) # Create the sparse COO matrix - coo_cp_matrix = scipy_coo_matrix((data_cp, (source_indices, target_indices)), - shape=(len(transcript_ids_in_edges), len(transcript_ids_in_edges))) + coo_cp_matrix = scipy_coo_matrix( + (data_cp, (source_indices, target_indices)), + shape=(len(transcript_ids_in_edges), len(transcript_ids_in_edges)), + ) # Use CuPy's connected components algorithm to compute components - n, comps = cc(coo_cp_matrix, directed=True, connection='weak') + n, comps = cc(coo_cp_matrix, directed=True, connection="weak") # Step 4: Map back the component labels to the original transcript_ids comp_labels = pd.Series(comps, index=transcript_ids_in_edges) # Step 5: Handle only unassigned transcripts in transcripts_df_filtered - unassigned_mask = transcripts_df_filtered['segger_cell_id'].isna() + unassigned_mask = transcripts_df_filtered["segger_cell_id"].isna() - unassigned_transcripts_df = transcripts_df_filtered.loc[unassigned_mask, ['transcript_id']] + unassigned_transcripts_df = 
transcripts_df_filtered.loc[unassigned_mask, ["transcript_id"]] # Step 6: Map component labels only to unassigned transcript_ids - new_segger_cell_ids = unassigned_transcripts_df['transcript_id'].map(comp_labels) + new_segger_cell_ids = unassigned_transcripts_df["transcript_id"].map(comp_labels) # Step 7: Create a DataFrame with updated 'segger_cell_id' for unassigned transcripts unassigned_transcripts_df = unassigned_transcripts_df.assign(segger_cell_id=new_segger_cell_ids) @@ -584,26 +577,26 @@ def segment( # Merging the updates back to the original DataFrame transcripts_df_filtered = transcripts_df_filtered.merge( - unassigned_transcripts_df[['transcript_id', 'segger_cell_id']], - on='transcript_id', - how='left', # Perform a left join to only update the unassigned rows - suffixes=('', '_new') # Suffix for new column to avoid overwriting + unassigned_transcripts_df[["transcript_id", "segger_cell_id"]], + on="transcript_id", + how="left", # Perform a left join to only update the unassigned rows + suffixes=("", "_new"), # Suffix for new column to avoid overwriting ) # Step 9: Fill missing segger_cell_id values with the updated values from the merge - transcripts_df_filtered['segger_cell_id'] = transcripts_df_filtered['segger_cell_id'].fillna( - transcripts_df_filtered['segger_cell_id_new'] + transcripts_df_filtered["segger_cell_id"] = transcripts_df_filtered["segger_cell_id"].fillna( + transcripts_df_filtered["segger_cell_id_new"] ) # Step 10: Clean up by dropping the temporary 'segger_cell_id_new' column - transcripts_df_filtered = transcripts_df_filtered.drop(columns=['segger_cell_id_new']) + transcripts_df_filtered = transcripts_df_filtered.drop(columns=["segger_cell_id_new"]) # Fill the NaN values in segger_cell_id with the already existing (assigned) values # transcripts_df_filtered['segger_cell_id'] = transcripts_df_filtered['segger_cell_id'].fillna(transcripts_df_filtered['segger_cell_id_target']) # Drop any temporary columns used during the merge # transcripts_df_filtered = transcripts_df_filtered.drop(columns=['segger_cell_id_target']) - + if verbose: elapsed_time = time() - step_start_time print(f"Connected components computed in {elapsed_time:.2f} seconds.") @@ -622,12 +615,12 @@ def segment( compression="snappy", # Use snappy compression for speed write_index=False, # Skip writing index if not needed append=False, # Set to True if you're appending to an existing Parquet file - overwrite=True + overwrite=True, ) # Dask handles Parquet well if verbose: elapsed_time = time() - step_start_time print(f"Saved trasncripts.parquet in {elapsed_time:.2f} seconds.") - + if save_anndata: if verbose: step_start_time = time() @@ -663,11 +656,11 @@ def segment( "save_transcripts": save_transcripts, "save_anndata": save_anndata, "save_cell_masks": save_cell_masks, - "timestamp": datetime.now().isoformat() + "timestamp": datetime.now().isoformat(), } log_path = save_dir / "segmentation_log.json" - with open(log_path, 'w') as log_file: + with open(log_path, "w") as log_file: json.dump(log_data, log_file, indent=4) # Step 7: Garbage collection and memory cleanup diff --git a/src/segger/validation/utils.py b/src/segger/validation/utils.py index 6a4ef33..5a8c28a 100644 --- a/src/segger/validation/utils.py +++ b/src/segger/validation/utils.py @@ -556,11 +556,11 @@ def load_segmentations(segmentation_paths: Dict[str, Path]) -> Dict[str, sc.AnnD for method, path in segmentation_paths.items(): adata = sc.read(path) # Special handling for 'segger' to separate into 'segger_n0' and 'segger_n1' - if 
method == 'segger': - cells_n1 = [i for i in adata.obs_names if not i.endswith('-nx')] - cells_n0 = [i for i in adata.obs_names if i.endswith('-nx')] - segmentations_dict['segger_n1'] = adata[cells_n1, :] - segmentations_dict['segger_n0'] = adata[cells_n0, :] + if method == "segger": + cells_n1 = [i for i in adata.obs_names if not i.endswith("-nx")] + cells_n0 = [i for i in adata.obs_names if i.endswith("-nx")] + segmentations_dict["segger_n1"] = adata[cells_n1, :] + segmentations_dict["segger_n0"] = adata[cells_n0, :] segmentations_dict[method] = adata return segmentations_dict From cb79d9b03a784619a07b21940848c22e0f046cd2 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Wed, 16 Oct 2024 09:25:12 +0200 Subject: [PATCH 099/156] fixed a bug for predict_parquet --- scripts/predict_model_sample.py | 5 +- scripts/sandbox.py | 146 ++++++++++++- src/segger/prediction/predict_parquet.py | 252 +++++++++++++---------- 3 files changed, 291 insertions(+), 112 deletions(-) diff --git a/scripts/predict_model_sample.py b/scripts/predict_model_sample.py index 872b536..3a53a11 100644 --- a/scripts/predict_model_sample.py +++ b/scripts/predict_model_sample.py @@ -36,7 +36,7 @@ model_path = models_dir / "lightning_logs" / f"version_{model_version}" model = load_model(model_path / "checkpoints") -receptive_field = {"k_bd": 4, "dist_bd": 12, "k_tx": 5, "dist_tx": 5} +receptive_field = {"k_bd": 4, "dist_bd": 20, "k_tx": 5, "dist_tx": 3} segment( model, @@ -51,8 +51,9 @@ # max_transcripts=1500, cell_id_col="segger_cell_id", use_cc=True, - knn_method="cuda", + knn_method='cuda', verbose=True, + gpu_ids=['0'] # client=client ) diff --git a/scripts/sandbox.py b/scripts/sandbox.py index 05418a3..e9e8287 100644 --- a/scripts/sandbox.py +++ b/scripts/sandbox.py @@ -1,7 +1,9 @@ -import pandas as pd -import seaborn as sns +import scanpy as sc import matplotlib.pyplot as plt -from pathlib import Path +import numpy as np +import seaborn as sns +import os +import pandas as pd # Define method colors method_colors = { @@ -237,3 +239,141 @@ # Close the figure plt.close() + + + + + + + + + + + +for method in method_colors.keys(): + # Set Seaborn style for minimalistic plots + # sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)}) + # Load your AnnData object (replace with your actual file) + adata = segmentations_dict[method] + # Assuming spatial coordinates are stored in 'spatial' and cell areas are in 'cell_area' + x = adata.obs['cell_centroid_x'] # Replace with actual x-coordinate key + y = -adata.obs['cell_centroid_y'] # Replace with actual y-coordinate key + cell_area = adata.obs['cell_area'] # Replace with actual cell area key + # adata = adata[adata.obs.celltype_major == 'CAFs'] + # Create the hexbin plot + # plt.figure(figsize=(8, 6)) + # Use a "cool" colormap like "coolwarm" or "plasma" for a smoother effect + vmax=np.percentile(cell_area, 99) + vmin=np.percentile(cell_area, 50) + cell_area = np.clip(cell_area, vmin, vmax) + hb = plt.hexbin(x, y, C=cell_area, gridsize=50, cmap="viridis", reduce_C_function=np.mean, norm=mcolors.LogNorm(vmin=20, vmax=100)) + # Add a colorbar with a minimalistic design + cb = plt.colorbar(hb, orientation='vertical') + cb.set_label('Average Cell Area', fontsize=10) + # Minimalistic design: Remove unnecessary spines and ticks + plt.gca().spines['top'].set_visible(False) + plt.gca().spines['right'].set_visible(False) + plt.gca().spines['left'].set_visible(False) + plt.gca().spines['bottom'].set_visible(False) + plt.gca().tick_params(left=False, bottom=False) + plt.gca().set_xticks([]) + 
plt.gca().set_yticks([]) + # Minimalistic labels and title with ample space + plt.xlabel('', fontsize=12) + plt.ylabel('', fontsize=12) + plt.title(method, fontsize=16, pad=20) + # Tight layout for better spacing + plt.tight_layout() + # Define the path where you want to save the figure + # figures_path = 'figures_path' # Replace with your actual path + # Ensure the directory exists + # os.makedirs(figures_path, exist_ok=True) + # Save the figure as a PDF in the specified directory + pdf_path = figures_path / f'average_cell_area_hexbin_{method}.pdf' + plt.savefig(pdf_path, format='pdf', bbox_inches='tight') + # Optionally, show the plot (if needed) + plt.show() + plt.close() + +import matplotlib.colors as mcolors + +for method in method_colors.keys(): + # Set Seaborn style for minimalistic plots + # sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)}) + # Load your AnnData object (replace with your actual file) + adata = segmentations_dict[method] + # Assuming spatial coordinates are stored in 'spatial' and cell areas are in 'cell_area' + x = adata.obs['cell_centroid_x'] # Replace with actual x-coordinate key + y = -adata.obs['cell_centroid_y'] # Replace with actual y-coordinate key + # vmin = adata.shape[0] / 3000 + # cell_area = np.log(adata.obs['cell_area']) # Replace with actual cell area key + # adata = adata[adata.obs.celltype_major == 'CAFs'] + # Create the hexbin plot + # plt.figure(figsize=(8, 6)) + # Use a "cool" colormap like "coolwarm" or "plasma" for a smoother effect + # vmax=np.percentile(cell_area, 90) + # vmin=np.percentile(cell_area, 50) + hb = plt.hexbin(x, y, gridsize=50, cmap="mako", mincnt=1, norm=mcolors.LogNorm(vmin=vmin)) + # Add a colorbar with a minimalistic design + cb = plt.colorbar(hb, orientation='vertical') + cb.set_label('# Cells', fontsize=10) + # Minimalistic design: Remove unnecessary spines and ticks + plt.gca().spines['top'].set_visible(False) + plt.gca().spines['right'].set_visible(False) + plt.gca().spines['left'].set_visible(False) + plt.gca().spines['bottom'].set_visible(False) + plt.gca().tick_params(left=False, bottom=False) + plt.gca().set_xticks([]) + plt.gca().set_yticks([]) + # Minimalistic labels and title with ample space + plt.xlabel('', fontsize=12) + plt.ylabel('', fontsize=12) + plt.title(method, fontsize=16, pad=20) + # Tight layout for better spacing + plt.tight_layout() + # Define the path where you want to save the figure + # figures_path = 'figures_path' # Replace with your actual path + # Ensure the directory exists + # os.makedirs(figures_path, exist_ok=True) + # Save the figure as a PDF in the specified directory + pdf_path = figures_path / f'cell_counts_hexbin_{method}.pdf' + plt.savefig(pdf_path, format='pdf', bbox_inches='tight') + # Optionally, show the plot (if needed) + plt.show() + plt.close() + + +for method in method_colors.keys(): + # Clear the current figure to prevent overlapping plots and legends + plt.clf() + # Load your AnnData object (replace with your actual file) + adata = segmentations_dict[method] + # Filter for CAFs + # Assuming spatial coordinates are stored in 'spatial' and cell areas are in 'cell_area' + x = adata.obs['cell_centroid_x'] # Replace with actual x-coordinate key + y = -adata.obs['cell_centroid_y'] # Replace with actual y-coordinate key + # Create a figure + # plt.figure(figsize=(8, 6)) + # Plot the KDE for just counts (density of points) using 'mako' colormap + sns.kdeplot(x=x, y=y, fill=True, thresh=0, levels=30, cmap="mako") + # Remove spines and make the plot minimalistic + 
plt.gca().spines['top'].set_visible(False) + plt.gca().spines['right'].set_visible(False) + plt.gca().spines['left'].set_visible(False) + plt.gca().spines['bottom'].set_visible(False) + plt.gca().tick_params(left=False, bottom=False) + plt.gca().set_xticks([]) + plt.gca().set_yticks([]) + # Minimalistic labels and title with ample space + plt.xlabel('', fontsize=12) + plt.ylabel('', fontsize=12) + plt.title(f'Density Count KDE Plot ({method})', fontsize=16, pad=20) + # Tight layout for better spacing + plt.tight_layout() + # Ensure the directory exists + os.makedirs(figures_path, exist_ok=True) + # Save the figure as a PDF in the specified directory + pdf_path = figures_path / f'density_kde_{method}.pdf' + plt.savefig(pdf_path, format='pdf', bbox_inches='tight') + # Close the current figure to prevent it from being displayed + plt.close() \ No newline at end of file diff --git a/src/segger/prediction/predict_parquet.py b/src/segger/prediction/predict_parquet.py index ea3cff1..941943b 100644 --- a/src/segger/prediction/predict_parquet.py +++ b/src/segger/prediction/predict_parquet.py @@ -55,7 +55,7 @@ from time import time from cupyx.scipy.sparse import coo_matrix as cp_coo_matrix from cupyx.scipy.sparse.csgraph import connected_components as cp_cc - +import random # Setup Dask cluster with 3 workers @@ -94,34 +94,44 @@ def zero_out_diagonal_gpu(sparse_matrix): # Function to subset rows and columns of a sparse matrix def subset_sparse_matrix(sparse_matrix, row_idx, col_idx): """ - Subsets a COO sparse matrix by selecting rows and columns corresponding to given indices. + Subset a sparse matrix using row and column indices. - Args: - sparse_matrix (cupyx.scipy.sparse.coo_matrix): The input sparse matrix. - row_idx (cupy.ndarray): The row indices to subset. - col_idx (cupy.ndarray): The column indices to subset. + Parameters: + sparse_matrix (cupyx.scipy.sparse.spmatrix): The input sparse matrix in COO, CSR, or CSC format. + row_idx (cupy.ndarray): Row indices to keep in the subset. + col_idx (cupy.ndarray): Column indices to keep in the subset. Returns: - cupyx.scipy.sparse.coo_matrix: Subset sparse matrix. + cupyx.scipy.sparse.spmatrix: A new sparse matrix that is a subset of the input matrix. 
""" - # Filter out the elements where both the row and column match the given indices + + # Convert indices to CuPy arrays if not already + row_idx = cp.asarray(row_idx) + col_idx = cp.asarray(col_idx) + + # Ensure sparse matrix is in COO format for easy indexing (you can use CSR/CSC if more optimal) + sparse_matrix = sparse_matrix.tocoo() + + # Create boolean masks for the row and column indices row_mask = cp.isin(sparse_matrix.row, row_idx) col_mask = cp.isin(sparse_matrix.col, col_idx) - combined_mask = row_mask & col_mask - # Create the subset sparse matrix - new_data = sparse_matrix.data[combined_mask] - new_row = sparse_matrix.row[combined_mask] - new_col = sparse_matrix.col[combined_mask] + # Apply masks to filter the data, row, and column arrays + mask = row_mask & col_mask + row_filtered = sparse_matrix.row[mask] + col_filtered = sparse_matrix.col[mask] + data_filtered = sparse_matrix.data[mask] - # Map the new row and col indices to the range of the subset - row_map = cp.searchsorted(row_idx, new_row) - col_map = cp.searchsorted(col_idx, new_col) + # Map the row and col indices to the new submatrix indices + row_mapped = cp.searchsorted(row_idx, row_filtered) + col_mapped = cp.searchsorted(col_idx, col_filtered) # Return the new subset sparse matrix return coo_matrix((new_data, (row_map, col_map)), shape=(len(row_idx), len(col_idx))) + + def load_model(checkpoint_path: str) -> LitSegger: """ Load a LitSegger model from a checkpoint. @@ -169,7 +179,13 @@ def sort_order(c): def get_similarity_scores( - model: torch.nn.Module, batch: Batch, from_type: str, to_type: str, receptive_field: dict, knn_method: str = "cuda" + model: torch.nn.Module, + batch: Batch, + from_type: str, + to_type: str, + receptive_field: dict, + knn_method: str = 'cuda', + gpu_id: int = 0 # Added argument for GPU ID ) -> coo_matrix: """ Compute similarity scores between embeddings for 'from_type' and 'to_type' nodes @@ -181,60 +197,67 @@ def get_similarity_scores( from_type (str): The type of node from which the similarity is computed. to_type (str): The type of node to which the similarity is computed. knn_method (str, optional): The method to use for nearest neighbors. Defaults to 'cuda'. + gpu_id (int, optional): The GPU ID to use for the computations. Defaults to 0. Returns: coo_matrix: A sparse matrix containing the similarity scores between 'from_type' and 'to_type' nodes. 
""" - # Keep everything on GPU until final results - batch = batch.to("cuda") - - # Step 1: Get embeddings from the model (on GPU) - shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] - - # Compute edge indices using knn method (still on GPU) - edge_index = get_edge_index( - batch[to_type].pos[:, :2], # 'tx' positions - batch[from_type].pos[:, :2], # 'bd' positions - k=receptive_field[f"k_{to_type}"], - dist=receptive_field[f"dist_{to_type}"], - method=knn_method, - ) - - # Convert to dense adjacency matrix (on GPU) - edge_index = coo_to_dense_adj(edge_index.T, num_nodes=shape[0], num_nbrs=receptive_field[f"k_{to_type}"]) - - with torch.no_grad(): - embeddings = model(batch.x_dict, batch.edge_index_dict) - - def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: - m = torch.nn.ZeroPad2d((0, 0, 0, 1)) # pad bottom with zeros - - similarity = torch.bmm( - m(embeddings[to_type])[edge_index], # 'to' x 'from' neighbors x embed - embeddings[from_type].unsqueeze(-1), # 'to' x embed x 1 - ) # -> 'to' x 'from' neighbors x 1 - del embeddings - # Sigmoid to get most similar 'to_type' neighbor - similarity[similarity == 0] = -torch.inf # ensure zero stays zero - similarity = F.sigmoid(similarity) - # Neighbor-filtered similarity scores - # shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] - indices = torch.argwhere(edge_index != -1).T - indices[1] = edge_index[edge_index != -1] - rows = cp.fromDlpack(to_dlpack(indices[0, :].to("cuda"))) - columns = cp.fromDlpack(to_dlpack(indices[1, :].to("cuda"))) - # print(rows) - del indices - values = similarity[edge_index != -1].flatten() - sparse_result = coo_matrix((cp.fromDlpack(to_dlpack(values)), (rows, columns)), shape=shape) - return sparse_result - # Free GPU memory after computation - - # Call the sparse multiply function - sparse_similarity = sparse_multiply(embeddings, edge_index, shape) - + # Set the specified GPU device for CuPy operations + with cp.cuda.Device(gpu_id): + # Move the batch to the specified GPU + batch = batch.to(f'cuda:{gpu_id}') + + # Step 1: Get embeddings from the model (on GPU) + shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] + + # Compute edge indices using knn method (still on GPU) + edge_index = get_edge_index( + batch[to_type].pos[:, :2], # 'tx' positions + batch[from_type].pos[:, :2], # 'bd' positions + k=receptive_field[f'k_{to_type}'], + dist=receptive_field[f'dist_{to_type}'], + method=knn_method + ) + + # Convert to dense adjacency matrix (on GPU) + edge_index = coo_to_dense_adj( + edge_index.T, + num_nodes=shape[0], + num_nbrs=receptive_field[f'k_{to_type}'] + ) + + with torch.no_grad(): + embeddings = model(batch.x_dict, batch.edge_index_dict) + + def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: + m = torch.nn.ZeroPad2d((0, 0, 0, 1)) # pad bottom with zeros + + similarity = torch.bmm( + m(embeddings[to_type])[edge_index], # 'to' x 'from' neighbors x embed + embeddings[from_type].unsqueeze(-1) # 'to' x embed x 1 + ) # -> 'to' x 'from' neighbors x 1 + del embeddings + # Sigmoid to get most similar 'to_type' neighbor + similarity[similarity == 0] = -torch.inf # ensure zero stays zero + similarity = F.sigmoid(similarity) + # Neighbor-filtered similarity scores + # shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] + indices = torch.argwhere(edge_index != -1).T + indices[1] = edge_index[edge_index != -1] + rows = cp.fromDlpack(to_dlpack(indices[0,:].to('cuda'))) + columns = cp.fromDlpack(to_dlpack(indices[1,:].to('cuda'))) + # print(rows) + del 
indices + values = similarity[edge_index != -1].flatten() + sparse_result = coo_matrix((cp.fromDlpack(to_dlpack(values)), (rows, columns)), shape=shape) + return sparse_result + # Free GPU memory after computation + + # Call the sparse multiply function + sparse_similarity = sparse_multiply(embeddings, edge_index, shape) + return sparse_similarity @@ -244,11 +267,11 @@ def predict_batch( score_cut: float, receptive_field: Dict[str, float], use_cc: bool = True, - knn_method: str = "cuda", - output_ddf: dask_cudf.DataFrame = None, + knn_method: str = 'cuda', edge_index_save_path: Union[str, Path] = None, output_ddf_save_path: Union[str, Path] = None, -) -> dask_cudf.DataFrame: + gpu_id: int = 0 # Added argument for GPU ID +): """ Predict cell assignments for a batch of transcript data using a segmentation model. Writes both the assignments and edge_index directly into Parquet files incrementally. @@ -262,22 +285,21 @@ def predict_batch( use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. Defaults to True. knn_method (str, optional): The method to use for nearest neighbors. Defaults to 'cuda'. - output_ddf (dask_cudf.DataFrame, optional): Dask-CuDF DataFrame to accumulate and store transcript assignments. edge_index_save_path (str, optional): Path to the Parquet file where edge indices are saved incrementally. output_ddf_save_path (str, optional): Path to the Parquet file where transcript assignments (`output_ddf`) are saved incrementally. - - Returns: - dask_cudf.DataFrame: Updated Dask-CuDF DataFrame for assignments. + gpu_id (int, optional): The GPU ID to use for the computations. Defaults to 0. """ def _get_id(): """Generate a random Xenium-style ID.""" return "".join(np.random.choice(list("abcdefghijklmnopqrstuvwxyz"), 8)) + "-nx" - with cp.cuda.Device(0): - # Move batch to GPU - batch = batch.to("cuda") + print(gpu_id) + with cp.cuda.Device(gpu_id): + # Move the batch to the specified GPU + batch = batch.to(f'cuda:{gpu_id}') + lit_segger.model = lit_segger.model.to(f'cuda:{gpu_id}') # Extract transcript IDs and initialize a dictionary for assignments transcript_id = batch["tx"].id.cpu().numpy().astype("str") @@ -285,7 +307,7 @@ def _get_id(): if len(batch["bd"].pos) >= 10: # Step 1: Compute similarity scores between 'tx' (transcripts) and 'bd' (boundaries) - scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd", receptive_field, knn_method=knn_method) + scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd", receptive_field, knn_method=knn_method, gpu_id=gpu_id) torch.cuda.empty_cache() # Convert sparse matrix to dense format (on GPU) @@ -310,9 +332,7 @@ def _get_id(): # Step 3: Handle unassigned transcripts with connected components (if use_cc=True) if use_cc: - scores_tx = get_similarity_scores( - lit_segger.model, batch, "tx", "tx", receptive_field, knn_method=knn_method - ) + scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx", receptive_field, knn_method=knn_method, gpu_id=gpu_id) # Stay on GPU and use CuPy sparse matrices no_id_scores = cupyx.scipy.sparse.coo_matrix( @@ -320,19 +340,19 @@ def _get_id(): ) # Apply threshold on GPU - no_id_scores.data[no_id_scores.data <= score_cut] = 0 # Apply threshold - no_id_scores.eliminate_zeros() # Remove zero entries to keep the matrix sparse + no_id_scores.data[no_id_scores.data < score_cut] = 0 # Apply threshold # Zero out the diagonal on GPU no_id_scores = zero_out_diagonal_gpu(no_id_scores) + no_id_scores.eliminate_zeros() # Remove zero entries to 
keep the matrix sparse # Find unassigned transcripts (those with no segger_cell_id) - no_id = cp.asarray(assignments["segger_cell_id"] == None) # Using CuPy to handle None values - - if cp.any(no_id): # Only compute if there are unassigned transcripts + no_id = cp.where(cp.asarray(assignments['segger_cell_id'] == None))[0] # Using CuPy to handle None values + + if len(no_id) > 0: # Only compute if there are unassigned transcripts # Apply score cut-off to unassigned transcripts - no_id_scores = subset_sparse_matrix(no_id_scores, no_id, no_id) - no_id_scores.data[no_id_scores.data <= score_cut] = 0 # Apply threshold + no_id_scores = subset_sparse_matrix(no_id_scores, no_id, no_id) + no_id_scores.data[no_id_scores.data < score_cut] = 0 # Apply threshold no_id_scores.eliminate_zeros() # Clean up zeros # Find the non-zero entries in the no_id_scores to construct edge_index @@ -343,12 +363,10 @@ def _get_id(): source_nodes = unassigned_ids[non_zero_rows.get()] target_nodes = unassigned_ids[non_zero_cols.get()] - # Save edge_index using CuDF and Dask-CuDF for GPU acceleration - edge_index_df = cudf.DataFrame({"source": source_nodes, "target": target_nodes}) - edge_index_ddf = dask_cudf.from_cudf(edge_index_df, npartitions=1) - - # Use delayed for asynchronous disk writing of edge_index - delayed_write_edge_index = delayed(edge_index_ddf.to_parquet)(edge_index_save_path, append=True) + # # Save edge_index using CuDF and Dask-CuDF for GPU acceleration + edge_index_ddf = delayed(dd.from_pandas)(pd.DataFrame({'source': source_nodes, 'target': target_nodes}), npartitions=1) + # Use delayed for asynchronous disk writing of edge_index in Dask DataFrame + delayed_write_edge_index = delayed(edge_index_ddf.to_parquet)(edge_index_save_path, append=True, ignore_divisions=True) delayed_write_edge_index.persist() # Schedule writing assignments = { @@ -358,13 +376,8 @@ def _get_id(): "bound": assignments["bound"].astype("int8"), # Ensure 'int64' dtype } # Step 4: Convert assignments to Dask-CuDF DataFrame for this batch - batch_ddf = dask_cudf.from_cudf(cudf.DataFrame(assignments), npartitions=1) - - # # Append batch to output_ddf, adding it as a new partition - # if output_ddf is None: - # output_ddf = batch_ddf # Initialize if empty - # else: - # output_ddf = dask_cudf.concat([output_ddf, batch_ddf], interleave_partitions=True) + # batch_ddf = dask_cudf.from_cudf(cudf.DataFrame(assignments), npartitions=1) + batch_ddf = delayed(dd.from_pandas)(pd.DataFrame(assignments), npartitions=1) # Save the updated `output_ddf` asynchronously using Dask delayed delayed_write_output_ddf = delayed(batch_ddf.to_parquet)( @@ -376,8 +389,6 @@ def _get_id(): cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory torch.cuda.empty_cache() - return output_ddf - def segment( model: LitSegger, @@ -393,7 +404,8 @@ def segment( receptive_field: dict = {"k_bd": 4, "dist_bd": 10, "k_tx": 5, "dist_tx": 3}, knn_method: str = "cuda", verbose: bool = False, - **anndata_kwargs, + gpu_ids: list = ['0'], + **anndata_kwargs ) -> None: """ Perform segmentation using the model, save transcripts, AnnData, and cell masks as needed, @@ -453,8 +465,33 @@ def segment( val_dataloader = dm.val_dataloader() test_dataloader = dm.test_dataloader() - # Initialize Dask DataFrame for assignments - output_ddf = None + # # Initialize Dask DataFrame for assignments + # output_ddf = None + + # @dask.delayed + # def process_batch(batch, gpu_id): + # # Assume you're using CuPy, and you need to use a specific GPU + # predict_batch( + # model, + # batch, 
+ # score_cut, + # receptive_field, + # use_cc=use_cc, + # knn_method=knn_method, + # edge_index_save_path=edge_index_save_path, + # output_ddf_save_path=output_ddf_save_path, + # gpu_id=gpu_id + # ) + + # delayed_tasks = [process_batch(batch, gpu_ids[i % len(gpu_ids)]) for i, batch in enumerate(dm.train)] + # # pqdm(delayed_tasks, n_jobs=len(gpu_ids), argument_type='delayed', progress_bar=True) + # # dask.compute(*delayed_tasks) + # # delayed_tasks = [process_batch(batch, gpu_ids[i % len(gpu_ids)]) for i, batch in enumerate(batches)] + + # # Use tqdm for progress bar + # with ProgressBar(): + # # Execute the delayed tasks with a Dask compute call + # dask.compute(*delayed_tasks) # Loop through the data loaders (train, val, and test) for loader_name, loader in zip( @@ -464,18 +501,19 @@ def segment( if verbose: print(f"Processing {loader_name} data...") - for batch in tqdm(loader, desc=f"Processing {loader_name} batches"): + for batch in tqdm(loader, desc=f'Processing {loader_name} batches'): + gpu_id = random.choice(gpu_ids) # Call predict_batch for each batch - output_ddf = predict_batch( + predict_batch( model, batch, score_cut, receptive_field, use_cc=use_cc, knn_method=knn_method, - output_ddf=output_ddf, edge_index_save_path=edge_index_save_path, output_ddf_save_path=output_ddf_save_path, + gpu_id=gpu_id ) if verbose: From ceeba813a1c7372e15788545d2e7bb3457a3b451 Mon Sep 17 00:00:00 2001 From: daniel-unyi-42 <63173826+daniel-unyi-42@users.noreply.github.com> Date: Wed, 16 Oct 2024 10:29:43 +0200 Subject: [PATCH 100/156] Jobs are waiting for previous jobs to finish --- submit_job.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/submit_job.sh b/submit_job.sh index d2f0180..0a870ed 100644 --- a/submit_job.sh +++ b/submit_job.sh @@ -41,8 +41,8 @@ SINGULARITY_IMAGE="segger_dev_latest.sif" # Path to the Singularity image # Functions to run different pipelines run_data_processing() { - bsub -o "$OUTPUT_LOG_PREPROCESS" -n "$N_WORKERS_PREPROCESS" -R "rusage[mem=$RAM_PREPROCESS]" -q long \ - "singularity exec --bind $LOCAL_REPO_DIR:$CONTAINER_DIR \ + bsub -J "job_data_processing" -o "$OUTPUT_LOG_PREPROCESS" -n "$N_WORKERS_PREPROCESS" -R "rusage[mem=$RAM_PREPROCESS]" -q long \ + "singularity exec --bind $LOCAL_REPO_DIR:$CONTAINER_DIR --pwd $CONTAINER_DIR \ $SINGULARITY_IMAGE python3 src/segger/cli/create_dataset_fast.py \ --base_dir '$BASE_DIR' \ --data_dir '$DATA_DIR' \ @@ -53,8 +53,8 @@ run_data_processing() { } run_training() { - bsub -o "$OUTPUT_LOG_TRAIN" -n "$N_WORKERS_TRAIN" -R "rusage[mem=$RAM_TRAIN]" -R "tensorcore" -gpu "num=$GPUS:j_exclusive=no:gmem=$GPU_MEM_TRAIN" -q gpu \ - "singularity exec --nv --bind $LOCAL_REPO_DIR:$CONTAINER_DIR \ + bsub -J "job_training" -w "done(job_data_processing)" -o "$OUTPUT_LOG_TRAIN" -n "$N_WORKERS_TRAIN" -R "rusage[mem=$RAM_TRAIN]" -R "tensorcore" -gpu "num=$GPUS:j_exclusive=no:gmem=$GPU_MEM_TRAIN" -q gpu \ + "singularity exec --nv --bind $LOCAL_REPO_DIR:$CONTAINER_DIR --pwd $CONTAINER_DIR \ $SINGULARITY_IMAGE python3 src/segger/cli/train_model.py \ --dataset_dir '$DATASET_DIR' \ --models_dir '$MODELS_DIR' \ @@ -64,8 +64,8 @@ run_training() { } run_prediction() { - bsub -o "$OUTPUT_LOG_PREDICT" -n "$N_WORKERS_PREDICT" -R "rusage[mem=$RAM_PREDICT]" -R "tensorcore" -gpu "num=1:j_exclusive=no:gmem=$GPU_MEM_PREDICT" -q gpu \ - "singularity exec --nv --bind $LOCAL_REPO_DIR:$CONTAINER_DIR \ + bsub -J "job_prediction" -w "done(job_training)" -o "$OUTPUT_LOG_PREDICT" -n "$N_WORKERS_PREDICT" -R "rusage[mem=$RAM_PREDICT]" -R 
"tensorcore" -gpu "num=1:j_exclusive=no:gmem=$GPU_MEM_PREDICT" -q gpu \ + "singularity exec --nv --bind $LOCAL_REPO_DIR:$CONTAINER_DIR --pwd $CONTAINER_DIR \ $SINGULARITY_IMAGE python3 src/segger/cli/predict.py \ --segger_data_dir '$SEGGER_DATA_DIR' \ --models_dir '$MODELS_DIR' \ From 4d3e4e387fc1ed137382baf86792d6a13fc4c0aa Mon Sep 17 00:00:00 2001 From: daniel-unyi-42 <63173826+daniel-unyi-42@users.noreply.github.com> Date: Wed, 16 Oct 2024 10:39:41 +0200 Subject: [PATCH 101/156] Transcript_id overflows, must be torch.long --- src/segger/data/parquet/sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index 6937a72..729632a 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -1127,7 +1127,7 @@ def to_pyg_dataset( # Set up Transcript nodes pyg_data["tx"].id = torch.tensor( self.transcripts[self.settings.transcripts.id].values.astype(int), - dtype=torch.int, + dtype=torch.long, ) pyg_data["tx"].pos = torch.tensor( self.transcripts[self.settings.transcripts.xyz].values, From e20b9761aa435eb5a670f793acc366b838915dce Mon Sep 17 00:00:00 2001 From: daniel-unyi-42 <63173826+daniel-unyi-42@users.noreply.github.com> Date: Wed, 16 Oct 2024 10:53:58 +0200 Subject: [PATCH 102/156] Update utils.py --- src/segger/data/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/segger/data/utils.py b/src/segger/data/utils.py index b673a87..7c2d83e 100644 --- a/src/segger/data/utils.py +++ b/src/segger/data/utils.py @@ -409,7 +409,10 @@ def torch_to_cupy(tensor): ) # , compression=compression_params) search_params = cagra.SearchParams() # Build index using CuPy coords - index = cagra.build_index(index_params, cp_coords_1) + try: + index = cagra.build(index_params, cp_coords_1) + except AttributeError: + index = cagra.build_index(index_params, cp_coords_1) # Perform search to get distances and indices (still in CuPy) D, I = cagra.search(search_params, index, cp_coords_2, k) # Boolean mask for filtering distances below the squared threshold (all in CuPy) From 78453ea5dca1ac55d519791870d197a18011093c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Wed, 16 Oct 2024 11:30:09 +0200 Subject: [PATCH 103/156] Submit jobs using yaml config --- config.yaml | 39 +++++++++++++++++++ submit_job.py | 77 ++++++++++++++++++++++++++++++++++++++ submit_job.sh | 101 -------------------------------------------------- 3 files changed, 116 insertions(+), 101 deletions(-) create mode 100644 config.yaml create mode 100644 submit_job.py delete mode 100644 submit_job.sh diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..ef5806c --- /dev/null +++ b/config.yaml @@ -0,0 +1,39 @@ +paths: + local_repo_dir: "/omics/groups/OE0540/internal_temp/users/danielu/segger_dev" + container_dir: "/workspace/segger_dev" + singularity_image: "segger_dev_latest.sif" + +pipelines: + - 1 # Run data processing + - 2 # Run training + - 3 # Run prediction + +preprocessing: + output_log: "preprocess_output.log" + base_dir: "data_xenium" + data_dir: "data_segger_test" + sample_type: "xenium" + tile_width: 120 + tile_height: 120 + workers: 16 + memory: "16G" + +training: + output_log: "train_output.log" + dataset_dir: "data_segger_test" + models_dir: "model_dir" + sample_tag: "first_training" + workers: 16 + memory: "16G" + gpus: 8 + gpu_memory: "8G" + +prediction: + output_log: "predict_output.log" + segger_data_dir: "data_segger_test" + benchmarks_dir: 
"benchmark_dir" + transcripts_file: "data_xenium/transcripts.parquet" + knn_method: "cuda" + workers: 16 + memory: "16G" + gpu_memory: "8G" diff --git a/submit_job.py b/submit_job.py new file mode 100644 index 0000000..0b99c63 --- /dev/null +++ b/submit_job.py @@ -0,0 +1,77 @@ +import yaml +import subprocess + +# Load the YAML configuration file +with open('config.yaml', 'r') as file: + config = yaml.safe_load(file) + +# Define the pipeline functions +def run_data_processing(): + subprocess.run([ + "bsub", "-J", "job_data_processing", "-o", config['preprocessing']['output_log'], + "-n", str(config['preprocessing']['workers']), + "-R", f"rusage[mem={config['preprocessing']['memory']}]", + "-q", "long", + "singularity", "exec", "--bind", + f"{config['paths']['local_repo_dir']}:{config['paths']['container_dir']}", + "--pwd", config['paths']['container_dir'], + config['paths']['singularity_image'], "python3", "src/segger/cli/create_dataset_fast.py", + "--base_dir", config['preprocessing']['base_dir'], + "--data_dir", config['preprocessing']['data_dir'], + "--sample_type", config['preprocessing']['sample_type'], + "--tile_width", str(config['preprocessing']['tile_width']), + "--tile_height", str(config['preprocessing']['tile_height']), + "--n_workers", str(config['preprocessing']['workers']) + ]) + +def run_training(): + subprocess.run([ + "bsub", "-J", "job_training", "-w", "done(job_data_processing)", + "-o", config['training']['output_log'], + "-n", str(config['training']['workers']), + "-R", f"rusage[mem={config['training']['memory']}]", + "-R", "tensorcore", + "-gpu", f"num={config['training']['gpus']}:j_exclusive=no:gmem={config['training']['gpu_memory']}", + "-q", "gpu", + "singularity", "exec", "--nv", "--bind", + f"{config['paths']['local_repo_dir']}:{config['paths']['container_dir']}", + "--pwd", config['paths']['container_dir'], + config['paths']['singularity_image'], "python3", "src/segger/cli/train_model.py", + "--dataset_dir", config['training']['dataset_dir'], + "--models_dir", config['training']['models_dir'], + "--sample_tag", config['training']['sample_tag'], + "--num_workers", str(config['training']['workers']), + "--devices", str(config['training']['gpus']) + ]) + +def run_prediction(): + subprocess.run([ + "bsub", "-J", "job_prediction", "-w", "done(job_training)", + "-o", config['prediction']['output_log'], + "-n", str(config['prediction']['workers']), + "-R", f"rusage[mem={config['prediction']['memory']}]", + "-R", "tensorcore", + "-gpu", f"num=1:j_exclusive=no:gmem={config['prediction']['gpu_memory']}", + "-q", "gpu", + "singularity", "exec", "--nv", "--bind", + f"{config['paths']['local_repo_dir']}:{config['paths']['container_dir']}", + "--pwd", config['paths']['container_dir'], + config['paths']['singularity_image'], "python3", "src/segger/cli/predict.py", + "--segger_data_dir", config['prediction']['segger_data_dir'], + "--benchmarks_dir", config['prediction']['benchmarks_dir'], + "--transcripts_file", config['prediction']['transcripts_file'], + "--knn_method", config['prediction']['knn_method'], + "--num_workers", str(config['prediction']['workers']) + ]) + +# Main script logic +pipelines = config.get('pipelines', []) +for pipeline in pipelines: + if pipeline == 1: + run_data_processing() + elif pipeline == 2: + run_training() + elif pipeline == 3: + run_prediction() + else: + print(f"Invalid pipeline number: {pipeline}") diff --git a/submit_job.sh b/submit_job.sh deleted file mode 100644 index 0a870ed..0000000 --- a/submit_job.sh +++ /dev/null @@ -1,101 +0,0 @@ 
-#!/bin/bash - -# To acquire the Singularity image, run: -# singularity pull docker://danielunyi42/segger_dev - -# Pipeline 1: Data Preprocessing Parameters -OUTPUT_LOG_PREPROCESS="preprocess_output.log" # Path to the output log file for data preprocessing -BASE_DIR="data_xenium" # Base directory for input data -DATA_DIR="data_segger" # Directory for output data -SAMPLE_TYPE="xenium" # Type of sample being processed -TILE_WIDTH=120 # Width of each data tile -TILE_HEIGHT=120 # Height of each data tile -N_WORKERS_PREPROCESS=16 # Number of workers for parallel processing -RAM_PREPROCESS="16G" # Total memory requested for the job - -# Pipeline 2: Training Parameters -OUTPUT_LOG_TRAIN="train_output.log" # Path to the output log file for training -DATASET_DIR="data_segger" # Directory for dataset -MODELS_DIR="model_dir" # Directory to save models -SAMPLE_TAG="first_training" # Tag for the training sample -N_WORKERS_TRAIN=16 # Number of CPUs to request -RAM_TRAIN="16G" # Amount of memory to request -GPUS=8 # Number of GPUs to request -GPU_MEM_TRAIN="8G" # Amount of memory per GPU - -# Pipeline 3: Prediction Parameters -OUTPUT_LOG_PREDICT="predict_output.log" # Path to the output log file for prediction -SEGGER_DATA_DIR="data_segger" # Directory containing the segger data -MODELS_DIR="model_dir" # Directory containing the trained models -BENCHMARKS_DIR="benchmark_dir" # Directory for saving the benchmark results -TRANSCRIPTS_FILE="data_xenium" # Path to the transcripts file -KNN_METHOD="cuda" # Method for KNN search -N_WORKERS_PREDICT=16 # Number of CPUs to request -RAM_PREDICT="16G" # Amount of memory to request -GPU_MEM_PREDICT="8G" # Amount of memory for GPU - -# Paths and common variables -LOCAL_REPO_DIR="/omics/groups/OE0540/internal_temp/users/danielu/segger_dev" # Where the segger_dev repository is located on the local machine -CONTAINER_DIR="/workspace/segger_dev" # Where the segger_dev repository is located in the container -SINGULARITY_IMAGE="segger_dev_latest.sif" # Path to the Singularity image - -# Functions to run different pipelines -run_data_processing() { - bsub -J "job_data_processing" -o "$OUTPUT_LOG_PREPROCESS" -n "$N_WORKERS_PREPROCESS" -R "rusage[mem=$RAM_PREPROCESS]" -q long \ - "singularity exec --bind $LOCAL_REPO_DIR:$CONTAINER_DIR --pwd $CONTAINER_DIR \ - $SINGULARITY_IMAGE python3 src/segger/cli/create_dataset_fast.py \ - --base_dir '$BASE_DIR' \ - --data_dir '$DATA_DIR' \ - --sample_type '$SAMPLE_TYPE' \ - --tile_width $TILE_WIDTH \ - --tile_height $TILE_HEIGHT \ - --n_workers $N_WORKERS_PREPROCESS" -} - -run_training() { - bsub -J "job_training" -w "done(job_data_processing)" -o "$OUTPUT_LOG_TRAIN" -n "$N_WORKERS_TRAIN" -R "rusage[mem=$RAM_TRAIN]" -R "tensorcore" -gpu "num=$GPUS:j_exclusive=no:gmem=$GPU_MEM_TRAIN" -q gpu \ - "singularity exec --nv --bind $LOCAL_REPO_DIR:$CONTAINER_DIR --pwd $CONTAINER_DIR \ - $SINGULARITY_IMAGE python3 src/segger/cli/train_model.py \ - --dataset_dir '$DATASET_DIR' \ - --models_dir '$MODELS_DIR' \ - --sample_tag '$SAMPLE_TAG' \ - --num_workers $N_WORKERS_TRAIN \ - --devices $GPUS" -} - -run_prediction() { - bsub -J "job_prediction" -w "done(job_training)" -o "$OUTPUT_LOG_PREDICT" -n "$N_WORKERS_PREDICT" -R "rusage[mem=$RAM_PREDICT]" -R "tensorcore" -gpu "num=1:j_exclusive=no:gmem=$GPU_MEM_PREDICT" -q gpu \ - "singularity exec --nv --bind $LOCAL_REPO_DIR:$CONTAINER_DIR --pwd $CONTAINER_DIR \ - $SINGULARITY_IMAGE python3 src/segger/cli/predict.py \ - --segger_data_dir '$SEGGER_DATA_DIR' \ - --models_dir '$MODELS_DIR' \ - --benchmarks_dir 
'$BENCHMARKS_DIR' \ - --transcripts_file '$TRANSCRIPTS_FILE' \ - --knn_method '$KNN_METHOD' \ - --num_workers $N_WORKERS_PREDICT" -} - -# Main script logic -echo "Which pipelines would you like to run? (1: Data Processing, 2: Training, 3: Prediction)" -echo "Enter the pipeline numbers you want to run (e.g., '1 2 3' for all, or '1' for only data processing):" -read -r pipelines - -for pipeline in $pipelines; do - case $pipeline in - 1) - echo "Running Data Processing..." - run_data_processing - ;; - 2) - echo "Running Training..." - run_training - ;; - 3) - echo "Running Prediction..." - run_prediction - ;; - *) - echo "Invalid choice: $pipeline" - ;; - esac -done From 3a5a0aa8a4f127529e9b0f1f2c9f1260c6db0622 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 16 Oct 2024 09:31:14 +0000 Subject: [PATCH 104/156] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- config.yaml | 6 +- scripts/predict_model_sample.py | 4 +- scripts/sandbox.py | 97 ++++++------ src/segger/prediction/predict_parquet.py | 93 ++++++------ submit_job.py | 178 ++++++++++++++++------- 5 files changed, 226 insertions(+), 152 deletions(-) diff --git a/config.yaml b/config.yaml index ef5806c..99cf1d6 100644 --- a/config.yaml +++ b/config.yaml @@ -4,9 +4,9 @@ paths: singularity_image: "segger_dev_latest.sif" pipelines: - - 1 # Run data processing - - 2 # Run training - - 3 # Run prediction + - 1 # Run data processing + - 2 # Run training + - 3 # Run prediction preprocessing: output_log: "preprocess_output.log" diff --git a/scripts/predict_model_sample.py b/scripts/predict_model_sample.py index 3a53a11..00b133c 100644 --- a/scripts/predict_model_sample.py +++ b/scripts/predict_model_sample.py @@ -51,9 +51,9 @@ # max_transcripts=1500, cell_id_col="segger_cell_id", use_cc=True, - knn_method='cuda', + knn_method="cuda", verbose=True, - gpu_ids=['0'] + gpu_ids=["0"], # client=client ) diff --git a/scripts/sandbox.py b/scripts/sandbox.py index e9e8287..c155e38 100644 --- a/scripts/sandbox.py +++ b/scripts/sandbox.py @@ -241,46 +241,45 @@ plt.close() - - - - - - - - - for method in method_colors.keys(): # Set Seaborn style for minimalistic plots # sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)}) # Load your AnnData object (replace with your actual file) adata = segmentations_dict[method] # Assuming spatial coordinates are stored in 'spatial' and cell areas are in 'cell_area' - x = adata.obs['cell_centroid_x'] # Replace with actual x-coordinate key - y = -adata.obs['cell_centroid_y'] # Replace with actual y-coordinate key - cell_area = adata.obs['cell_area'] # Replace with actual cell area key + x = adata.obs["cell_centroid_x"] # Replace with actual x-coordinate key + y = -adata.obs["cell_centroid_y"] # Replace with actual y-coordinate key + cell_area = adata.obs["cell_area"] # Replace with actual cell area key # adata = adata[adata.obs.celltype_major == 'CAFs'] # Create the hexbin plot # plt.figure(figsize=(8, 6)) # Use a "cool" colormap like "coolwarm" or "plasma" for a smoother effect - vmax=np.percentile(cell_area, 99) - vmin=np.percentile(cell_area, 50) + vmax = np.percentile(cell_area, 99) + vmin = np.percentile(cell_area, 50) cell_area = np.clip(cell_area, vmin, vmax) - hb = plt.hexbin(x, y, C=cell_area, gridsize=50, cmap="viridis", reduce_C_function=np.mean, norm=mcolors.LogNorm(vmin=20, vmax=100)) + hb = plt.hexbin( + x, + y, + C=cell_area, + gridsize=50, + cmap="viridis", + 
reduce_C_function=np.mean, + norm=mcolors.LogNorm(vmin=20, vmax=100), + ) # Add a colorbar with a minimalistic design - cb = plt.colorbar(hb, orientation='vertical') - cb.set_label('Average Cell Area', fontsize=10) + cb = plt.colorbar(hb, orientation="vertical") + cb.set_label("Average Cell Area", fontsize=10) # Minimalistic design: Remove unnecessary spines and ticks - plt.gca().spines['top'].set_visible(False) - plt.gca().spines['right'].set_visible(False) - plt.gca().spines['left'].set_visible(False) - plt.gca().spines['bottom'].set_visible(False) + plt.gca().spines["top"].set_visible(False) + plt.gca().spines["right"].set_visible(False) + plt.gca().spines["left"].set_visible(False) + plt.gca().spines["bottom"].set_visible(False) plt.gca().tick_params(left=False, bottom=False) plt.gca().set_xticks([]) plt.gca().set_yticks([]) # Minimalistic labels and title with ample space - plt.xlabel('', fontsize=12) - plt.ylabel('', fontsize=12) + plt.xlabel("", fontsize=12) + plt.ylabel("", fontsize=12) plt.title(method, fontsize=16, pad=20) # Tight layout for better spacing plt.tight_layout() @@ -289,8 +288,8 @@ # Ensure the directory exists # os.makedirs(figures_path, exist_ok=True) # Save the figure as a PDF in the specified directory - pdf_path = figures_path / f'average_cell_area_hexbin_{method}.pdf' - plt.savefig(pdf_path, format='pdf', bbox_inches='tight') + pdf_path = figures_path / f"average_cell_area_hexbin_{method}.pdf" + plt.savefig(pdf_path, format="pdf", bbox_inches="tight") # Optionally, show the plot (if needed) plt.show() plt.close() @@ -303,8 +302,8 @@ # Load your AnnData object (replace with your actual file) adata = segmentations_dict[method] # Assuming spatial coordinates are stored in 'spatial' and cell areas are in 'cell_area' - x = adata.obs['cell_centroid_x'] # Replace with actual x-coordinate key - y = -adata.obs['cell_centroid_y'] # Replace with actual y-coordinate key + x = adata.obs["cell_centroid_x"] # Replace with actual x-coordinate key + y = -adata.obs["cell_centroid_y"] # Replace with actual y-coordinate key # vmin = adata.shape[0] / 3000 # cell_area = np.log(adata.obs['cell_area']) # Replace with actual cell area key # adata = adata[adata.obs.celltype_major == 'CAFs'] @@ -315,19 +314,19 @@ # vmin=np.percentile(cell_area, 50) hb = plt.hexbin(x, y, gridsize=50, cmap="mako", mincnt=1, norm=mcolors.LogNorm(vmin=vmin)) # Add a colorbar with a minimalistic design - cb = plt.colorbar(hb, orientation='vertical') - cb.set_label('# Cells', fontsize=10) + cb = plt.colorbar(hb, orientation="vertical") + cb.set_label("# Cells", fontsize=10) # Minimalistic design: Remove unnecessary spines and ticks - plt.gca().spines['top'].set_visible(False) - plt.gca().spines['right'].set_visible(False) - plt.gca().spines['left'].set_visible(False) - plt.gca().spines['bottom'].set_visible(False) + plt.gca().spines["top"].set_visible(False) + plt.gca().spines["right"].set_visible(False) + plt.gca().spines["left"].set_visible(False) + plt.gca().spines["bottom"].set_visible(False) plt.gca().tick_params(left=False, bottom=False) plt.gca().set_xticks([]) plt.gca().set_yticks([]) # Minimalistic labels and title with ample space - plt.xlabel('', fontsize=12) - plt.ylabel('', fontsize=12) + plt.xlabel("", fontsize=12) + plt.ylabel("", fontsize=12) plt.title(method, fontsize=16, pad=20) # Tight layout for better spacing plt.tight_layout() @@ -336,8 +335,8 @@ # Ensure the directory exists # os.makedirs(figures_path, exist_ok=True) # Save the figure as a PDF in the specified directory - pdf_path = 
figures_path / f'cell_counts_hexbin_{method}.pdf' - plt.savefig(pdf_path, format='pdf', bbox_inches='tight') + pdf_path = figures_path / f"cell_counts_hexbin_{method}.pdf" + plt.savefig(pdf_path, format="pdf", bbox_inches="tight") # Optionally, show the plot (if needed) plt.show() plt.close() @@ -350,30 +349,30 @@ adata = segmentations_dict[method] # Filter for CAFs # Assuming spatial coordinates are stored in 'spatial' and cell areas are in 'cell_area' - x = adata.obs['cell_centroid_x'] # Replace with actual x-coordinate key - y = -adata.obs['cell_centroid_y'] # Replace with actual y-coordinate key + x = adata.obs["cell_centroid_x"] # Replace with actual x-coordinate key + y = -adata.obs["cell_centroid_y"] # Replace with actual y-coordinate key # Create a figure # plt.figure(figsize=(8, 6)) # Plot the KDE for just counts (density of points) using 'mako' colormap sns.kdeplot(x=x, y=y, fill=True, thresh=0, levels=30, cmap="mako") # Remove spines and make the plot minimalistic - plt.gca().spines['top'].set_visible(False) - plt.gca().spines['right'].set_visible(False) - plt.gca().spines['left'].set_visible(False) - plt.gca().spines['bottom'].set_visible(False) + plt.gca().spines["top"].set_visible(False) + plt.gca().spines["right"].set_visible(False) + plt.gca().spines["left"].set_visible(False) + plt.gca().spines["bottom"].set_visible(False) plt.gca().tick_params(left=False, bottom=False) plt.gca().set_xticks([]) plt.gca().set_yticks([]) # Minimalistic labels and title with ample space - plt.xlabel('', fontsize=12) - plt.ylabel('', fontsize=12) - plt.title(f'Density Count KDE Plot ({method})', fontsize=16, pad=20) + plt.xlabel("", fontsize=12) + plt.ylabel("", fontsize=12) + plt.title(f"Density Count KDE Plot ({method})", fontsize=16, pad=20) # Tight layout for better spacing plt.tight_layout() # Ensure the directory exists os.makedirs(figures_path, exist_ok=True) # Save the figure as a PDF in the specified directory - pdf_path = figures_path / f'density_kde_{method}.pdf' - plt.savefig(pdf_path, format='pdf', bbox_inches='tight') + pdf_path = figures_path / f"density_kde_{method}.pdf" + plt.savefig(pdf_path, format="pdf", bbox_inches="tight") # Close the current figure to prevent it from being displayed - plt.close() \ No newline at end of file + plt.close() diff --git a/src/segger/prediction/predict_parquet.py b/src/segger/prediction/predict_parquet.py index 941943b..ede8469 100644 --- a/src/segger/prediction/predict_parquet.py +++ b/src/segger/prediction/predict_parquet.py @@ -56,6 +56,7 @@ from cupyx.scipy.sparse import coo_matrix as cp_coo_matrix from cupyx.scipy.sparse.csgraph import connected_components as cp_cc import random + # Setup Dask cluster with 3 workers @@ -104,11 +105,11 @@ def subset_sparse_matrix(sparse_matrix, row_idx, col_idx): Returns: cupyx.scipy.sparse.spmatrix: A new sparse matrix that is a subset of the input matrix. """ - + # Convert indices to CuPy arrays if not already row_idx = cp.asarray(row_idx) col_idx = cp.asarray(col_idx) - + # Ensure sparse matrix is in COO format for easy indexing (you can use CSR/CSC if more optimal) sparse_matrix = sparse_matrix.tocoo() @@ -130,8 +131,6 @@ def subset_sparse_matrix(sparse_matrix, row_idx, col_idx): return coo_matrix((new_data, (row_map, col_map)), shape=(len(row_idx), len(col_idx))) - - def load_model(checkpoint_path: str) -> LitSegger: """ Load a LitSegger model from a checkpoint. 
@@ -179,13 +178,13 @@ def sort_order(c): def get_similarity_scores( - model: torch.nn.Module, + model: torch.nn.Module, batch: Batch, from_type: str, to_type: str, receptive_field: dict, - knn_method: str = 'cuda', - gpu_id: int = 0 # Added argument for GPU ID + knn_method: str = "cuda", + gpu_id: int = 0, # Added argument for GPU ID ) -> coo_matrix: """ Compute similarity scores between embeddings for 'from_type' and 'to_type' nodes @@ -207,47 +206,43 @@ def get_similarity_scores( # Set the specified GPU device for CuPy operations with cp.cuda.Device(gpu_id): # Move the batch to the specified GPU - batch = batch.to(f'cuda:{gpu_id}') + batch = batch.to(f"cuda:{gpu_id}") # Step 1: Get embeddings from the model (on GPU) shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] - + # Compute edge indices using knn method (still on GPU) edge_index = get_edge_index( batch[to_type].pos[:, :2], # 'tx' positions batch[from_type].pos[:, :2], # 'bd' positions - k=receptive_field[f'k_{to_type}'], - dist=receptive_field[f'dist_{to_type}'], - method=knn_method + k=receptive_field[f"k_{to_type}"], + dist=receptive_field[f"dist_{to_type}"], + method=knn_method, ) - + # Convert to dense adjacency matrix (on GPU) - edge_index = coo_to_dense_adj( - edge_index.T, - num_nodes=shape[0], - num_nbrs=receptive_field[f'k_{to_type}'] - ) - + edge_index = coo_to_dense_adj(edge_index.T, num_nodes=shape[0], num_nbrs=receptive_field[f"k_{to_type}"]) + with torch.no_grad(): embeddings = model(batch.x_dict, batch.edge_index_dict) - + def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: m = torch.nn.ZeroPad2d((0, 0, 0, 1)) # pad bottom with zeros similarity = torch.bmm( - m(embeddings[to_type])[edge_index], # 'to' x 'from' neighbors x embed - embeddings[from_type].unsqueeze(-1) # 'to' x embed x 1 - ) # -> 'to' x 'from' neighbors x 1 + m(embeddings[to_type])[edge_index], # 'to' x 'from' neighbors x embed + embeddings[from_type].unsqueeze(-1), # 'to' x embed x 1 + ) # -> 'to' x 'from' neighbors x 1 del embeddings # Sigmoid to get most similar 'to_type' neighbor similarity[similarity == 0] = -torch.inf # ensure zero stays zero similarity = F.sigmoid(similarity) # Neighbor-filtered similarity scores # shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] - indices = torch.argwhere(edge_index != -1).T + indices = torch.argwhere(edge_index != -1).T indices[1] = edge_index[edge_index != -1] - rows = cp.fromDlpack(to_dlpack(indices[0,:].to('cuda'))) - columns = cp.fromDlpack(to_dlpack(indices[1,:].to('cuda'))) + rows = cp.fromDlpack(to_dlpack(indices[0, :].to("cuda"))) + columns = cp.fromDlpack(to_dlpack(indices[1, :].to("cuda"))) # print(rows) del indices values = similarity[edge_index != -1].flatten() @@ -257,7 +252,7 @@ def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: # Call the sparse multiply function sparse_similarity = sparse_multiply(embeddings, edge_index, shape) - + return sparse_similarity @@ -267,10 +262,10 @@ def predict_batch( score_cut: float, receptive_field: Dict[str, float], use_cc: bool = True, - knn_method: str = 'cuda', + knn_method: str = "cuda", edge_index_save_path: Union[str, Path] = None, output_ddf_save_path: Union[str, Path] = None, - gpu_id: int = 0 # Added argument for GPU ID + gpu_id: int = 0, # Added argument for GPU ID ): """ Predict cell assignments for a batch of transcript data using a segmentation model. 
@@ -298,8 +293,8 @@ def _get_id(): print(gpu_id) with cp.cuda.Device(gpu_id): # Move the batch to the specified GPU - batch = batch.to(f'cuda:{gpu_id}') - lit_segger.model = lit_segger.model.to(f'cuda:{gpu_id}') + batch = batch.to(f"cuda:{gpu_id}") + lit_segger.model = lit_segger.model.to(f"cuda:{gpu_id}") # Extract transcript IDs and initialize a dictionary for assignments transcript_id = batch["tx"].id.cpu().numpy().astype("str") @@ -307,7 +302,9 @@ def _get_id(): if len(batch["bd"].pos) >= 10: # Step 1: Compute similarity scores between 'tx' (transcripts) and 'bd' (boundaries) - scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd", receptive_field, knn_method=knn_method, gpu_id=gpu_id) + scores = get_similarity_scores( + lit_segger.model, batch, "tx", "bd", receptive_field, knn_method=knn_method, gpu_id=gpu_id + ) torch.cuda.empty_cache() # Convert sparse matrix to dense format (on GPU) @@ -332,7 +329,9 @@ def _get_id(): # Step 3: Handle unassigned transcripts with connected components (if use_cc=True) if use_cc: - scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx", receptive_field, knn_method=knn_method, gpu_id=gpu_id) + scores_tx = get_similarity_scores( + lit_segger.model, batch, "tx", "tx", receptive_field, knn_method=knn_method, gpu_id=gpu_id + ) # Stay on GPU and use CuPy sparse matrices no_id_scores = cupyx.scipy.sparse.coo_matrix( @@ -347,11 +346,13 @@ def _get_id(): no_id_scores.eliminate_zeros() # Remove zero entries to keep the matrix sparse # Find unassigned transcripts (those with no segger_cell_id) - no_id = cp.where(cp.asarray(assignments['segger_cell_id'] == None))[0] # Using CuPy to handle None values - + no_id = cp.where(cp.asarray(assignments["segger_cell_id"] == None))[ + 0 + ] # Using CuPy to handle None values + if len(no_id) > 0: # Only compute if there are unassigned transcripts # Apply score cut-off to unassigned transcripts - no_id_scores = subset_sparse_matrix(no_id_scores, no_id, no_id) + no_id_scores = subset_sparse_matrix(no_id_scores, no_id, no_id) no_id_scores.data[no_id_scores.data < score_cut] = 0 # Apply threshold no_id_scores.eliminate_zeros() # Clean up zeros @@ -364,9 +365,13 @@ def _get_id(): target_nodes = unassigned_ids[non_zero_cols.get()] # # Save edge_index using CuDF and Dask-CuDF for GPU acceleration - edge_index_ddf = delayed(dd.from_pandas)(pd.DataFrame({'source': source_nodes, 'target': target_nodes}), npartitions=1) + edge_index_ddf = delayed(dd.from_pandas)( + pd.DataFrame({"source": source_nodes, "target": target_nodes}), npartitions=1 + ) # Use delayed for asynchronous disk writing of edge_index in Dask DataFrame - delayed_write_edge_index = delayed(edge_index_ddf.to_parquet)(edge_index_save_path, append=True, ignore_divisions=True) + delayed_write_edge_index = delayed(edge_index_ddf.to_parquet)( + edge_index_save_path, append=True, ignore_divisions=True + ) delayed_write_edge_index.persist() # Schedule writing assignments = { @@ -404,8 +409,8 @@ def segment( receptive_field: dict = {"k_bd": 4, "dist_bd": 10, "k_tx": 5, "dist_tx": 3}, knn_method: str = "cuda", verbose: bool = False, - gpu_ids: list = ['0'], - **anndata_kwargs + gpu_ids: list = ["0"], + **anndata_kwargs, ) -> None: """ Perform segmentation using the model, save transcripts, AnnData, and cell masks as needed, @@ -467,7 +472,7 @@ def segment( # # Initialize Dask DataFrame for assignments # output_ddf = None - + # @dask.delayed # def process_batch(batch, gpu_id): # # Assume you're using CuPy, and you need to use a specific GPU @@ -482,7 
+487,7 @@ def segment( # output_ddf_save_path=output_ddf_save_path, # gpu_id=gpu_id # ) - + # delayed_tasks = [process_batch(batch, gpu_ids[i % len(gpu_ids)]) for i, batch in enumerate(dm.train)] # # pqdm(delayed_tasks, n_jobs=len(gpu_ids), argument_type='delayed', progress_bar=True) # # dask.compute(*delayed_tasks) @@ -501,7 +506,7 @@ def segment( if verbose: print(f"Processing {loader_name} data...") - for batch in tqdm(loader, desc=f'Processing {loader_name} batches'): + for batch in tqdm(loader, desc=f"Processing {loader_name} batches"): gpu_id = random.choice(gpu_ids) # Call predict_batch for each batch predict_batch( @@ -513,7 +518,7 @@ def segment( knn_method=knn_method, edge_index_save_path=edge_index_save_path, output_ddf_save_path=output_ddf_save_path, - gpu_id=gpu_id + gpu_id=gpu_id, ) if verbose: diff --git a/submit_job.py b/submit_job.py index 0b99c63..0e24350 100644 --- a/submit_job.py +++ b/submit_job.py @@ -2,70 +2,140 @@ import subprocess # Load the YAML configuration file -with open('config.yaml', 'r') as file: +with open("config.yaml", "r") as file: config = yaml.safe_load(file) + # Define the pipeline functions def run_data_processing(): - subprocess.run([ - "bsub", "-J", "job_data_processing", "-o", config['preprocessing']['output_log'], - "-n", str(config['preprocessing']['workers']), - "-R", f"rusage[mem={config['preprocessing']['memory']}]", - "-q", "long", - "singularity", "exec", "--bind", - f"{config['paths']['local_repo_dir']}:{config['paths']['container_dir']}", - "--pwd", config['paths']['container_dir'], - config['paths']['singularity_image'], "python3", "src/segger/cli/create_dataset_fast.py", - "--base_dir", config['preprocessing']['base_dir'], - "--data_dir", config['preprocessing']['data_dir'], - "--sample_type", config['preprocessing']['sample_type'], - "--tile_width", str(config['preprocessing']['tile_width']), - "--tile_height", str(config['preprocessing']['tile_height']), - "--n_workers", str(config['preprocessing']['workers']) - ]) + subprocess.run( + [ + "bsub", + "-J", + "job_data_processing", + "-o", + config["preprocessing"]["output_log"], + "-n", + str(config["preprocessing"]["workers"]), + "-R", + f"rusage[mem={config['preprocessing']['memory']}]", + "-q", + "long", + "singularity", + "exec", + "--bind", + f"{config['paths']['local_repo_dir']}:{config['paths']['container_dir']}", + "--pwd", + config["paths"]["container_dir"], + config["paths"]["singularity_image"], + "python3", + "src/segger/cli/create_dataset_fast.py", + "--base_dir", + config["preprocessing"]["base_dir"], + "--data_dir", + config["preprocessing"]["data_dir"], + "--sample_type", + config["preprocessing"]["sample_type"], + "--tile_width", + str(config["preprocessing"]["tile_width"]), + "--tile_height", + str(config["preprocessing"]["tile_height"]), + "--n_workers", + str(config["preprocessing"]["workers"]), + ] + ) + def run_training(): - subprocess.run([ - "bsub", "-J", "job_training", "-w", "done(job_data_processing)", - "-o", config['training']['output_log'], - "-n", str(config['training']['workers']), - "-R", f"rusage[mem={config['training']['memory']}]", - "-R", "tensorcore", - "-gpu", f"num={config['training']['gpus']}:j_exclusive=no:gmem={config['training']['gpu_memory']}", - "-q", "gpu", - "singularity", "exec", "--nv", "--bind", - f"{config['paths']['local_repo_dir']}:{config['paths']['container_dir']}", - "--pwd", config['paths']['container_dir'], - config['paths']['singularity_image'], "python3", "src/segger/cli/train_model.py", - "--dataset_dir", 
config['training']['dataset_dir'], - "--models_dir", config['training']['models_dir'], - "--sample_tag", config['training']['sample_tag'], - "--num_workers", str(config['training']['workers']), - "--devices", str(config['training']['gpus']) - ]) + subprocess.run( + [ + "bsub", + "-J", + "job_training", + "-w", + "done(job_data_processing)", + "-o", + config["training"]["output_log"], + "-n", + str(config["training"]["workers"]), + "-R", + f"rusage[mem={config['training']['memory']}]", + "-R", + "tensorcore", + "-gpu", + f"num={config['training']['gpus']}:j_exclusive=no:gmem={config['training']['gpu_memory']}", + "-q", + "gpu", + "singularity", + "exec", + "--nv", + "--bind", + f"{config['paths']['local_repo_dir']}:{config['paths']['container_dir']}", + "--pwd", + config["paths"]["container_dir"], + config["paths"]["singularity_image"], + "python3", + "src/segger/cli/train_model.py", + "--dataset_dir", + config["training"]["dataset_dir"], + "--models_dir", + config["training"]["models_dir"], + "--sample_tag", + config["training"]["sample_tag"], + "--num_workers", + str(config["training"]["workers"]), + "--devices", + str(config["training"]["gpus"]), + ] + ) + def run_prediction(): - subprocess.run([ - "bsub", "-J", "job_prediction", "-w", "done(job_training)", - "-o", config['prediction']['output_log'], - "-n", str(config['prediction']['workers']), - "-R", f"rusage[mem={config['prediction']['memory']}]", - "-R", "tensorcore", - "-gpu", f"num=1:j_exclusive=no:gmem={config['prediction']['gpu_memory']}", - "-q", "gpu", - "singularity", "exec", "--nv", "--bind", - f"{config['paths']['local_repo_dir']}:{config['paths']['container_dir']}", - "--pwd", config['paths']['container_dir'], - config['paths']['singularity_image'], "python3", "src/segger/cli/predict.py", - "--segger_data_dir", config['prediction']['segger_data_dir'], - "--benchmarks_dir", config['prediction']['benchmarks_dir'], - "--transcripts_file", config['prediction']['transcripts_file'], - "--knn_method", config['prediction']['knn_method'], - "--num_workers", str(config['prediction']['workers']) - ]) + subprocess.run( + [ + "bsub", + "-J", + "job_prediction", + "-w", + "done(job_training)", + "-o", + config["prediction"]["output_log"], + "-n", + str(config["prediction"]["workers"]), + "-R", + f"rusage[mem={config['prediction']['memory']}]", + "-R", + "tensorcore", + "-gpu", + f"num=1:j_exclusive=no:gmem={config['prediction']['gpu_memory']}", + "-q", + "gpu", + "singularity", + "exec", + "--nv", + "--bind", + f"{config['paths']['local_repo_dir']}:{config['paths']['container_dir']}", + "--pwd", + config["paths"]["container_dir"], + config["paths"]["singularity_image"], + "python3", + "src/segger/cli/predict.py", + "--segger_data_dir", + config["prediction"]["segger_data_dir"], + "--benchmarks_dir", + config["prediction"]["benchmarks_dir"], + "--transcripts_file", + config["prediction"]["transcripts_file"], + "--knn_method", + config["prediction"]["knn_method"], + "--num_workers", + str(config["prediction"]["workers"]), + ] + ) + # Main script logic -pipelines = config.get('pipelines', []) +pipelines = config.get("pipelines", []) for pipeline in pipelines: if pipeline == 1: run_data_processing() From 1d3ca2114b4fba648daaaef390a73059d6e819d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Wed, 16 Oct 2024 14:21:08 +0200 Subject: [PATCH 105/156] end-to-end CLI, using singularity is optional --- config.yaml | 8 +++-- submit_job.py | 96 +++++++++++++++++++++++++++++++++------------------ 2 files changed, 67 
insertions(+), 37 deletions(-) diff --git a/config.yaml b/config.yaml index ef5806c..b04d509 100644 --- a/config.yaml +++ b/config.yaml @@ -1,7 +1,9 @@ +use_singularity: false + paths: local_repo_dir: "/omics/groups/OE0540/internal_temp/users/danielu/segger_dev" container_dir: "/workspace/segger_dev" - singularity_image: "segger_dev_latest.sif" + singularity_image: "segger_dev_latest.sif" # this is ignored if use_singularity is false pipelines: - 1 # Run data processing @@ -23,7 +25,7 @@ training: dataset_dir: "data_segger_test" models_dir: "model_dir" sample_tag: "first_training" - workers: 16 + workers: 4 memory: "16G" gpus: 8 gpu_memory: "8G" @@ -34,6 +36,6 @@ prediction: benchmarks_dir: "benchmark_dir" transcripts_file: "data_xenium/transcripts.parquet" knn_method: "cuda" - workers: 16 + workers: 4 memory: "16G" gpu_memory: "8G" diff --git a/submit_job.py b/submit_job.py index 0b99c63..44a33ad 100644 --- a/submit_job.py +++ b/submit_job.py @@ -5,66 +5,94 @@ with open('config.yaml', 'r') as file: config = yaml.safe_load(file) -# Define the pipeline functions -def run_data_processing(): - subprocess.run([ - "bsub", "-J", "job_data_processing", "-o", config['preprocessing']['output_log'], - "-n", str(config['preprocessing']['workers']), - "-R", f"rusage[mem={config['preprocessing']['memory']}]", - "-q", "long", +# Helper function to wrap command with Singularity +def wrap_command_with_singularity(command): + return [ "singularity", "exec", "--bind", f"{config['paths']['local_repo_dir']}:{config['paths']['container_dir']}", "--pwd", config['paths']['container_dir'], - config['paths']['singularity_image'], "python3", "src/segger/cli/create_dataset_fast.py", + config['paths']['singularity_image'] + ] + command + +# Define the pipeline functions + +# Run the data processing pipeline +def run_data_processing(): + python_command = [ + "python3", "src/segger/cli/create_dataset_fast.py", "--base_dir", config['preprocessing']['base_dir'], "--data_dir", config['preprocessing']['data_dir'], "--sample_type", config['preprocessing']['sample_type'], "--tile_width", str(config['preprocessing']['tile_width']), "--tile_height", str(config['preprocessing']['tile_height']), "--n_workers", str(config['preprocessing']['workers']) - ]) + ] + + if config['use_singularity']: + python_command = wrap_command_with_singularity(python_command) + + bsub_command = [ + "bsub", "-J", "job_data_processing", "-o", config['preprocessing']['output_log'], + "-n", str(config['preprocessing']['workers']), + "-R", f"rusage[mem={config['preprocessing']['memory']}]", + "-q", "long" + ] + python_command + + subprocess.run(bsub_command) +# Run the training pipeline def run_training(): - subprocess.run([ + python_command = [ + "python3", "src/segger/cli/train_model.py", + "--dataset_dir", config['training']['dataset_dir'], + "--models_dir", config['training']['models_dir'], + "--sample_tag", config['training']['sample_tag'], + "--num_workers", str(config['training']['workers']), + "--devices", str(config['training']['gpus']) + ] + + if config['use_singularity']: + python_command = wrap_command_with_singularity(python_command) + + bsub_command = [ "bsub", "-J", "job_training", "-w", "done(job_data_processing)", "-o", config['training']['output_log'], "-n", str(config['training']['workers']), "-R", f"rusage[mem={config['training']['memory']}]", "-R", "tensorcore", "-gpu", f"num={config['training']['gpus']}:j_exclusive=no:gmem={config['training']['gpu_memory']}", - "-q", "gpu", - "singularity", "exec", "--nv", "--bind", - 
f"{config['paths']['local_repo_dir']}:{config['paths']['container_dir']}", - "--pwd", config['paths']['container_dir'], - config['paths']['singularity_image'], "python3", "src/segger/cli/train_model.py", - "--dataset_dir", config['training']['dataset_dir'], - "--models_dir", config['training']['models_dir'], - "--sample_tag", config['training']['sample_tag'], - "--num_workers", str(config['training']['workers']), - "--devices", str(config['training']['gpus']) - ]) + "-q", "gpu" + ] + python_command + subprocess.run(bsub_command) + +# Run the prediction pipeline def run_prediction(): - subprocess.run([ + python_command = [ + "python3", "src/segger/cli/predict.py", + "--segger_data_dir", config['prediction']['segger_data_dir'], + "--benchmarks_dir", config['prediction']['benchmarks_dir'], + "--transcripts_file", config['prediction']['transcripts_file'], + "--knn_method", config['prediction']['knn_method'], + "--num_workers", str(config['prediction']['workers']) + ] + + if config['use_singularity']: + python_command = wrap_command_with_singularity(python_command) + + bsub_command = [ "bsub", "-J", "job_prediction", "-w", "done(job_training)", "-o", config['prediction']['output_log'], "-n", str(config['prediction']['workers']), "-R", f"rusage[mem={config['prediction']['memory']}]", "-R", "tensorcore", "-gpu", f"num=1:j_exclusive=no:gmem={config['prediction']['gpu_memory']}", - "-q", "gpu", - "singularity", "exec", "--nv", "--bind", - f"{config['paths']['local_repo_dir']}:{config['paths']['container_dir']}", - "--pwd", config['paths']['container_dir'], - config['paths']['singularity_image'], "python3", "src/segger/cli/predict.py", - "--segger_data_dir", config['prediction']['segger_data_dir'], - "--benchmarks_dir", config['prediction']['benchmarks_dir'], - "--transcripts_file", config['prediction']['transcripts_file'], - "--knn_method", config['prediction']['knn_method'], - "--num_workers", str(config['prediction']['workers']) - ]) + "-q", "gpu" + ] + python_command + + subprocess.run(bsub_command) -# Main script logic +# Run the selected pipelines pipelines = config.get('pipelines', []) for pipeline in pipelines: if pipeline == 1: From ac62f5d2c09b93aba26716bee3b6b808d4411d4f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 16 Oct 2024 12:24:01 +0000 Subject: [PATCH 106/156] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- config.yaml | 2 +- submit_job.py | 145 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 101 insertions(+), 46 deletions(-) diff --git a/config.yaml b/config.yaml index a12e4fa..0f22eb0 100644 --- a/config.yaml +++ b/config.yaml @@ -3,7 +3,7 @@ use_singularity: false paths: local_repo_dir: "/omics/groups/OE0540/internal_temp/users/danielu/segger_dev" container_dir: "/workspace/segger_dev" - singularity_image: "segger_dev_latest.sif" # this is ignored if use_singularity is false + singularity_image: "segger_dev_latest.sif" # this is ignored if use_singularity is false pipelines: - 1 # Run data processing diff --git a/submit_job.py b/submit_job.py index c92cdc2..eafa094 100644 --- a/submit_job.py +++ b/submit_job.py @@ -5,95 +5,150 @@ with open("config.yaml", "r") as file: config = yaml.safe_load(file) + # Helper function to wrap command with Singularity def wrap_command_with_singularity(command): return [ - "singularity", "exec", "--bind", + "singularity", + "exec", + "--bind", 
f"{config['paths']['local_repo_dir']}:{config['paths']['container_dir']}", - "--pwd", config['paths']['container_dir'], - config['paths']['singularity_image'] + "--pwd", + config["paths"]["container_dir"], + config["paths"]["singularity_image"], ] + command + # Define the pipeline functions + # Run the data processing pipeline def run_data_processing(): python_command = [ - "python3", "src/segger/cli/create_dataset_fast.py", - "--base_dir", config['preprocessing']['base_dir'], - "--data_dir", config['preprocessing']['data_dir'], - "--sample_type", config['preprocessing']['sample_type'], - "--tile_width", str(config['preprocessing']['tile_width']), - "--tile_height", str(config['preprocessing']['tile_height']), - "--n_workers", str(config['preprocessing']['workers']) + "python3", + "src/segger/cli/create_dataset_fast.py", + "--base_dir", + config["preprocessing"]["base_dir"], + "--data_dir", + config["preprocessing"]["data_dir"], + "--sample_type", + config["preprocessing"]["sample_type"], + "--tile_width", + str(config["preprocessing"]["tile_width"]), + "--tile_height", + str(config["preprocessing"]["tile_height"]), + "--n_workers", + str(config["preprocessing"]["workers"]), ] - if config['use_singularity']: + if config["use_singularity"]: python_command = wrap_command_with_singularity(python_command) - + bsub_command = [ - "bsub", "-J", "job_data_processing", "-o", config['preprocessing']['output_log'], - "-n", str(config['preprocessing']['workers']), - "-R", f"rusage[mem={config['preprocessing']['memory']}]", - "-q", "long" + "bsub", + "-J", + "job_data_processing", + "-o", + config["preprocessing"]["output_log"], + "-n", + str(config["preprocessing"]["workers"]), + "-R", + f"rusage[mem={config['preprocessing']['memory']}]", + "-q", + "long", ] + python_command subprocess.run(bsub_command) + # Run the training pipeline def run_training(): python_command = [ - "python3", "src/segger/cli/train_model.py", - "--dataset_dir", config['training']['dataset_dir'], - "--models_dir", config['training']['models_dir'], - "--sample_tag", config['training']['sample_tag'], - "--num_workers", str(config['training']['workers']), - "--devices", str(config['training']['gpus']) + "python3", + "src/segger/cli/train_model.py", + "--dataset_dir", + config["training"]["dataset_dir"], + "--models_dir", + config["training"]["models_dir"], + "--sample_tag", + config["training"]["sample_tag"], + "--num_workers", + str(config["training"]["workers"]), + "--devices", + str(config["training"]["gpus"]), ] - if config['use_singularity']: + if config["use_singularity"]: python_command = wrap_command_with_singularity(python_command) bsub_command = [ - "bsub", "-J", "job_training", "-w", "done(job_data_processing)", - "-o", config['training']['output_log'], - "-n", str(config['training']['workers']), - "-R", f"rusage[mem={config['training']['memory']}]", - "-R", "tensorcore", - "-gpu", f"num={config['training']['gpus']}:j_exclusive=no:gmem={config['training']['gpu_memory']}", - "-q", "gpu" + "bsub", + "-J", + "job_training", + "-w", + "done(job_data_processing)", + "-o", + config["training"]["output_log"], + "-n", + str(config["training"]["workers"]), + "-R", + f"rusage[mem={config['training']['memory']}]", + "-R", + "tensorcore", + "-gpu", + f"num={config['training']['gpus']}:j_exclusive=no:gmem={config['training']['gpu_memory']}", + "-q", + "gpu", ] + python_command subprocess.run(bsub_command) + # Run the prediction pipeline def run_prediction(): python_command = [ - "python3", "src/segger/cli/predict.py", - 
"--segger_data_dir", config['prediction']['segger_data_dir'], - "--benchmarks_dir", config['prediction']['benchmarks_dir'], - "--transcripts_file", config['prediction']['transcripts_file'], - "--knn_method", config['prediction']['knn_method'], - "--num_workers", str(config['prediction']['workers']) + "python3", + "src/segger/cli/predict.py", + "--segger_data_dir", + config["prediction"]["segger_data_dir"], + "--benchmarks_dir", + config["prediction"]["benchmarks_dir"], + "--transcripts_file", + config["prediction"]["transcripts_file"], + "--knn_method", + config["prediction"]["knn_method"], + "--num_workers", + str(config["prediction"]["workers"]), ] - if config['use_singularity']: + if config["use_singularity"]: python_command = wrap_command_with_singularity(python_command) bsub_command = [ - "bsub", "-J", "job_prediction", "-w", "done(job_training)", - "-o", config['prediction']['output_log'], - "-n", str(config['prediction']['workers']), - "-R", f"rusage[mem={config['prediction']['memory']}]", - "-R", "tensorcore", - "-gpu", f"num=1:j_exclusive=no:gmem={config['prediction']['gpu_memory']}", - "-q", "gpu" + "bsub", + "-J", + "job_prediction", + "-w", + "done(job_training)", + "-o", + config["prediction"]["output_log"], + "-n", + str(config["prediction"]["workers"]), + "-R", + f"rusage[mem={config['prediction']['memory']}]", + "-R", + "tensorcore", + "-gpu", + f"num=1:j_exclusive=no:gmem={config['prediction']['gpu_memory']}", + "-q", + "gpu", ] + python_command subprocess.run(bsub_command) + # Run the selected pipelines -pipelines = config.get('pipelines', []) +pipelines = config.get("pipelines", []) for pipeline in pipelines: if pipeline == 1: run_data_processing() From ce086309680a0093c1e216efe9c6e28054c49e88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Wed, 16 Oct 2024 14:52:37 +0200 Subject: [PATCH 107/156] Use nvidia/cuda in singularity --- submit_job.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/submit_job.py b/submit_job.py index c92cdc2..4e813d5 100644 --- a/submit_job.py +++ b/submit_job.py @@ -6,15 +6,21 @@ config = yaml.safe_load(file) # Helper function to wrap command with Singularity -def wrap_command_with_singularity(command): - return [ +def wrap_command_with_singularity(command, use_gpu=False): + singularity_command = [ "singularity", "exec", "--bind", f"{config['paths']['local_repo_dir']}:{config['paths']['container_dir']}", - "--pwd", config['paths']['container_dir'], - config['paths']['singularity_image'] - ] + command + "--pwd", config['paths']['container_dir'] + ] + + if use_gpu: + singularity_command.append("--nv") + + singularity_command.append(config['paths']['singularity_image']) + + return singularity_command + command -# Define the pipeline functions +# Define the pipeline functions: # Run the data processing pipeline def run_data_processing(): @@ -29,7 +35,7 @@ def run_data_processing(): ] if config['use_singularity']: - python_command = wrap_command_with_singularity(python_command) + python_command = wrap_command_with_singularity(python_command, use_gpu=False) bsub_command = [ "bsub", "-J", "job_data_processing", "-o", config['preprocessing']['output_log'], @@ -52,7 +58,7 @@ def run_training(): ] if config['use_singularity']: - python_command = wrap_command_with_singularity(python_command) + python_command = wrap_command_with_singularity(python_command, use_gpu=True) bsub_command = [ "bsub", "-J", "job_training", "-w", "done(job_data_processing)", @@ -78,7 +84,7 @@ def run_prediction(): ] 
if config['use_singularity']: - python_command = wrap_command_with_singularity(python_command) + python_command = wrap_command_with_singularity(python_command, use_gpu=True) bsub_command = [ "bsub", "-J", "job_prediction", "-w", "done(job_training)", From 829afe62f4baf2ed9c809750f0011fca7a8de06c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 16 Oct 2024 12:56:01 +0000 Subject: [PATCH 108/156] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- submit_job.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/submit_job.py b/submit_job.py index 66dde93..01589f9 100644 --- a/submit_job.py +++ b/submit_job.py @@ -9,20 +9,25 @@ # Helper function to wrap command with Singularity def wrap_command_with_singularity(command, use_gpu=False): singularity_command = [ - "singularity", "exec", "--bind", + "singularity", + "exec", + "--bind", f"{config['paths']['local_repo_dir']}:{config['paths']['container_dir']}", - "--pwd", config['paths']['container_dir'] + "--pwd", + config["paths"]["container_dir"], ] - + if use_gpu: singularity_command.append("--nv") - - singularity_command.append(config['paths']['singularity_image']) - + + singularity_command.append(config["paths"]["singularity_image"]) + return singularity_command + command + # Define the pipeline functions + # Run the data processing pipeline def run_data_processing(): python_command = [ @@ -42,9 +47,9 @@ def run_data_processing(): str(config["preprocessing"]["workers"]), ] - if config['use_singularity']: + if config["use_singularity"]: python_command = wrap_command_with_singularity(python_command, use_gpu=False) - + bsub_command = [ "bsub", "-J", @@ -79,7 +84,7 @@ def run_training(): str(config["training"]["gpus"]), ] - if config['use_singularity']: + if config["use_singularity"]: python_command = wrap_command_with_singularity(python_command, use_gpu=True) bsub_command = [ @@ -122,7 +127,7 @@ def run_prediction(): str(config["prediction"]["workers"]), ] - if config['use_singularity']: + if config["use_singularity"]: python_command = wrap_command_with_singularity(python_command, use_gpu=True) bsub_command = [ From 286da4ad22a959dd25196b1a2b211bed9e65138a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Wed, 16 Oct 2024 16:46:37 +0200 Subject: [PATCH 109/156] Fix in PyG dataset --- src/segger/data/parquet/pyg_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segger/data/parquet/pyg_dataset.py b/src/segger/data/parquet/pyg_dataset.py index 4626ac1..a54803a 100644 --- a/src/segger/data/parquet/pyg_dataset.py +++ b/src/segger/data/parquet/pyg_dataset.py @@ -70,7 +70,7 @@ def get(self, idx: int) -> Data: data["tx"].x = data["tx"].x.unsqueeze(1) assert data["tx"].x.dim() == 2 # this is an issue in PyG's RandomLinkSplit, dimensions are not consistent if there is only one edge in the graph - if hasattr(data, "edge_label_index"): + if hasattr(data["tx", "belongs", "bd"], "edge_label_index"): if data["tx", "belongs", "bd"].edge_label_index.dim() == 1: data["tx", "belongs", "bd"].edge_label_index = data["tx", "belongs", "bd"].edge_label_index.unsqueeze(1) data["tx", "belongs", "bd"].edge_label = data["tx", "belongs", "bd"].edge_label.unsqueeze(0) From 7e8899d0b331ac5d3e82eb923c2ea231f327c7c9 Mon Sep 17 00:00:00 2001 From: quail768 <101423077+quail768@users.noreply.github.com> Date: Wed, 16 Oct 2024 17:49:25 +0200 Subject: [PATCH 110/156] 
Update utils.py to fix issue with find_markers() Current find_markers() command triggers multi-dimensional indexing by numpy which was depreciated in numpy 1.23. Fix flattens expr_frac to avoid this --- src/segger/validation/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segger/validation/utils.py b/src/segger/validation/utils.py index 5a8c28a..1adbc15 100644 --- a/src/segger/validation/utils.py +++ b/src/segger/validation/utils.py @@ -56,7 +56,7 @@ def find_markers( cutoff_low = np.percentile(mean_expression, neg_percentile) pos_indices = np.where(mean_expression >= cutoff_high)[0] neg_indices = np.where(mean_expression <= cutoff_low)[0] - expr_frac = np.asarray((subset.X[:, pos_indices] > 0).mean(axis=0))[0] + expr_frac = np.asarray((subset.X[:, pos_indices] > 0).mean(axis=0)).flatten() valid_pos_indices = pos_indices[expr_frac >= (percentage / 100)] positive_markers = genes[valid_pos_indices] negative_markers = genes[neg_indices] From 5306ad3cf4b5113ad9b1585f6e9f85fc9ca09e50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Wed, 16 Oct 2024 23:28:10 +0200 Subject: [PATCH 111/156] More advanced job submissions --- config.yaml | 36 +++++----- submit_job.py | 190 ++++++++++++++++++++++++++------------------------ 2 files changed, 117 insertions(+), 109 deletions(-) diff --git a/config.yaml b/config.yaml index 0f22eb0..d34ff33 100644 --- a/config.yaml +++ b/config.yaml @@ -1,9 +1,6 @@ +use_lsf: true use_singularity: false - -paths: - local_repo_dir: "/omics/groups/OE0540/internal_temp/users/danielu/segger_dev" - container_dir: "/workspace/segger_dev" - singularity_image: "segger_dev_latest.sif" # this is ignored if use_singularity is false +use_debugpy: false pipelines: - 1 # Run data processing @@ -13,29 +10,36 @@ pipelines: preprocessing: output_log: "preprocess_output.log" base_dir: "data_xenium" - data_dir: "data_segger_test" + data_dir: "data_segger" sample_type: "xenium" tile_width: 120 tile_height: 120 - workers: 16 - memory: "16G" + workers: 12 + memory: "16G" # this is ignored if use_lsf is false training: output_log: "train_output.log" - dataset_dir: "data_segger_test" + dataset_dir: "data_segger" models_dir: "model_dir" sample_tag: "first_training" - workers: 4 - memory: "16G" + workers: 12 + memory: "16G" # this is ignored if use_lsf is false gpus: 8 - gpu_memory: "8G" + gpu_memory: "8G" # this is ignored if use_lsf is false prediction: output_log: "predict_output.log" - segger_data_dir: "data_segger_test" + segger_data_dir: "data_segger" + models_dir: "model_dir" benchmarks_dir: "benchmark_dir" transcripts_file: "data_xenium/transcripts.parquet" knn_method: "cuda" - workers: 4 - memory: "16G" - gpu_memory: "8G" + workers: 12 + memory: "16G" # this is ignored if use_lsf is false + gpu_memory: "8G" # this is ignored if use_lsf is false + +# path_mappings are ignored if use_singularity is false +path_mappings: + local_repo_dir: "/omics/groups/OE0540/internal_temp/users/danielu/segger_dev" + container_dir: "/workspace/segger_dev" + singularity_image: "segger_dev_latest.sif" diff --git a/submit_job.py b/submit_job.py index 01589f9..44633b7 100644 --- a/submit_job.py +++ b/submit_job.py @@ -1,37 +1,38 @@ import yaml import subprocess +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--config", default="config.yaml", help="Path to the configuration YAML file") +args = parser.parse_args() # Load the YAML configuration file -with open("config.yaml", "r") as file: +with open(args.config, "r") as file: config = 
yaml.safe_load(file) - -# Helper function to wrap command with Singularity -def wrap_command_with_singularity(command, use_gpu=False): - singularity_command = [ - "singularity", - "exec", - "--bind", - f"{config['paths']['local_repo_dir']}:{config['paths']['container_dir']}", - "--pwd", - config["paths"]["container_dir"], - ] - - if use_gpu: - singularity_command.append("--nv") - - singularity_command.append(config["paths"]["singularity_image"]) - - return singularity_command + command - +# Function to get Singularity command if enabled +def get_singularity_command(use_gpu=False): + if config.get('use_singularity', False): + singularity_command = [ + "singularity", "exec", "--bind", + f"{config['path_mappings']['local_repo_dir']}:{config['path_mappings']['container_dir']}", + "--pwd", config['path_mappings']['container_dir'] + ] + if use_gpu: + singularity_command.append("--nv") + singularity_command.append(config['path_mappings']['singularity_image']) + return singularity_command + return [] # Return an empty list if Singularity is not enabled + +# Function to get Python command +def get_python_command(): + return ["python3", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client"] if config.get('use_debugpy', False) else ["python3"] # Define the pipeline functions - # Run the data processing pipeline def run_data_processing(): - python_command = [ - "python3", + command = get_singularity_command(use_gpu=False) + get_python_command() + [ "src/segger/cli/create_dataset_fast.py", "--base_dir", config["preprocessing"]["base_dir"], @@ -47,30 +48,30 @@ def run_data_processing(): str(config["preprocessing"]["workers"]), ] - if config["use_singularity"]: - python_command = wrap_command_with_singularity(python_command, use_gpu=False) - - bsub_command = [ - "bsub", - "-J", - "job_data_processing", - "-o", - config["preprocessing"]["output_log"], - "-n", - str(config["preprocessing"]["workers"]), - "-R", - f"rusage[mem={config['preprocessing']['memory']}]", - "-q", - "long", - ] + python_command - - subprocess.run(bsub_command) + if config.get('use_lsf', False): + command = [ + "bsub", + "-J", + "job_data_processing", + "-o", + config["preprocessing"]["output_log"], + "-n", + str(config["preprocessing"]["workers"]), + "-R", + f"rusage[mem={config['preprocessing']['memory']}]", + "-q", + "medium", + ] + command + + try: + subprocess.run(command, check=True) + except subprocess.CalledProcessError as e: + print(f"Error running data processing pipeline: {e}") # Run the training pipeline def run_training(): - python_command = [ - "python3", + command = get_singularity_command(use_gpu=True) + get_python_command() + [ "src/segger/cli/train_model.py", "--dataset_dir", config["training"]["dataset_dir"], @@ -84,39 +85,41 @@ def run_training(): str(config["training"]["gpus"]), ] - if config["use_singularity"]: - python_command = wrap_command_with_singularity(python_command, use_gpu=True) - - bsub_command = [ - "bsub", - "-J", - "job_training", - "-w", - "done(job_data_processing)", - "-o", - config["training"]["output_log"], - "-n", - str(config["training"]["workers"]), - "-R", - f"rusage[mem={config['training']['memory']}]", - "-R", - "tensorcore", - "-gpu", - f"num={config['training']['gpus']}:j_exclusive=no:gmem={config['training']['gpu_memory']}", - "-q", - "gpu", - ] + python_command - - subprocess.run(bsub_command) + if config.get('use_lsf', False): + command = [ + "bsub", + "-J", + "job_training", + "-w", + "done(job_data_processing)", + "-o", + config["training"]["output_log"], + "-n", + 
str(config["training"]["workers"]), + "-R", + f"rusage[mem={config['training']['memory']}]", + "-R", + "tensorcore", + "-gpu", + f"num={config['training']['gpus']}:j_exclusive=no:gmem={config['training']['gpu_memory']}", + "-q", + "gpu", + ] + command + + try: + subprocess.run(command, check=True) + except subprocess.CalledProcessError as e: + print(f"Error running training pipeline: {e}") # Run the prediction pipeline def run_prediction(): - python_command = [ - "python3", + command = get_singularity_command(use_gpu=True) + get_python_command() + [ "src/segger/cli/predict.py", "--segger_data_dir", config["prediction"]["segger_data_dir"], + "--models_dir", + config["prediction"]["models_dir"], "--benchmarks_dir", config["prediction"]["benchmarks_dir"], "--transcripts_file", @@ -127,30 +130,31 @@ def run_prediction(): str(config["prediction"]["workers"]), ] - if config["use_singularity"]: - python_command = wrap_command_with_singularity(python_command, use_gpu=True) - - bsub_command = [ - "bsub", - "-J", - "job_prediction", - "-w", - "done(job_training)", - "-o", - config["prediction"]["output_log"], - "-n", - str(config["prediction"]["workers"]), - "-R", - f"rusage[mem={config['prediction']['memory']}]", - "-R", - "tensorcore", - "-gpu", - f"num=1:j_exclusive=no:gmem={config['prediction']['gpu_memory']}", - "-q", - "gpu", - ] + python_command - - subprocess.run(bsub_command) + if config.get('use_lsf', False): + command = [ + "bsub", + "-J", + "job_prediction", + "-w", + "done(job_training)", + "-o", + config["prediction"]["output_log"], + "-n", + str(config["prediction"]["workers"]), + "-R", + f"rusage[mem={config['prediction']['memory']}]", + "-R", + "tensorcore", + "-gpu", + f"num=1:j_exclusive=no:gmem={config['prediction']['gpu_memory']}", + "-q", + "gpu", + ] + command + + try: + subprocess.run(command, check=True) + except subprocess.CalledProcessError as e: + print(f"Error running prediction pipeline: {e}") # Run the selected pipelines From 4dfcb9bb1fb0954b3ad681a2c51ecea72a3f3397 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Thu, 17 Oct 2024 10:38:53 +0200 Subject: [PATCH 112/156] Had to adjust a few variable names to make predict_parquet work --- src/segger/prediction/predict.py | 2 +- src/segger/prediction/predict_parquet.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/segger/prediction/predict.py b/src/segger/prediction/predict.py index a2ab520..31eac9d 100644 --- a/src/segger/prediction/predict.py +++ b/src/segger/prediction/predict.py @@ -369,7 +369,7 @@ def predict( torch.cuda.empty_cache() # Concatenate all assignments into a single Dask DataFrame - final_assignments = dd.concat(all_assignments_dd, ignore_index=True) + final_assignments = dd.concat(all_assignments, ignore_index=True) # Sort the Dask DataFrame by 'transcript_id' before setting it as an index final_assignments = final_assignments.sort_values(by="transcript_id") diff --git a/src/segger/prediction/predict_parquet.py b/src/segger/prediction/predict_parquet.py index ede8469..196646a 100644 --- a/src/segger/prediction/predict_parquet.py +++ b/src/segger/prediction/predict_parquet.py @@ -46,8 +46,8 @@ import json from datetime import datetime import dask_geopandas as dgpd # Assuming dask-geopandas is installed -import cudf -import dask_cudf +# import cudf +# import dask_cudf import cupy as cp import cupyx import warnings @@ -128,7 +128,7 @@ def subset_sparse_matrix(sparse_matrix, row_idx, col_idx): col_mapped = cp.searchsorted(col_idx, col_filtered) # Return the 
new subset sparse matrix - return coo_matrix((new_data, (row_map, col_map)), shape=(len(row_idx), len(col_idx))) + return coo_matrix((data_filtered, (row_mapped, col_mapped)), shape=(len(row_idx), len(col_idx))) def load_model(checkpoint_path: str) -> LitSegger: @@ -403,6 +403,7 @@ def segment( transcript_file: Union[str, Path], score_cut: float = 0.5, use_cc: bool = True, + file_format: str = "", save_transcripts: bool = True, save_anndata: bool = True, save_cell_masks: bool = False, # Placeholder for future implementation From c31a3d6bec8faae19a77e346d9c2777937b2ae1e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 17 Oct 2024 08:39:26 +0000 Subject: [PATCH 113/156] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/segger/prediction/predict_parquet.py | 1 + submit_job.py | 131 +++++++++++++---------- 2 files changed, 78 insertions(+), 54 deletions(-) diff --git a/src/segger/prediction/predict_parquet.py b/src/segger/prediction/predict_parquet.py index 196646a..1d03281 100644 --- a/src/segger/prediction/predict_parquet.py +++ b/src/segger/prediction/predict_parquet.py @@ -46,6 +46,7 @@ import json from datetime import datetime import dask_geopandas as dgpd # Assuming dask-geopandas is installed + # import cudf # import dask_cudf import cupy as cp diff --git a/submit_job.py b/submit_job.py index 44633b7..01f9ced 100644 --- a/submit_job.py +++ b/submit_job.py @@ -10,45 +10,60 @@ with open(args.config, "r") as file: config = yaml.safe_load(file) + # Function to get Singularity command if enabled def get_singularity_command(use_gpu=False): - if config.get('use_singularity', False): + if config.get("use_singularity", False): singularity_command = [ - "singularity", "exec", "--bind", + "singularity", + "exec", + "--bind", f"{config['path_mappings']['local_repo_dir']}:{config['path_mappings']['container_dir']}", - "--pwd", config['path_mappings']['container_dir'] + "--pwd", + config["path_mappings"]["container_dir"], ] if use_gpu: singularity_command.append("--nv") - singularity_command.append(config['path_mappings']['singularity_image']) + singularity_command.append(config["path_mappings"]["singularity_image"]) return singularity_command return [] # Return an empty list if Singularity is not enabled + # Function to get Python command def get_python_command(): - return ["python3", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client"] if config.get('use_debugpy', False) else ["python3"] + return ( + ["python3", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client"] + if config.get("use_debugpy", False) + else ["python3"] + ) + # Define the pipeline functions + # Run the data processing pipeline def run_data_processing(): - command = get_singularity_command(use_gpu=False) + get_python_command() + [ - "src/segger/cli/create_dataset_fast.py", - "--base_dir", - config["preprocessing"]["base_dir"], - "--data_dir", - config["preprocessing"]["data_dir"], - "--sample_type", - config["preprocessing"]["sample_type"], - "--tile_width", - str(config["preprocessing"]["tile_width"]), - "--tile_height", - str(config["preprocessing"]["tile_height"]), - "--n_workers", - str(config["preprocessing"]["workers"]), - ] - - if config.get('use_lsf', False): + command = ( + get_singularity_command(use_gpu=False) + + get_python_command() + + [ + "src/segger/cli/create_dataset_fast.py", + "--base_dir", + config["preprocessing"]["base_dir"], + "--data_dir", + 
config["preprocessing"]["data_dir"], + "--sample_type", + config["preprocessing"]["sample_type"], + "--tile_width", + str(config["preprocessing"]["tile_width"]), + "--tile_height", + str(config["preprocessing"]["tile_height"]), + "--n_workers", + str(config["preprocessing"]["workers"]), + ] + ) + + if config.get("use_lsf", False): command = [ "bsub", "-J", @@ -71,21 +86,25 @@ def run_data_processing(): # Run the training pipeline def run_training(): - command = get_singularity_command(use_gpu=True) + get_python_command() + [ - "src/segger/cli/train_model.py", - "--dataset_dir", - config["training"]["dataset_dir"], - "--models_dir", - config["training"]["models_dir"], - "--sample_tag", - config["training"]["sample_tag"], - "--num_workers", - str(config["training"]["workers"]), - "--devices", - str(config["training"]["gpus"]), - ] - - if config.get('use_lsf', False): + command = ( + get_singularity_command(use_gpu=True) + + get_python_command() + + [ + "src/segger/cli/train_model.py", + "--dataset_dir", + config["training"]["dataset_dir"], + "--models_dir", + config["training"]["models_dir"], + "--sample_tag", + config["training"]["sample_tag"], + "--num_workers", + str(config["training"]["workers"]), + "--devices", + str(config["training"]["gpus"]), + ] + ) + + if config.get("use_lsf", False): command = [ "bsub", "-J", @@ -114,23 +133,27 @@ def run_training(): # Run the prediction pipeline def run_prediction(): - command = get_singularity_command(use_gpu=True) + get_python_command() + [ - "src/segger/cli/predict.py", - "--segger_data_dir", - config["prediction"]["segger_data_dir"], - "--models_dir", - config["prediction"]["models_dir"], - "--benchmarks_dir", - config["prediction"]["benchmarks_dir"], - "--transcripts_file", - config["prediction"]["transcripts_file"], - "--knn_method", - config["prediction"]["knn_method"], - "--num_workers", - str(config["prediction"]["workers"]), - ] - - if config.get('use_lsf', False): + command = ( + get_singularity_command(use_gpu=True) + + get_python_command() + + [ + "src/segger/cli/predict.py", + "--segger_data_dir", + config["prediction"]["segger_data_dir"], + "--models_dir", + config["prediction"]["models_dir"], + "--benchmarks_dir", + config["prediction"]["benchmarks_dir"], + "--transcripts_file", + config["prediction"]["transcripts_file"], + "--knn_method", + config["prediction"]["knn_method"], + "--num_workers", + str(config["prediction"]["workers"]), + ] + ) + + if config.get("use_lsf", False): command = [ "bsub", "-J", From 289a81ab46dbaa144948dada0f2f3070bbffa3e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Fri, 18 Oct 2024 02:22:15 +0200 Subject: [PATCH 114/156] Job submission with pipeline parameters --- config.yaml => scripts/config.yaml | 48 +++++++++-- submit_job.py => scripts/submit_job.py | 110 +++++++++++++++++++++---- 2 files changed, 134 insertions(+), 24 deletions(-) rename config.yaml => scripts/config.yaml (53%) rename submit_job.py => scripts/submit_job.py (56%) diff --git a/config.yaml b/scripts/config.yaml similarity index 53% rename from config.yaml rename to scripts/config.yaml index d34ff33..6684a5f 100644 --- a/config.yaml +++ b/scripts/config.yaml @@ -2,6 +2,8 @@ use_lsf: true use_singularity: false use_debugpy: false +config_file: "config.yaml" + pipelines: - 1 # Run data processing - 2 # Run training @@ -12,9 +14,18 @@ preprocessing: base_dir: "data_xenium" data_dir: "data_segger" sample_type: "xenium" + k_bd: 3 + dist_bd: 15.0 + k_tx: 3 + dist_tx: 5.0 + tile_size: null tile_width: 120 tile_height: 
120 - workers: 12 + neg_sampling_ratio: 5.0 + frac: 1.0 + val_prob: 0.1 + test_prob: 0.2 + n_workers: 12 memory: "16G" # this is ignored if use_lsf is false training: @@ -22,24 +33,45 @@ training: dataset_dir: "data_segger" models_dir: "model_dir" sample_tag: "first_training" - workers: 12 + init_emb: 8 + hidden_channels: 32 + num_tx_tokens: 500 + out_channels: 8 + heads: 2 + num_mid_layers: 2 + batch_size: 4 + num_workers: 12 + accelerator: "cuda" + max_epochs: 200 + devices: 8 + strategy: "auto" + precision: "16-mixed" memory: "16G" # this is ignored if use_lsf is false - gpus: 8 gpu_memory: "8G" # this is ignored if use_lsf is false prediction: - output_log: "predict_output.log" + output_log: "predict_parquet_final_output.log" segger_data_dir: "data_segger" models_dir: "model_dir" benchmarks_dir: "benchmark_dir" transcripts_file: "data_xenium/transcripts.parquet" + batch_size: 1 + num_workers: 12 + model_version: 0 + save_tag: segger_embedding + min_transcripts: 5 + cell_id_col: segger_cell_id + use_cc: false knn_method: "cuda" - workers: 12 + file_format: "anndata" + k_bd: 4 + dist_bd: 15.0 + k_tx: 5 + dist_tx: 5.0 memory: "16G" # this is ignored if use_lsf is false gpu_memory: "8G" # this is ignored if use_lsf is false -# path_mappings are ignored if use_singularity is false path_mappings: local_repo_dir: "/omics/groups/OE0540/internal_temp/users/danielu/segger_dev" - container_dir: "/workspace/segger_dev" - singularity_image: "segger_dev_latest.sif" + container_dir: "/workspace/segger_dev" # this is ignored if use_singularity is false + singularity_image: "segger_dev_latest.sif" # this is ignored if use_singularity is false diff --git a/submit_job.py b/scripts/submit_job.py similarity index 56% rename from submit_job.py rename to scripts/submit_job.py index 44633b7..9ebb9de 100644 --- a/submit_job.py +++ b/scripts/submit_job.py @@ -1,13 +1,17 @@ import yaml import subprocess import argparse +import os parser = argparse.ArgumentParser() parser.add_argument("--config", default="config.yaml", help="Path to the configuration YAML file") args = parser.parse_args() +script_dir = os.path.dirname(os.path.realpath(__file__)) +config_file_path = os.path.join(script_dir, args.config) + # Load the YAML configuration file -with open(args.config, "r") as file: +with open(config_file_path, "r") as file: config = yaml.safe_load(file) # Function to get Singularity command if enabled @@ -28,26 +32,53 @@ def get_singularity_command(use_gpu=False): def get_python_command(): return ["python3", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client"] if config.get('use_debugpy', False) else ["python3"] +# Function to get the correct base directory (local or container) +def get_base_dir(): + if config.get('use_singularity', False): + return config['path_mappings']['container_dir'] + return config['path_mappings']['local_repo_dir'] + +base_dir = get_base_dir() + # Define the pipeline functions # Run the data processing pipeline def run_data_processing(): command = get_singularity_command(use_gpu=False) + get_python_command() + [ - "src/segger/cli/create_dataset_fast.py", + f"{base_dir}/src/segger/cli/create_dataset_fast.py", "--base_dir", config["preprocessing"]["base_dir"], "--data_dir", config["preprocessing"]["data_dir"], "--sample_type", config["preprocessing"]["sample_type"], - "--tile_width", - str(config["preprocessing"]["tile_width"]), - "--tile_height", - str(config["preprocessing"]["tile_height"]), + "--k_bd", + str(config["preprocessing"]["k_bd"]), + "--dist_bd", + 
str(config["preprocessing"]["dist_bd"]), + "--k_tx", + str(config["preprocessing"]["k_tx"]), + "--dist_tx", + str(config["preprocessing"]["dist_tx"]), + "--neg_sampling_ratio", + str(config["preprocessing"]["neg_sampling_ratio"]), + "--frac", + str(config["preprocessing"]["frac"]), + "--val_prob", + str(config["preprocessing"]["val_prob"]), + "--test_prob", + str(config["preprocessing"]["test_prob"]), "--n_workers", - str(config["preprocessing"]["workers"]), + str(config["preprocessing"]["n_workers"]), ] + if config["preprocessing"].get("tile_size") is not None: + command.extend(["--tile_size", str(config["preprocessing"]["tile_size"])]) + if config["preprocessing"].get("tile_width") is not None: + command.extend(["--tile_width", str(config["preprocessing"]["tile_width"])]) + if config["preprocessing"].get("tile_height") is not None: + command.extend(["--tile_height", str(config["preprocessing"]["tile_height"])]) + if config.get('use_lsf', False): command = [ "bsub", @@ -56,7 +87,7 @@ def run_data_processing(): "-o", config["preprocessing"]["output_log"], "-n", - str(config["preprocessing"]["workers"]), + str(config["preprocessing"]["n_workers"]), "-R", f"rusage[mem={config['preprocessing']['memory']}]", "-q", @@ -64,6 +95,7 @@ def run_data_processing(): ] + command try: + print(f"Running command: {command}") subprocess.run(command, check=True) except subprocess.CalledProcessError as e: print(f"Error running data processing pipeline: {e}") @@ -72,17 +104,39 @@ def run_data_processing(): # Run the training pipeline def run_training(): command = get_singularity_command(use_gpu=True) + get_python_command() + [ - "src/segger/cli/train_model.py", + f"{base_dir}/src/segger/cli/train_model.py", "--dataset_dir", config["training"]["dataset_dir"], "--models_dir", config["training"]["models_dir"], "--sample_tag", config["training"]["sample_tag"], + "--init_emb", + str(config["training"]["init_emb"]), + "--hidden_channels", + str(config["training"]["hidden_channels"]), + "--num_tx_tokens", + str(config["training"]["num_tx_tokens"]), + "--out_channels", + str(config["training"]["out_channels"]), + "--heads", + str(config["training"]["heads"]), + "--num_mid_layers", + str(config["training"]["num_mid_layers"]), + "--batch_size", + str(config["training"]["batch_size"]), "--num_workers", - str(config["training"]["workers"]), + str(config["training"]["num_workers"]), + "--accelerator", + config["training"]["accelerator"], + "--max_epochs", + str(config["training"]["max_epochs"]), "--devices", - str(config["training"]["gpus"]), + str(config["training"]["devices"]), + "--strategy", + config["training"]["strategy"], + "--precision", + config["training"]["precision"], ] if config.get('use_lsf', False): @@ -95,7 +149,7 @@ def run_training(): "-o", config["training"]["output_log"], "-n", - str(config["training"]["workers"]), + str(config["training"]["num_workers"]), "-R", f"rusage[mem={config['training']['memory']}]", "-R", @@ -107,6 +161,7 @@ def run_training(): ] + command try: + print(f"Running command: {command}") subprocess.run(command, check=True) except subprocess.CalledProcessError as e: print(f"Error running training pipeline: {e}") @@ -115,7 +170,7 @@ def run_training(): # Run the prediction pipeline def run_prediction(): command = get_singularity_command(use_gpu=True) + get_python_command() + [ - "src/segger/cli/predict.py", + f"{base_dir}/src/segger/cli/predict.py", "--segger_data_dir", config["prediction"]["segger_data_dir"], "--models_dir", @@ -124,10 +179,32 @@ def run_prediction(): 
config["prediction"]["benchmarks_dir"], "--transcripts_file", config["prediction"]["transcripts_file"], + "--batch_size", + str(config["prediction"]["batch_size"]), + "--num_workers", + str(config["prediction"]["num_workers"]), + "--model_version", + str(config["prediction"]["model_version"]), + "--save_tag", + config["prediction"]["save_tag"], + "--min_transcripts", + str(config["prediction"]["min_transcripts"]), + "--cell_id_col", + str(config["prediction"]["cell_id_col"]), + "--use_cc", + str(config["prediction"]["use_cc"]), "--knn_method", config["prediction"]["knn_method"], - "--num_workers", - str(config["prediction"]["workers"]), + "--file_format", + config["prediction"]["file_format"], + "--k_bd", + str(config["preprocessing"]["k_bd"]), + "--dist_bd", + str(config["preprocessing"]["dist_bd"]), + "--k_tx", + str(config["preprocessing"]["k_tx"]), + "--dist_tx", + str(config["preprocessing"]["dist_tx"]), ] if config.get('use_lsf', False): @@ -140,7 +217,7 @@ def run_prediction(): "-o", config["prediction"]["output_log"], "-n", - str(config["prediction"]["workers"]), + str(config["prediction"]["num_workers"]), "-R", f"rusage[mem={config['prediction']['memory']}]", "-R", @@ -152,6 +229,7 @@ def run_prediction(): ] + command try: + print(f"Running command: {command}") subprocess.run(command, check=True) except subprocess.CalledProcessError as e: print(f"Error running prediction pipeline: {e}") From 19447f440441471e85c33d8b5070b81f7a24474b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 18 Oct 2024 00:28:59 +0000 Subject: [PATCH 115/156] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/submit_job.py | 217 +++++++++++++++++++++++------------------- 1 file changed, 117 insertions(+), 100 deletions(-) diff --git a/scripts/submit_job.py b/scripts/submit_job.py index 014358e..a3101ca 100644 --- a/scripts/submit_job.py +++ b/scripts/submit_job.py @@ -35,40 +35,49 @@ def get_singularity_command(use_gpu=False): # Function to get Python command def get_python_command(): - return ["python3", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client"] if config.get('use_debugpy', False) else ["python3"] + return ( + ["python3", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client"] + if config.get("use_debugpy", False) + else ["python3"] + ) + # Define the pipeline functions # Run the data processing pipeline def run_data_processing(): - command = get_singularity_command(use_gpu=False) + get_python_command() + [ - f"{base_dir}/src/segger/cli/create_dataset_fast.py", - "--base_dir", - config["preprocessing"]["base_dir"], - "--data_dir", - config["preprocessing"]["data_dir"], - "--sample_type", - config["preprocessing"]["sample_type"], - "--k_bd", - str(config["preprocessing"]["k_bd"]), - "--dist_bd", - str(config["preprocessing"]["dist_bd"]), - "--k_tx", - str(config["preprocessing"]["k_tx"]), - "--dist_tx", - str(config["preprocessing"]["dist_tx"]), - "--neg_sampling_ratio", - str(config["preprocessing"]["neg_sampling_ratio"]), - "--frac", - str(config["preprocessing"]["frac"]), - "--val_prob", - str(config["preprocessing"]["val_prob"]), - "--test_prob", - str(config["preprocessing"]["test_prob"]), - "--n_workers", - str(config["preprocessing"]["n_workers"]), - ] + command = ( + get_singularity_command(use_gpu=False) + + get_python_command() + + [ + f"{base_dir}/src/segger/cli/create_dataset_fast.py", + "--base_dir", + 
config["preprocessing"]["base_dir"], + "--data_dir", + config["preprocessing"]["data_dir"], + "--sample_type", + config["preprocessing"]["sample_type"], + "--k_bd", + str(config["preprocessing"]["k_bd"]), + "--dist_bd", + str(config["preprocessing"]["dist_bd"]), + "--k_tx", + str(config["preprocessing"]["k_tx"]), + "--dist_tx", + str(config["preprocessing"]["dist_tx"]), + "--neg_sampling_ratio", + str(config["preprocessing"]["neg_sampling_ratio"]), + "--frac", + str(config["preprocessing"]["frac"]), + "--val_prob", + str(config["preprocessing"]["val_prob"]), + "--test_prob", + str(config["preprocessing"]["test_prob"]), + "--n_workers", + str(config["preprocessing"]["n_workers"]), + ] + ) if config["preprocessing"].get("tile_size") is not None: command.extend(["--tile_size", str(config["preprocessing"]["tile_size"])]) @@ -101,41 +110,45 @@ def run_data_processing(): # Run the training pipeline def run_training(): - command = get_singularity_command(use_gpu=True) + get_python_command() + [ - f"{base_dir}/src/segger/cli/train_model.py", - "--dataset_dir", - config["training"]["dataset_dir"], - "--models_dir", - config["training"]["models_dir"], - "--sample_tag", - config["training"]["sample_tag"], - "--init_emb", - str(config["training"]["init_emb"]), - "--hidden_channels", - str(config["training"]["hidden_channels"]), - "--num_tx_tokens", - str(config["training"]["num_tx_tokens"]), - "--out_channels", - str(config["training"]["out_channels"]), - "--heads", - str(config["training"]["heads"]), - "--num_mid_layers", - str(config["training"]["num_mid_layers"]), - "--batch_size", - str(config["training"]["batch_size"]), - "--num_workers", - str(config["training"]["num_workers"]), - "--accelerator", - config["training"]["accelerator"], - "--max_epochs", - str(config["training"]["max_epochs"]), - "--devices", - str(config["training"]["devices"]), - "--strategy", - config["training"]["strategy"], - "--precision", - config["training"]["precision"], - ] + command = ( + get_singularity_command(use_gpu=True) + + get_python_command() + + [ + f"{base_dir}/src/segger/cli/train_model.py", + "--dataset_dir", + config["training"]["dataset_dir"], + "--models_dir", + config["training"]["models_dir"], + "--sample_tag", + config["training"]["sample_tag"], + "--init_emb", + str(config["training"]["init_emb"]), + "--hidden_channels", + str(config["training"]["hidden_channels"]), + "--num_tx_tokens", + str(config["training"]["num_tx_tokens"]), + "--out_channels", + str(config["training"]["out_channels"]), + "--heads", + str(config["training"]["heads"]), + "--num_mid_layers", + str(config["training"]["num_mid_layers"]), + "--batch_size", + str(config["training"]["batch_size"]), + "--num_workers", + str(config["training"]["num_workers"]), + "--accelerator", + config["training"]["accelerator"], + "--max_epochs", + str(config["training"]["max_epochs"]), + "--devices", + str(config["training"]["devices"]), + "--strategy", + config["training"]["strategy"], + "--precision", + config["training"]["precision"], + ] + ) if config.get("use_lsf", False): command = [ @@ -167,43 +180,47 @@ def run_training(): # Run the prediction pipeline def run_prediction(): - command = get_singularity_command(use_gpu=True) + get_python_command() + [ - f"{base_dir}/src/segger/cli/predict.py", - "--segger_data_dir", - config["prediction"]["segger_data_dir"], - "--models_dir", - config["prediction"]["models_dir"], - "--benchmarks_dir", - config["prediction"]["benchmarks_dir"], - "--transcripts_file", - config["prediction"]["transcripts_file"], - 
"--batch_size", - str(config["prediction"]["batch_size"]), - "--num_workers", - str(config["prediction"]["num_workers"]), - "--model_version", - str(config["prediction"]["model_version"]), - "--save_tag", - config["prediction"]["save_tag"], - "--min_transcripts", - str(config["prediction"]["min_transcripts"]), - "--cell_id_col", - str(config["prediction"]["cell_id_col"]), - "--use_cc", - str(config["prediction"]["use_cc"]), - "--knn_method", - config["prediction"]["knn_method"], - "--file_format", - config["prediction"]["file_format"], - "--k_bd", - str(config["preprocessing"]["k_bd"]), - "--dist_bd", - str(config["preprocessing"]["dist_bd"]), - "--k_tx", - str(config["preprocessing"]["k_tx"]), - "--dist_tx", - str(config["preprocessing"]["dist_tx"]), - ] + command = ( + get_singularity_command(use_gpu=True) + + get_python_command() + + [ + f"{base_dir}/src/segger/cli/predict.py", + "--segger_data_dir", + config["prediction"]["segger_data_dir"], + "--models_dir", + config["prediction"]["models_dir"], + "--benchmarks_dir", + config["prediction"]["benchmarks_dir"], + "--transcripts_file", + config["prediction"]["transcripts_file"], + "--batch_size", + str(config["prediction"]["batch_size"]), + "--num_workers", + str(config["prediction"]["num_workers"]), + "--model_version", + str(config["prediction"]["model_version"]), + "--save_tag", + config["prediction"]["save_tag"], + "--min_transcripts", + str(config["prediction"]["min_transcripts"]), + "--cell_id_col", + str(config["prediction"]["cell_id_col"]), + "--use_cc", + str(config["prediction"]["use_cc"]), + "--knn_method", + config["prediction"]["knn_method"], + "--file_format", + config["prediction"]["file_format"], + "--k_bd", + str(config["preprocessing"]["k_bd"]), + "--dist_bd", + str(config["preprocessing"]["dist_bd"]), + "--k_tx", + str(config["preprocessing"]["k_tx"]), + "--dist_tx", + str(config["preprocessing"]["dist_tx"]), + ] + ) if config.get("use_lsf", False): command = [ From e66a5de492c86d3e1ad93ae6ce155c7955b5ab7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Fri, 18 Oct 2024 03:09:59 +0200 Subject: [PATCH 116/156] Adjustments in CLI / job submission --- scripts/config.yaml | 7 +- scripts/submit_job.py | 225 +++++++++----------- src/segger/cli/configs/predict/default.yaml | 6 +- src/segger/cli/predict.py | 8 +- 4 files changed, 115 insertions(+), 131 deletions(-) diff --git a/scripts/config.yaml b/scripts/config.yaml index 6684a5f..4165861 100644 --- a/scripts/config.yaml +++ b/scripts/config.yaml @@ -71,7 +71,6 @@ prediction: memory: "16G" # this is ignored if use_lsf is false gpu_memory: "8G" # this is ignored if use_lsf is false -path_mappings: - local_repo_dir: "/omics/groups/OE0540/internal_temp/users/danielu/segger_dev" - container_dir: "/workspace/segger_dev" # this is ignored if use_singularity is false - singularity_image: "segger_dev_latest.sif" # this is ignored if use_singularity is false +local_repo_dir: "/omics/groups/OE0540/internal_temp/users/danielu/segger_dev" +container_dir: "/workspace/segger_dev" # this is ignored if use_singularity is false +singularity_image: "segger_dev_latest.sif" # this is ignored if use_singularity is false diff --git a/scripts/submit_job.py b/scripts/submit_job.py index a3101ca..e1c51b9 100644 --- a/scripts/submit_job.py +++ b/scripts/submit_job.py @@ -14,6 +14,8 @@ with open(config_file_path, "r") as file: config = yaml.safe_load(file) +# Get the base directory +repo_dir = config["container_dir"] if config.get("use_singularity", False) else 
config["local_repo_dir"] # Function to get Singularity command if enabled def get_singularity_command(use_gpu=False): @@ -22,62 +24,53 @@ def get_singularity_command(use_gpu=False): "singularity", "exec", "--bind", - f"{config['path_mappings']['local_repo_dir']}:{config['path_mappings']['container_dir']}", + f"{config['local_repo_dir']}:{config['container_dir']}", "--pwd", - config["path_mappings"]["container_dir"], + config["container_dir"], ] if use_gpu: singularity_command.append("--nv") - singularity_command.append(config["path_mappings"]["singularity_image"]) + singularity_command.append(config["singularity_image"]) return singularity_command return [] # Return an empty list if Singularity is not enabled # Function to get Python command def get_python_command(): - return ( - ["python3", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client"] - if config.get("use_debugpy", False) - else ["python3"] - ) - + return ["python3", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client"] if config.get('use_debugpy', False) else ["python3"] # Define the pipeline functions # Run the data processing pipeline def run_data_processing(): - command = ( - get_singularity_command(use_gpu=False) - + get_python_command() - + [ - f"{base_dir}/src/segger/cli/create_dataset_fast.py", - "--base_dir", - config["preprocessing"]["base_dir"], - "--data_dir", - config["preprocessing"]["data_dir"], - "--sample_type", - config["preprocessing"]["sample_type"], - "--k_bd", - str(config["preprocessing"]["k_bd"]), - "--dist_bd", - str(config["preprocessing"]["dist_bd"]), - "--k_tx", - str(config["preprocessing"]["k_tx"]), - "--dist_tx", - str(config["preprocessing"]["dist_tx"]), - "--neg_sampling_ratio", - str(config["preprocessing"]["neg_sampling_ratio"]), - "--frac", - str(config["preprocessing"]["frac"]), - "--val_prob", - str(config["preprocessing"]["val_prob"]), - "--test_prob", - str(config["preprocessing"]["test_prob"]), - "--n_workers", - str(config["preprocessing"]["n_workers"]), - ] - ) + command = get_singularity_command(use_gpu=False) + get_python_command() + [ + f"{repo_dir}/src/segger/cli/create_dataset_fast.py", + "--base_dir", + config["preprocessing"]["base_dir"], + "--data_dir", + config["preprocessing"]["data_dir"], + "--sample_type", + config["preprocessing"]["sample_type"], + "--k_bd", + str(config["preprocessing"]["k_bd"]), + "--dist_bd", + str(config["preprocessing"]["dist_bd"]), + "--k_tx", + str(config["preprocessing"]["k_tx"]), + "--dist_tx", + str(config["preprocessing"]["dist_tx"]), + "--neg_sampling_ratio", + str(config["preprocessing"]["neg_sampling_ratio"]), + "--frac", + str(config["preprocessing"]["frac"]), + "--val_prob", + str(config["preprocessing"]["val_prob"]), + "--test_prob", + str(config["preprocessing"]["test_prob"]), + "--n_workers", + str(config["preprocessing"]["n_workers"]), + ] if config["preprocessing"].get("tile_size") is not None: command.extend(["--tile_size", str(config["preprocessing"]["tile_size"])]) @@ -110,45 +103,41 @@ def run_data_processing(): # Run the training pipeline def run_training(): - command = ( - get_singularity_command(use_gpu=True) - + get_python_command() - + [ - f"{base_dir}/src/segger/cli/train_model.py", - "--dataset_dir", - config["training"]["dataset_dir"], - "--models_dir", - config["training"]["models_dir"], - "--sample_tag", - config["training"]["sample_tag"], - "--init_emb", - str(config["training"]["init_emb"]), - "--hidden_channels", - str(config["training"]["hidden_channels"]), - "--num_tx_tokens", - 
str(config["training"]["num_tx_tokens"]), - "--out_channels", - str(config["training"]["out_channels"]), - "--heads", - str(config["training"]["heads"]), - "--num_mid_layers", - str(config["training"]["num_mid_layers"]), - "--batch_size", - str(config["training"]["batch_size"]), - "--num_workers", - str(config["training"]["num_workers"]), - "--accelerator", - config["training"]["accelerator"], - "--max_epochs", - str(config["training"]["max_epochs"]), - "--devices", - str(config["training"]["devices"]), - "--strategy", - config["training"]["strategy"], - "--precision", - config["training"]["precision"], - ] - ) + command = get_singularity_command(use_gpu=True) + get_python_command() + [ + f"{repo_dir}/src/segger/cli/train_model.py", + "--dataset_dir", + config["training"]["dataset_dir"], + "--models_dir", + config["training"]["models_dir"], + "--sample_tag", + config["training"]["sample_tag"], + "--init_emb", + str(config["training"]["init_emb"]), + "--hidden_channels", + str(config["training"]["hidden_channels"]), + "--num_tx_tokens", + str(config["training"]["num_tx_tokens"]), + "--out_channels", + str(config["training"]["out_channels"]), + "--heads", + str(config["training"]["heads"]), + "--num_mid_layers", + str(config["training"]["num_mid_layers"]), + "--batch_size", + str(config["training"]["batch_size"]), + "--num_workers", + str(config["training"]["num_workers"]), + "--accelerator", + config["training"]["accelerator"], + "--max_epochs", + str(config["training"]["max_epochs"]), + "--devices", + str(config["training"]["devices"]), + "--strategy", + config["training"]["strategy"], + "--precision", + config["training"]["precision"], + ] if config.get("use_lsf", False): command = [ @@ -180,47 +169,43 @@ def run_training(): # Run the prediction pipeline def run_prediction(): - command = ( - get_singularity_command(use_gpu=True) - + get_python_command() - + [ - f"{base_dir}/src/segger/cli/predict.py", - "--segger_data_dir", - config["prediction"]["segger_data_dir"], - "--models_dir", - config["prediction"]["models_dir"], - "--benchmarks_dir", - config["prediction"]["benchmarks_dir"], - "--transcripts_file", - config["prediction"]["transcripts_file"], - "--batch_size", - str(config["prediction"]["batch_size"]), - "--num_workers", - str(config["prediction"]["num_workers"]), - "--model_version", - str(config["prediction"]["model_version"]), - "--save_tag", - config["prediction"]["save_tag"], - "--min_transcripts", - str(config["prediction"]["min_transcripts"]), - "--cell_id_col", - str(config["prediction"]["cell_id_col"]), - "--use_cc", - str(config["prediction"]["use_cc"]), - "--knn_method", - config["prediction"]["knn_method"], - "--file_format", - config["prediction"]["file_format"], - "--k_bd", - str(config["preprocessing"]["k_bd"]), - "--dist_bd", - str(config["preprocessing"]["dist_bd"]), - "--k_tx", - str(config["preprocessing"]["k_tx"]), - "--dist_tx", - str(config["preprocessing"]["dist_tx"]), - ] - ) + command = get_singularity_command(use_gpu=True) + get_python_command() + [ + f"{repo_dir}/src/segger/cli/predict.py", + "--segger_data_dir", + config["prediction"]["segger_data_dir"], + "--models_dir", + config["prediction"]["models_dir"], + "--benchmarks_dir", + config["prediction"]["benchmarks_dir"], + "--transcripts_file", + config["prediction"]["transcripts_file"], + "--batch_size", + str(config["prediction"]["batch_size"]), + "--num_workers", + str(config["prediction"]["num_workers"]), + "--model_version", + str(config["prediction"]["model_version"]), + "--save_tag", + 
config["prediction"]["save_tag"], + "--min_transcripts", + str(config["prediction"]["min_transcripts"]), + "--cell_id_col", + str(config["prediction"]["cell_id_col"]), + "--use_cc", + str(config["prediction"]["use_cc"]), + "--knn_method", + config["prediction"]["knn_method"], + "--file_format", + config["prediction"]["file_format"], + "--k_bd", + str(config["preprocessing"]["k_bd"]), + "--dist_bd", + str(config["preprocessing"]["dist_bd"]), + "--k_tx", + str(config["preprocessing"]["k_tx"]), + "--dist_tx", + str(config["preprocessing"]["dist_tx"]), + ] if config.get("use_lsf", False): command = [ diff --git a/src/segger/cli/configs/predict/default.yaml b/src/segger/cli/configs/predict/default.yaml index 4ff49d0..e9046b6 100644 --- a/src/segger/cli/configs/predict/default.yaml +++ b/src/segger/cli/configs/predict/default.yaml @@ -39,7 +39,7 @@ cell_id_col: default: "segger_cell_id" help: Column name for cell IDs. use_cc: - is_flag: true + type: bool default: false help: Use connected components if specified. knn_method: @@ -55,7 +55,7 @@ k_bd: default: 4 help: K value for boundary computation. dist_bd: - type: int + type: float default: 12 help: Distance for boundary computation. k_tx: @@ -63,6 +63,6 @@ k_tx: default: 5 help: K value for transcript computation. dist_tx: - type: int + type: float default: 5 help: Distance for transcript computation. diff --git a/src/segger/cli/predict.py b/src/segger/cli/predict.py index 6d16bd1..a362fb1 100755 --- a/src/segger/cli/predict.py +++ b/src/segger/cli/predict.py @@ -24,16 +24,16 @@ @click.option("--batch_size", type=int, default=1, help="Batch size for processing.") @click.option("--num_workers", type=int, default=1, help="Number of workers for data loading.") @click.option("--model_version", type=int, default=0, help="Model version to load.") -@click.option("--save_tag", type=str, default="segger_embedding_1001_0.5", help="Tag for saving segmentation results.") +@click.option("--save_tag", type=str, default="segger_embedding_1001", help="Tag for saving segmentation results.") @click.option("--min_transcripts", type=int, default=5, help="Minimum number of transcripts for segmentation.") @click.option("--cell_id_col", type=str, default="segger_cell_id", help="Column name for cell IDs.") -@click.option("--use_cc", is_flag=True, default=False, help="Use connected components if specified.") +@click.option("--use_cc", type=bool, default=False, help="Use connected components if specified.") @click.option("--knn_method", type=str, default="cuda", help="Method for KNN computation.") @click.option("--file_format", type=str, default="anndata", help="File format for output data.") @click.option("--k_bd", type=int, default=4, help="K value for boundary computation.") -@click.option("--dist_bd", type=int, default=12, help="Distance for boundary computation.") +@click.option("--dist_bd", type=float, default=12.0, help="Distance for boundary computation.") @click.option("--k_tx", type=int, default=5, help="K value for transcript computation.") -@click.option("--dist_tx", type=int, default=5, help="Distance for transcript computation.") +@click.option("--dist_tx", type=float, default=5.0, help="Distance for transcript computation.") def run_segmentation(args: Namespace): # Setup logging From bdcb9153074176b1df21280a73be2e87f8741b3a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 18 Oct 2024 01:10:13 +0000 Subject: [PATCH 117/156] [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci --- scripts/submit_job.py | 218 +++++++++++++++++++++++------------------- 1 file changed, 118 insertions(+), 100 deletions(-) diff --git a/scripts/submit_job.py b/scripts/submit_job.py index e1c51b9..9952ae3 100644 --- a/scripts/submit_job.py +++ b/scripts/submit_job.py @@ -17,6 +17,7 @@ # Get the base directory repo_dir = config["container_dir"] if config.get("use_singularity", False) else config["local_repo_dir"] + # Function to get Singularity command if enabled def get_singularity_command(use_gpu=False): if config.get("use_singularity", False): @@ -37,40 +38,49 @@ def get_singularity_command(use_gpu=False): # Function to get Python command def get_python_command(): - return ["python3", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client"] if config.get('use_debugpy', False) else ["python3"] + return ( + ["python3", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client"] + if config.get("use_debugpy", False) + else ["python3"] + ) + # Define the pipeline functions # Run the data processing pipeline def run_data_processing(): - command = get_singularity_command(use_gpu=False) + get_python_command() + [ - f"{repo_dir}/src/segger/cli/create_dataset_fast.py", - "--base_dir", - config["preprocessing"]["base_dir"], - "--data_dir", - config["preprocessing"]["data_dir"], - "--sample_type", - config["preprocessing"]["sample_type"], - "--k_bd", - str(config["preprocessing"]["k_bd"]), - "--dist_bd", - str(config["preprocessing"]["dist_bd"]), - "--k_tx", - str(config["preprocessing"]["k_tx"]), - "--dist_tx", - str(config["preprocessing"]["dist_tx"]), - "--neg_sampling_ratio", - str(config["preprocessing"]["neg_sampling_ratio"]), - "--frac", - str(config["preprocessing"]["frac"]), - "--val_prob", - str(config["preprocessing"]["val_prob"]), - "--test_prob", - str(config["preprocessing"]["test_prob"]), - "--n_workers", - str(config["preprocessing"]["n_workers"]), - ] + command = ( + get_singularity_command(use_gpu=False) + + get_python_command() + + [ + f"{repo_dir}/src/segger/cli/create_dataset_fast.py", + "--base_dir", + config["preprocessing"]["base_dir"], + "--data_dir", + config["preprocessing"]["data_dir"], + "--sample_type", + config["preprocessing"]["sample_type"], + "--k_bd", + str(config["preprocessing"]["k_bd"]), + "--dist_bd", + str(config["preprocessing"]["dist_bd"]), + "--k_tx", + str(config["preprocessing"]["k_tx"]), + "--dist_tx", + str(config["preprocessing"]["dist_tx"]), + "--neg_sampling_ratio", + str(config["preprocessing"]["neg_sampling_ratio"]), + "--frac", + str(config["preprocessing"]["frac"]), + "--val_prob", + str(config["preprocessing"]["val_prob"]), + "--test_prob", + str(config["preprocessing"]["test_prob"]), + "--n_workers", + str(config["preprocessing"]["n_workers"]), + ] + ) if config["preprocessing"].get("tile_size") is not None: command.extend(["--tile_size", str(config["preprocessing"]["tile_size"])]) @@ -103,41 +113,45 @@ def run_data_processing(): # Run the training pipeline def run_training(): - command = get_singularity_command(use_gpu=True) + get_python_command() + [ - f"{repo_dir}/src/segger/cli/train_model.py", - "--dataset_dir", - config["training"]["dataset_dir"], - "--models_dir", - config["training"]["models_dir"], - "--sample_tag", - config["training"]["sample_tag"], - "--init_emb", - str(config["training"]["init_emb"]), - "--hidden_channels", - str(config["training"]["hidden_channels"]), - "--num_tx_tokens", - str(config["training"]["num_tx_tokens"]), - "--out_channels", - 
str(config["training"]["out_channels"]), - "--heads", - str(config["training"]["heads"]), - "--num_mid_layers", - str(config["training"]["num_mid_layers"]), - "--batch_size", - str(config["training"]["batch_size"]), - "--num_workers", - str(config["training"]["num_workers"]), - "--accelerator", - config["training"]["accelerator"], - "--max_epochs", - str(config["training"]["max_epochs"]), - "--devices", - str(config["training"]["devices"]), - "--strategy", - config["training"]["strategy"], - "--precision", - config["training"]["precision"], - ] + command = ( + get_singularity_command(use_gpu=True) + + get_python_command() + + [ + f"{repo_dir}/src/segger/cli/train_model.py", + "--dataset_dir", + config["training"]["dataset_dir"], + "--models_dir", + config["training"]["models_dir"], + "--sample_tag", + config["training"]["sample_tag"], + "--init_emb", + str(config["training"]["init_emb"]), + "--hidden_channels", + str(config["training"]["hidden_channels"]), + "--num_tx_tokens", + str(config["training"]["num_tx_tokens"]), + "--out_channels", + str(config["training"]["out_channels"]), + "--heads", + str(config["training"]["heads"]), + "--num_mid_layers", + str(config["training"]["num_mid_layers"]), + "--batch_size", + str(config["training"]["batch_size"]), + "--num_workers", + str(config["training"]["num_workers"]), + "--accelerator", + config["training"]["accelerator"], + "--max_epochs", + str(config["training"]["max_epochs"]), + "--devices", + str(config["training"]["devices"]), + "--strategy", + config["training"]["strategy"], + "--precision", + config["training"]["precision"], + ] + ) if config.get("use_lsf", False): command = [ @@ -169,43 +183,47 @@ def run_training(): # Run the prediction pipeline def run_prediction(): - command = get_singularity_command(use_gpu=True) + get_python_command() + [ - f"{repo_dir}/src/segger/cli/predict.py", - "--segger_data_dir", - config["prediction"]["segger_data_dir"], - "--models_dir", - config["prediction"]["models_dir"], - "--benchmarks_dir", - config["prediction"]["benchmarks_dir"], - "--transcripts_file", - config["prediction"]["transcripts_file"], - "--batch_size", - str(config["prediction"]["batch_size"]), - "--num_workers", - str(config["prediction"]["num_workers"]), - "--model_version", - str(config["prediction"]["model_version"]), - "--save_tag", - config["prediction"]["save_tag"], - "--min_transcripts", - str(config["prediction"]["min_transcripts"]), - "--cell_id_col", - str(config["prediction"]["cell_id_col"]), - "--use_cc", - str(config["prediction"]["use_cc"]), - "--knn_method", - config["prediction"]["knn_method"], - "--file_format", - config["prediction"]["file_format"], - "--k_bd", - str(config["preprocessing"]["k_bd"]), - "--dist_bd", - str(config["preprocessing"]["dist_bd"]), - "--k_tx", - str(config["preprocessing"]["k_tx"]), - "--dist_tx", - str(config["preprocessing"]["dist_tx"]), - ] + command = ( + get_singularity_command(use_gpu=True) + + get_python_command() + + [ + f"{repo_dir}/src/segger/cli/predict.py", + "--segger_data_dir", + config["prediction"]["segger_data_dir"], + "--models_dir", + config["prediction"]["models_dir"], + "--benchmarks_dir", + config["prediction"]["benchmarks_dir"], + "--transcripts_file", + config["prediction"]["transcripts_file"], + "--batch_size", + str(config["prediction"]["batch_size"]), + "--num_workers", + str(config["prediction"]["num_workers"]), + "--model_version", + str(config["prediction"]["model_version"]), + "--save_tag", + config["prediction"]["save_tag"], + "--min_transcripts", + 
str(config["prediction"]["min_transcripts"]), + "--cell_id_col", + str(config["prediction"]["cell_id_col"]), + "--use_cc", + str(config["prediction"]["use_cc"]), + "--knn_method", + config["prediction"]["knn_method"], + "--file_format", + config["prediction"]["file_format"], + "--k_bd", + str(config["preprocessing"]["k_bd"]), + "--dist_bd", + str(config["preprocessing"]["dist_bd"]), + "--k_tx", + str(config["preprocessing"]["k_tx"]), + "--dist_tx", + str(config["preprocessing"]["dist_tx"]), + ] + ) if config.get("use_lsf", False): command = [ From 60d5296fa9dbfec23c0ed38d804215fa8897b4c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Fri, 18 Oct 2024 12:00:32 +0200 Subject: [PATCH 118/156] Fix sourcedir in docs makefile --- docs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Makefile b/docs/Makefile index 89fe005..cd45ea5 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -5,7 +5,7 @@ SPHINXOPTS = SPHINXBUILD = sphinx-build SPHINXPROJ = segger -SOURCEDIR = . +SOURCEDIR = source BUILDDIR = _build # Put it first so that "make" without argument is like "make help". From b9b706360f28d66b6871b0ae27225d9deef52f20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Fri, 18 Oct 2024 14:22:30 +0200 Subject: [PATCH 119/156] Tutorial for faster dataset creation --- docs/notebooks/segger_tutorial.ipynb | 71 ++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/docs/notebooks/segger_tutorial.ipynb b/docs/notebooks/segger_tutorial.ipynb index 6982d34..4a63aae 100644 --- a/docs/notebooks/segger_tutorial.ipynb +++ b/docs/notebooks/segger_tutorial.ipynb @@ -173,6 +173,77 @@ " print(f'Dataset already exists at {segger_data_dir}')" ] }, + { + "cell_type": "markdown", + "id": "9d2b090b", + "metadata": {}, + "source": [ + "## **1.2 Faster Dataset Creation with Segger**\n", + "\n", + "Segger introduces a faster, more efficient pipeline for processing spatial transcriptomics data. This method accelerates dataset creation, particularly for large datasets, by using **ND-tree-based spatial partitioning** and **parallel processing**. This results in a much faster preparation of the dataset, which is saved in PyTorch Geometric (PyG) format, similar to the previous method.\n", + "\n", + "**Note**: The previous dataset creation method will soon be deprecated in favor of this optimized pipeline.\n", + "\n", + "#### **Requirements for the Faster Pipeline**\n", + "The pipeline requires the following inputs:\n", + "\n", + "- **base_dir**: The directory containing the raw dataset.\n", + "- **data_dir**: The directory where the processed dataset (tiles in PyG format) will be saved.\n", + "\n", + "The core improvements in this method come from the use of **ND-tree partitioning**, which splits the data efficiently into spatial regions, and **parallel processing**, which speeds up the handling of these regions across multiple CPU cores. 
For example, using this pipeline, the Xenium Human Pancreatic Dataset can be processed in just a few minutes when running with 16 workers.\n", + "\n", + "#### **Running the Faster Dataset Creation Pipeline**\n", + "Below is an example of how to create a dataset using the faster Segger pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e933ebf3", + "metadata": {}, + "outputs": [], + "source": [ + "!python3 create_dataset_fast.py --base_dir path/to/raw_data \\\n", + "--data_dir path/to/save/processed_data \\\n", + "--sample_type xenium \\\n", + "--k_bd 3 \\\n", + "--dist_bd 15.0 \\\n", + "--k_tx 3 \\\n", + "--dist_tx 5.0 \\\n", + "--tile_width 200 \\\n", + "--tile_height 200 \\\n", + "--neg_sampling_ratio 5.0 \\\n", + "--frac 1.0 \\\n", + "--val_prob 0.1 \\\n", + "--test_prob 0.2 \\\n", + "--n_workers 12" + ] + }, + { + "cell_type": "markdown", + "id": "6ab27f9a", + "metadata": {}, + "source": [ + "#### **Parameters**\n", + "Here is a complete list of parameters you can use to control the dataset creation process:\n", + "\n", + "- **--base_dir**: Directory containing the raw spatial transcriptomics dataset.\n", + "- **--data_dir**: Directory where the processed Segger dataset (in PyG format) will be saved.\n", + "- **--sample_type**: (Optional) Specifies the type of dataset (e.g., \"xenium\" or \"merscope\"). Defaults to None.\n", + "- **--k_bd**: Number of nearest neighbors for boundary nodes (default: 3).\n", + "- **--dist_bd**: Maximum distance for boundary neighbors (default: 15.0).\n", + "- **--k_tx**: Number of nearest neighbors for transcript nodes (default: 3).\n", + "- **--dist_tx**: Maximum distance for transcript neighbors (default: 5.0).\n", + "- **--tile_size**: Specifies the size of the tile. If provided, it overrides both tile_width and tile_height.\n", + "- **--tile_width**: Width of the tiles in pixels (ignored if tile_size is provided).\n", + "- **--tile_height**: Height of the tiles in pixels (ignored if tile_size is provided).\n", + "- **--neg_sampling_ratio**: Ratio of negative samples (default: 5.0).\n", + "- **--frac**: Fraction of the dataset to process (default: 1.0).\n", + "- **--val_prob**: Proportion of data used for validation split (default: 0.1).\n", + "- **--test_prob**: Proportion of data used for testing split (default: 0.2).\n", + "- **--n_workers**: Number of workers for parallel processing (default: 1)." + ] + }, { "cell_type": "markdown", "id": "9962e4b8-4028-4683-9b75-d674fa6fb01d", From 47e61ddac5df0a28922f87d84e260ec15506cb94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Fri, 18 Oct 2024 16:51:38 +0200 Subject: [PATCH 120/156] Tutorial for faster prediction --- docs/notebooks/segger_tutorial.ipynb | 77 +++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/docs/notebooks/segger_tutorial.ipynb b/docs/notebooks/segger_tutorial.ipynb index 4a63aae..d1daa90 100644 --- a/docs/notebooks/segger_tutorial.ipynb +++ b/docs/notebooks/segger_tutorial.ipynb @@ -178,7 +178,7 @@ "id": "9d2b090b", "metadata": {}, "source": [ - "## **1.2 Faster Dataset Creation with Segger**\n", + "### **1.2 Faster Dataset Creation with Segger**\n", "\n", "Segger introduces a faster, more efficient pipeline for processing spatial transcriptomics data. This method accelerates dataset creation, particularly for large datasets, by using **ND-tree-based spatial partitioning** and **parallel processing**. 
This results in a much faster preparation of the dataset, which is saved in PyTorch Geometric (PyG) format, similar to the previous method.\n", "\n", @@ -488,6 +488,81 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "9807abf3", + "metadata": {}, + "source": [ + "### **3.2 Faster Prediction with Segger**\n", + "We introduce a faster and more efficient pipeline for making predictions using a segger model. This new method accelerates the segmentation process by using CUDA-accelerated **nearest neighbors search** using [CAGRA](https://docs.rapids.ai/api/cuvs/stable/python_api/neighbors_cagra/) and **parallel processing**.\n", + "\n", + "**Note**: The previous prediction method will soon be deprecated in favor of this optimized pipeline.\n", + "\n", + "#### **Requirements for the Faster Prediction Pipeline**\n", + "The pipeline requires the following inputs:\n", + "\n", + "- **segger_data_dir**: The directory containing the processed Segger dataset (in PyG format).\n", + "- **models_dir**: The directory containing the trained Segger model checkpoints.\n", + "- **benchmarks_dir**: The directory where the segmentation results will be saved.\n", + "- **transcripts_file**: Path to the file containing the transcript data for prediction.\n", + "\n", + "#### **Running the Faster Prediction Pipeline**\n", + "Below is an example of how to run the faster Segger prediction pipeline using the command line:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e802c3f", + "metadata": {}, + "outputs": [], + "source": [ + "!python3 predict_fast.py --segger_data_dir path/to/processed_data \\\n", + "--models_dir path/to/models_dir \\\n", + "--benchmarks_dir path/to/save/results \\\n", + "--transcripts_file path/to/transcripts_file \\\n", + "--batch_size 1 \\\n", + "--num_workers 12 \\\n", + "--model_version 0 \\\n", + "--save_tag segger_embedding_1001 \\\n", + "--min_transcripts 5 \\\n", + "--cell_id_col segger_cell_id \\\n", + "--use_cc False \\\n", + "--knn_method cuda \\\n", + "--file_format anndata \\\n", + "--k_bd 4 \\\n", + "--dist_bd 12.0 \\\n", + "--k_tx 5 \\\n", + "--dist_tx 5.0" + ] + }, + { + "cell_type": "markdown", + "id": "0a823035", + "metadata": {}, + "source": [ + "#### **Parameters**\n", + "Here is a detailed explanation of each parameter used in the faster prediction pipeline:\n", + "\n", + "- **--segger_data_dir**: The directory containing the processed Segger dataset, saved as PyTorch Geometric data objects, that will be used for prediction.\n", + "- **--models_dir**: The directory containing the trained Segger model checkpoints. These checkpoints store the learned weights required for making predictions.\n", + "- **--benchmarks_dir**: The directory where the segmentation results will be saved.\n", + "- **--transcripts_file**: Path to the *transcripts.parquet* file.\n", + "- **--batch_size**: Specifies the batch size for processing during prediction. 
Larger batch sizes speed up inference but use more memory (default: 1).\n", + "- **--num_workers**: Number of workers to use for parallel data loading (default: 1).\n", + "- **--model_version**: Version of the trained model to load for predictions, based on the version number from the training logs (default: 0).\n", + "- **--save_tag**: A tag used to name and organize the segmentation results (default: segger_embedding).\n", + "- **--min_transcripts**: The minimum number of transcripts required for segmentation (default: 5).\n", + "- **--cell_id_col**: The name of the column that stores the cell IDs (default: segger_cell_id).\n", + "- **--use_cc**: Enables the use of connected components (CC) for grouping transcripts that are not associated with any nucleus (default: False).\n", + "- **--knn_method**: Method for KNN (K-Nearest Neighbors) computation. Only option is \"cuda\" for this pipeline (default: cuda).\n", + "- **--file_format**: The format for saving the output segmentation data. Only option is \"anndata\" for this pipeline (default: anndata).\n", + "- **--k_bd**: Number of nearest neighbors for boundary nodes during segmentation (default: 4).\n", + "- **--dist_bd**: Maximum distance for boundary nodes during segmentation (default: 12.0).\n", + "- **--k_tx**: Number of nearest neighbors for transcript nodes during segmentation (default: 5).\n", + "- **--dist_tx**: Maximum distance for transcript nodes during segmentation (default: 5.0)." + ] + }, { "cell_type": "markdown", "id": "b0917be9-4e82-4ba5-869d-5a9203721699", From 94fe2cc458fbcd211e5e82e19e68c821bd695cbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Fri, 18 Oct 2024 16:53:18 +0200 Subject: [PATCH 121/156] CLI for faster prediction in separate file --- src/segger/cli/predict_fast.py | 78 ++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 src/segger/cli/predict_fast.py diff --git a/src/segger/cli/predict_fast.py b/src/segger/cli/predict_fast.py new file mode 100644 index 0000000..a2b46b3 --- /dev/null +++ b/src/segger/cli/predict_fast.py @@ -0,0 +1,78 @@ +import click +from segger.training.segger_data_module import SeggerDataModule +from segger.prediction.predict_parquet import segment, load_model +from segger.cli.utils import add_options, CustomFormatter +from pathlib import Path +import logging +from argparse import Namespace +import os + +# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' + +# Path to default YAML configuration file +predict_yml = Path(__file__).parent / "configs" / "predict" / "default.yaml" + +help_msg = "Run the Segger segmentation model." 
+ + +@click.command(name="run_segmentation", help=help_msg) +@add_options(config_path=predict_yml) +@click.option("--segger_data_dir", type=Path, required=True, help="Directory containing the processed Segger dataset.") +@click.option("--models_dir", type=Path, required=True, help="Directory containing the trained models.") +@click.option("--benchmarks_dir", type=Path, required=True, help="Directory to save the segmentation results.") +@click.option("--transcripts_file", type=str, required=True, help="Path to the transcripts file.") +@click.option("--batch_size", type=int, default=1, help="Batch size for processing.") +@click.option("--num_workers", type=int, default=1, help="Number of workers for data loading.") +@click.option("--model_version", type=int, default=0, help="Model version to load.") +@click.option("--save_tag", type=str, default="segger_embedding_1001", help="Tag for saving segmentation results.") +@click.option("--min_transcripts", type=int, default=5, help="Minimum number of transcripts for segmentation.") +@click.option("--cell_id_col", type=str, default="segger_cell_id", help="Column name for cell IDs.") +@click.option("--use_cc", type=bool, default=False, help="Use connected components if specified.") +@click.option("--knn_method", type=str, default="cuda", help="Method for KNN computation.") +@click.option("--file_format", type=str, default="anndata", help="File format for output data.") +@click.option("--k_bd", type=int, default=4, help="K value for boundary computation.") +@click.option("--dist_bd", type=float, default=12.0, help="Distance for boundary computation.") +@click.option("--k_tx", type=int, default=5, help="K value for transcript computation.") +@click.option("--dist_tx", type=float, default=5.0, help="Distance for transcript computation.") +def run_segmentation(args: Namespace): + + # Setup logging + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger(__name__) + + logger.info("Initializing Segger data module...") + # Initialize the Lightning data module + dm = SeggerDataModule( + data_dir=args.segger_data_dir, + batch_size=args.batch_size, + num_workers=args.num_workers, + ) + + dm.setup() + + logger.info("Loading the model...") + # Load in the latest checkpoint + model_path = Path(args.models_dir) / "lightning_logs" / f"version_{args.model_version}" + model = load_model(model_path / "checkpoints") + + logger.info("Running segmentation...") + segment( + model, + dm, + save_dir=args.benchmarks_dir, + seg_tag=args.save_tag, + transcript_file=args.transcripts_file, + file_format=args.file_format, + receptive_field={"k_bd": args.k_bd, "dist_bd": args.dist_bd, "k_tx": args.k_tx, "dist_tx": args.dist_tx}, + min_transcripts=args.min_transcripts, + cell_id_col=args.cell_id_col, + use_cc=args.use_cc, + knn_method=args.knn_method, + verbose=True, + ) + + logger.info("Segmentation completed.") + + +if __name__ == "__main__": + run_segmentation() From 88047067b3ae8edf9f1e148b49643a6492bf480e Mon Sep 17 00:00:00 2001 From: Gleb Rukhovich Date: Fri, 18 Oct 2024 17:07:51 +0200 Subject: [PATCH 122/156] Implemented boundary identification algo --- src/segger/prediction/boundary.py | 374 +++++++++++++++++++++++ src/segger/prediction/predict_parquet.py | 5 + 2 files changed, 379 insertions(+) create mode 100644 src/segger/prediction/boundary.py diff --git a/src/segger/prediction/boundary.py b/src/segger/prediction/boundary.py new file mode 100644 index 0000000..3135b65 --- /dev/null +++ b/src/segger/prediction/boundary.py @@ -0,0 +1,374 @@ +import 
geopandas as gpd +import matplotlib.pyplot as plt +import numpy as np +import rtree.index +from scipy.spatial import Delaunay +from shapely.geometry import MultiPolygon, Polygon +from tqdm import tqdm + + +def vector_angle(v1, v2): + # Calculate the dot product and magnitudes of vectors + dot_product = np.dot(v1, v2) + magnitude_v1 = np.linalg.norm(v1) + magnitude_v2 = np.linalg.norm(v2) + + # Avoid division by zero, clip the cosine values to [-1, 1] for numerical sta lity + cos_angle = np.clip(dot_product / (magnitude_v1 * magnitude_v2), -1.0, 1.0) + + # Return the angle in degrees + return np.degrees(np.arccos(cos_angle)) + + +def triangle_angles_from_points(points, triangles): + angles_list = [] + + for tri in triangles: + # Extract the points based on the triangle's indices + p1, p2, p3 = points[tri] + + # Define vectors for the triangle sides + v1 = p2 - p1 # Vector from p1 to p2 + v2 = p3 - p1 # Vector from p1 to p3 + v3 = p3 - p2 # Vector from p2 to p3 + + # Calculate the angles using the vectors + a = vector_angle(v1, v2) # Angle at vertex p1 + b = vector_angle(-v1, v3) # Angle at vertex p2 + c = vector_angle(-v2, -v3) # Angle at vertex p3 (fixed calculation) + angles_list.append((a, b, c)) + + return np.array(angles_list) + + +def dfs(v, graph, path, colors): + colors[v] = 1 + path.append(v) + for d in graph[v]: + if colors[d] == 0: + dfs(d, graph, path, colors) + + +def plot_points(points, s=3, color='black', zorder=None): + plt.scatter(points[:, 0], points[:, 1], color=color, s=s, zorder=zorder) + + +def plot_edges(edges, d_max, part=1): + if part == 1: + for edge in edges: + coords = edges[edge]['coords'] + if len(edges[edge]['simplices']) < 2: + color = 'magenta' + if edges[edge]['length'] > 2 * d_max: + color = 'red' + # if edges[edge]['simplices'].values()[0] + else: + color = 'cyan' + + plt.plot(coords[:, 0], coords[:, 1], color=color) + + if part == 2: + for edge in edges: + coords = edges[edge]['coords'] + if len(edges[edge]['simplices']) < 2: + color = 'magenta' + if edges[edge]['length'] > 2 * d_max: + color = 'red' + # if edges[edge]['simplices'].values()[0] + else: + color = 'cyan' + # print(coords) + plt.plot(coords[:, 0], coords[:, 1], color=color) + + +class BoundaryIdentification: + + def __init__(self, data): # 2d d = Delaunay(t[['x_location', 'y_location']].values) + self.graph = None + self.edges = {} + self.d = Delaunay(data) + self.d_max = self.calculate_d_max(self.d.points) + + # self.angles = triangle_angles_from_points(d.points, d.simplices) + self.generate_edges() + + def generate_edges(self): + d = self.d + + edges = {} + angles = triangle_angles_from_points(d.points, d.simplices) + for index, simplex in enumerate(d.simplices): + for p in range(3): + edge = tuple(sorted((simplex[p], simplex[(p + 1) % 3]))) + if edge not in edges: + edges[edge] = { + "simplices": {}, # simplex -> angle + # "angles": [] + } + + edges[edge]["simplices"][index] = angles[index][(p + 2) % 3] + + edges_coordinates = d.points[np.array(list(edges.keys()))] + edges_length = ( + (edges_coordinates[:, 1, 0] - edges_coordinates[:, 0, 0]) ** 2 + + (edges_coordinates[:, 1, 1] - edges_coordinates[:, 0, 1]) ** 2 + ) ** 0.5 + + for edge, coords, length in zip(edges, edges_coordinates, edges_length): + edges[edge]['coords'] = coords + edges[edge]['length'] = length + + self.edges = edges + + def calculate_part_1(self, plot=True): + edges = self.edges + d = self.d + d_max = self.d_max + + boundary_edges = [edge for edge in edges if len(edges[edge]['simplices']) < 2] + + if plot: + 
plt.figure(figsize=(10, 10)) + + iters = 0 + flag = True + while flag: + flag = False + next_boundary_edges = [] + + iters += 1 + if plot: + plt.subplot(330 + iters) + self.plot(title=f"iteration: {iters}") + + for current_edge in boundary_edges: + if current_edge not in edges: # yeah, it changes + continue + + if edges[current_edge]['length'] > 2 * d_max: + if len(edges[current_edge]['simplices'].keys()) == 0: + del edges[current_edge] + continue + + simplex_id = list(edges[current_edge]['simplices'].keys())[0] + simplex = d.simplices[simplex_id] + + # delete edge and the simplex start + for edge in self.get_edges_from_simplex(simplex): + if edge != current_edge: + edges[edge]['simplices'].pop(simplex_id) + next_boundary_edges.append(edge) + + del edges[current_edge] + flag = True + # delete edge and the simple end + + else: + next_boundary_edges.append(current_edge) + + boundary_edges = next_boundary_edges + + if plot: + plt.subplot(331 + iters) + self.plot(title='final') + plt.tight_layout() + + def plot(self, title="", s=3): + + plt.title(title) + for edge in self.edges: + coords = self.edges[edge]['coords'] + if len(self.edges[edge]['simplices']) < 2: + color = 'magenta' + if self.edges[edge]['length'] > 2 * self.d_max: + color = 'red' + else: + color = 'cyan' + plt.plot(coords[:, 0], coords[:, 1], color=color) + + plt.scatter(self.d.points[:, 0], self.d.points[:, 1], color='black', s=s) + plt.axis('equal') + plt.axis('off') + + def calculate_part_2(self, plot=True): + edges = self.edges + d = self.d + d_max = self.d_max + + boundary_edges = [edge for edge in edges if len(edges[edge]['simplices']) < 2] + boundary_edges_length = len(boundary_edges) + next_boundary_edges = [] + + if plot: + plt.figure(figsize=(10, 10)) + + iters = 0 + while len(next_boundary_edges) != boundary_edges_length: + next_boundary_edges = [] + + iters += 1 + if plot: + plt.subplot(330 + iters) + self.plot(title=f"iteration: {iters}") + + for current_edge in boundary_edges: + if current_edge not in edges: # yeah, it changes + continue + + # need to think about! 
+ if len(edges[current_edge]['simplices'].keys()) == 0: + del edges[current_edge] + continue + + simplex_id = list(edges[current_edge]['simplices'].keys())[0] + simplex = d.simplices[simplex_id] + if ( + edges[current_edge]['length'] > 1.5 * d_max and + edges[current_edge]['simplices'][simplex_id] > 90 + ) or edges[current_edge]['simplices'][simplex_id] > 180 - 180 / 16: + + # delete edge and the simplex start + for edge in self.get_edges_from_simplex(simplex): + if edge != current_edge: + edges[edge]['simplices'].pop(simplex_id) + next_boundary_edges.append(edge) + + del edges[current_edge] + # delete edge and the simple end + + else: + next_boundary_edges.append(current_edge) + + boundary_edges_length = len(boundary_edges) + boundary_edges = next_boundary_edges + + if plot: + plt.subplot(331 + iters) + self.plot(title='final') + plt.tight_layout() + + def calculate_part_3(self): # inside boundary hole identification + # TODO + pass + + def find_cycles(self): + e = self.edges + boundary_edges = [edge for edge in e if len(e[edge]['simplices']) < 2] + self.graph = self.generate_graph(boundary_edges) + cycles = self.get_cycles(self.graph) + try: + if len(cycles) == 1: + geom = Polygon(self.d.points[cycles[0]]) + else: + geom = MultiPolygon([Polygon(self.d.points[c]) for c in cycles if len(c) >= 3]) + except Exception as e: + print(e, cycles) + return None + + return geom + + @staticmethod + def calculate_d_max(points): + index = rtree.index.Index() + for i, p in enumerate(points): + index.insert(i, p[[0, 1, 0, 1]]) + + short_edges = [] + for i, p in enumerate(points): + res = list(index.nearest(p[[0, 1, 0, 1]], 2))[-1] + short_edges.append([i, res]) + + nearest_points = points[short_edges] + + nearest_dists = ( + (nearest_points[:, 0, 0] - nearest_points[:, 1, 0]) ** 2 + + (nearest_points[:, 0, 1] - nearest_points[:, 1, 1]) ** 2 + ) ** 0.5 + d_max = nearest_dists.max() + + return d_max + + @staticmethod + def get_edges_from_simplex(simplex): + edges = [] + for p in range(3): + edges.append(tuple(sorted((simplex[p], simplex[(p + 1) % 3])))) + + return edges + + @staticmethod + def generate_graph(edges): + vertices = set([]) + for edge in edges: + vertices.add(edge[0]) + vertices.add(edge[1]) + + vertices = sorted(list(vertices)) + graph = {v: [] for v in vertices} + + for e in edges: + graph[e[0]].append(e[1]) + graph[e[1]].append(e[0]) + + return graph + + @staticmethod + def get_cycles(graph: dict): + colors = {v: 0 for v in graph} + cycles = [] + + for v in graph.keys(): + if colors[v] == 0: + cycle = [] + dfs(v, graph, cycle, colors) + cycles.append(cycle) + + return cycles + + +def generate_boundaries(df, x='x_location', y='y_location', cell_id='segger_cell_id'): + res = [] + group_df = df.groupby(cell_id) + for cell_id, t in tqdm(group_df, total=len(group_df)): + res.append({ + "cell_id": cell_id, + "length": len(t), + "geom": generate_boundary(t, x=x, y=y) + }) + + return gpd.GeoDataFrame( + data=[[b['cell_id'], b['length']] for b in res], + geometry=[b['geom'] for b in res], + columns=['cell_id', 'length'] + ) + + +def generate_boundary(t, x='x_location', y='y_location'): + if len(t) < 3: + return None + + bi = BoundaryIdentification(t[[x, y]].values) + bi.calculate_part_1(plot=False) + bi.calculate_part_2(plot=False) + geom = bi.find_cycles() + + return geom + + +if __name__ == '__main__': + points = np.array([ + [0, 0], # Point 0 + [3, 0], # Point 1 + [0, 4], # Point 2 + [5, 5], # Point 3 + [1, 6], # Point 4 + ]) + + simplices = triangles = np.array([ + [0, 1, 2], # Triangle formed 
by points 0, 1, 2 + [1, 3, 4], # Triangle formed by points 1, 3, 4 + ]) + + angles = triangle_angles_from_points(points, triangles) + print("Angles of each triangle (in degrees):") + print(angles) diff --git a/src/segger/prediction/predict_parquet.py b/src/segger/prediction/predict_parquet.py index 1d03281..ad235a3 100644 --- a/src/segger/prediction/predict_parquet.py +++ b/src/segger/prediction/predict_parquet.py @@ -21,6 +21,8 @@ ) from segger.training.train import LitSegger from segger.training.segger_data_module import SeggerDataModule +from segger.prediction.boundary import generate_boundaries + from scipy.sparse.csgraph import connected_components as cc from typing import Union, Dict import dask.dataframe as dd @@ -682,7 +684,10 @@ def segment( step_start_time = time() print(f"Computing and saving cell masks anndata object...") # Placeholder for future cell masks implementation as Dask Geopandas Parquet + boundaries_gdf = generate_boundaries(transcripts_df_filtered) cell_masks_save_path = save_dir / "segger_cell_boundaries.parquet" + + boundaries_gdf.to_parquet(cell_masks_save_path) if verbose: elapsed_time = time() - step_start_time print(f"Saved cell masks in {elapsed_time:.2f} seconds.") From cace92e901cc0f0ed906e3e12f440d90263dbbfb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 18 Oct 2024 15:09:21 +0000 Subject: [PATCH 123/156] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/segger/prediction/boundary.py | 131 +++++++++++++++--------------- 1 file changed, 65 insertions(+), 66 deletions(-) diff --git a/src/segger/prediction/boundary.py b/src/segger/prediction/boundary.py index 3135b65..e1865f9 100644 --- a/src/segger/prediction/boundary.py +++ b/src/segger/prediction/boundary.py @@ -49,34 +49,34 @@ def dfs(v, graph, path, colors): dfs(d, graph, path, colors) -def plot_points(points, s=3, color='black', zorder=None): +def plot_points(points, s=3, color="black", zorder=None): plt.scatter(points[:, 0], points[:, 1], color=color, s=s, zorder=zorder) def plot_edges(edges, d_max, part=1): if part == 1: for edge in edges: - coords = edges[edge]['coords'] - if len(edges[edge]['simplices']) < 2: - color = 'magenta' - if edges[edge]['length'] > 2 * d_max: - color = 'red' + coords = edges[edge]["coords"] + if len(edges[edge]["simplices"]) < 2: + color = "magenta" + if edges[edge]["length"] > 2 * d_max: + color = "red" # if edges[edge]['simplices'].values()[0] else: - color = 'cyan' + color = "cyan" plt.plot(coords[:, 0], coords[:, 1], color=color) if part == 2: for edge in edges: - coords = edges[edge]['coords'] - if len(edges[edge]['simplices']) < 2: - color = 'magenta' - if edges[edge]['length'] > 2 * d_max: - color = 'red' + coords = edges[edge]["coords"] + if len(edges[edge]["simplices"]) < 2: + color = "magenta" + if edges[edge]["length"] > 2 * d_max: + color = "red" # if edges[edge]['simplices'].values()[0] else: - color = 'cyan' + color = "cyan" # print(coords) plt.plot(coords[:, 0], coords[:, 1], color=color) @@ -110,13 +110,13 @@ def generate_edges(self): edges_coordinates = d.points[np.array(list(edges.keys()))] edges_length = ( - (edges_coordinates[:, 1, 0] - edges_coordinates[:, 0, 0]) ** 2 + - (edges_coordinates[:, 1, 1] - edges_coordinates[:, 0, 1]) ** 2 - ) ** 0.5 + (edges_coordinates[:, 1, 0] - edges_coordinates[:, 0, 0]) ** 2 + + (edges_coordinates[:, 1, 1] - edges_coordinates[:, 0, 1]) ** 2 + ) ** 0.5 for edge, coords, length in zip(edges, 
edges_coordinates, edges_length): - edges[edge]['coords'] = coords - edges[edge]['length'] = length + edges[edge]["coords"] = coords + edges[edge]["length"] = length self.edges = edges @@ -125,7 +125,7 @@ def calculate_part_1(self, plot=True): d = self.d d_max = self.d_max - boundary_edges = [edge for edge in edges if len(edges[edge]['simplices']) < 2] + boundary_edges = [edge for edge in edges if len(edges[edge]["simplices"]) < 2] if plot: plt.figure(figsize=(10, 10)) @@ -145,18 +145,18 @@ def calculate_part_1(self, plot=True): if current_edge not in edges: # yeah, it changes continue - if edges[current_edge]['length'] > 2 * d_max: - if len(edges[current_edge]['simplices'].keys()) == 0: + if edges[current_edge]["length"] > 2 * d_max: + if len(edges[current_edge]["simplices"].keys()) == 0: del edges[current_edge] continue - simplex_id = list(edges[current_edge]['simplices'].keys())[0] + simplex_id = list(edges[current_edge]["simplices"].keys())[0] simplex = d.simplices[simplex_id] # delete edge and the simplex start for edge in self.get_edges_from_simplex(simplex): if edge != current_edge: - edges[edge]['simplices'].pop(simplex_id) + edges[edge]["simplices"].pop(simplex_id) next_boundary_edges.append(edge) del edges[current_edge] @@ -170,32 +170,32 @@ def calculate_part_1(self, plot=True): if plot: plt.subplot(331 + iters) - self.plot(title='final') + self.plot(title="final") plt.tight_layout() def plot(self, title="", s=3): plt.title(title) for edge in self.edges: - coords = self.edges[edge]['coords'] - if len(self.edges[edge]['simplices']) < 2: - color = 'magenta' - if self.edges[edge]['length'] > 2 * self.d_max: - color = 'red' + coords = self.edges[edge]["coords"] + if len(self.edges[edge]["simplices"]) < 2: + color = "magenta" + if self.edges[edge]["length"] > 2 * self.d_max: + color = "red" else: - color = 'cyan' + color = "cyan" plt.plot(coords[:, 0], coords[:, 1], color=color) - plt.scatter(self.d.points[:, 0], self.d.points[:, 1], color='black', s=s) - plt.axis('equal') - plt.axis('off') + plt.scatter(self.d.points[:, 0], self.d.points[:, 1], color="black", s=s) + plt.axis("equal") + plt.axis("off") def calculate_part_2(self, plot=True): edges = self.edges d = self.d d_max = self.d_max - boundary_edges = [edge for edge in edges if len(edges[edge]['simplices']) < 2] + boundary_edges = [edge for edge in edges if len(edges[edge]["simplices"]) < 2] boundary_edges_length = len(boundary_edges) next_boundary_edges = [] @@ -216,21 +216,20 @@ def calculate_part_2(self, plot=True): continue # need to think about! 
- if len(edges[current_edge]['simplices'].keys()) == 0: + if len(edges[current_edge]["simplices"].keys()) == 0: del edges[current_edge] continue - simplex_id = list(edges[current_edge]['simplices'].keys())[0] + simplex_id = list(edges[current_edge]["simplices"].keys())[0] simplex = d.simplices[simplex_id] if ( - edges[current_edge]['length'] > 1.5 * d_max and - edges[current_edge]['simplices'][simplex_id] > 90 - ) or edges[current_edge]['simplices'][simplex_id] > 180 - 180 / 16: + edges[current_edge]["length"] > 1.5 * d_max and edges[current_edge]["simplices"][simplex_id] > 90 + ) or edges[current_edge]["simplices"][simplex_id] > 180 - 180 / 16: # delete edge and the simplex start for edge in self.get_edges_from_simplex(simplex): if edge != current_edge: - edges[edge]['simplices'].pop(simplex_id) + edges[edge]["simplices"].pop(simplex_id) next_boundary_edges.append(edge) del edges[current_edge] @@ -244,7 +243,7 @@ def calculate_part_2(self, plot=True): if plot: plt.subplot(331 + iters) - self.plot(title='final') + self.plot(title="final") plt.tight_layout() def calculate_part_3(self): # inside boundary hole identification @@ -253,7 +252,7 @@ def calculate_part_3(self): # inside boundary hole identification def find_cycles(self): e = self.edges - boundary_edges = [edge for edge in e if len(e[edge]['simplices']) < 2] + boundary_edges = [edge for edge in e if len(e[edge]["simplices"]) < 2] self.graph = self.generate_graph(boundary_edges) cycles = self.get_cycles(self.graph) try: @@ -281,8 +280,8 @@ def calculate_d_max(points): nearest_points = points[short_edges] nearest_dists = ( - (nearest_points[:, 0, 0] - nearest_points[:, 1, 0]) ** 2 + - (nearest_points[:, 0, 1] - nearest_points[:, 1, 1]) ** 2 + (nearest_points[:, 0, 0] - nearest_points[:, 1, 0]) ** 2 + + (nearest_points[:, 0, 1] - nearest_points[:, 1, 1]) ** 2 ) ** 0.5 d_max = nearest_dists.max() @@ -326,24 +325,20 @@ def get_cycles(graph: dict): return cycles -def generate_boundaries(df, x='x_location', y='y_location', cell_id='segger_cell_id'): +def generate_boundaries(df, x="x_location", y="y_location", cell_id="segger_cell_id"): res = [] group_df = df.groupby(cell_id) for cell_id, t in tqdm(group_df, total=len(group_df)): - res.append({ - "cell_id": cell_id, - "length": len(t), - "geom": generate_boundary(t, x=x, y=y) - }) + res.append({"cell_id": cell_id, "length": len(t), "geom": generate_boundary(t, x=x, y=y)}) return gpd.GeoDataFrame( - data=[[b['cell_id'], b['length']] for b in res], - geometry=[b['geom'] for b in res], - columns=['cell_id', 'length'] + data=[[b["cell_id"], b["length"]] for b in res], + geometry=[b["geom"] for b in res], + columns=["cell_id", "length"], ) -def generate_boundary(t, x='x_location', y='y_location'): +def generate_boundary(t, x="x_location", y="y_location"): if len(t) < 3: return None @@ -355,19 +350,23 @@ def generate_boundary(t, x='x_location', y='y_location'): return geom -if __name__ == '__main__': - points = np.array([ - [0, 0], # Point 0 - [3, 0], # Point 1 - [0, 4], # Point 2 - [5, 5], # Point 3 - [1, 6], # Point 4 - ]) +if __name__ == "__main__": + points = np.array( + [ + [0, 0], # Point 0 + [3, 0], # Point 1 + [0, 4], # Point 2 + [5, 5], # Point 3 + [1, 6], # Point 4 + ] + ) - simplices = triangles = np.array([ - [0, 1, 2], # Triangle formed by points 0, 1, 2 - [1, 3, 4], # Triangle formed by points 1, 3, 4 - ]) + simplices = triangles = np.array( + [ + [0, 1, 2], # Triangle formed by points 0, 1, 2 + [1, 3, 4], # Triangle formed by points 1, 3, 4 + ] + ) angles = 
triangle_angles_from_points(points, triangles) print("Angles of each triangle (in degrees):") From 70cd033547f89e90fb04fd66de871b3a6a35bdf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Fri, 18 Oct 2024 18:38:49 +0200 Subject: [PATCH 124/156] Added documentation/tutorial for CLI --- docs/cli.md | 314 ++++++++++++++++++++++-------------------- scripts/config.yaml | 2 - scripts/submit_job.py | 2 +- 3 files changed, 162 insertions(+), 156 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 2b49f65..5138607 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -1,91 +1,60 @@ ## Segger Command Line Interface - - -This section will simulate typing the `segger --help` command output. - - - - -```console -$ segger --help - -Usage: segger [OPTIONS] COMMAND [ARGS]... - -╭─ Commands ─────────────────────────────────────────────────────╮ -│ create_dataset Create a dataset for spatial transcriptomics │ -│ train Train the model using the prepared dataset │ -│ predict Run predictions using a trained model │ -╰────────────────────────────────────────────────────────────────╯ -``` - - ### 1. Creating a Dataset -The `create_dataset` command helps you build a dataset for spatial transcriptomics. Here’s a breakdown of the options available: +The `create_dataset` command helps you to build a dataset for spatial transcriptomics. Here’s a breakdown of the options available: ```console -// Example: Create a dataset for spatial transcriptomics -python create_dataset create_dataset \ - --dataset_dir /path/to/dataset \ +// Example: Creating a dataset for spatial transcriptomics +python3 src/segger/cli/create_dataset_fast.py \ + --base_dir /path/to/raw_data \ --data_dir /path/to/save/processed_data \ - --sample_tag sample_name \ - --transcripts_file transcripts.parquet \ - --boundaries_file nucleus_boundaries.parquet \ - --x_size 300 \ - --y_size 300 \ - --d_x 280 \ - --d_y 280 \ - --margin_x 10 \ - --margin_y 10 \ - --r_tx 5 \ - --k_tx 5 \ + --sample_type xenium \ + --k_bd 3 \ + --dist_bd 15.0 \ + --k_tx 3 \ + --dist_tx 5.0 \ + --tile_width 200 \ + --tile_height 200 \ + --neg_sampling_ratio 5.0 \ + --frac 1.0 \ --val_prob 0.1 \ --test_prob 0.2 \ - --neg_sampling_ratio 5 \ - --sampling_rate 1 \ - --workers 4 \ - --gpu + --n_workers 16 ``` #### Parameters -| Parameter | Description | Default Value | -|----------------------|----------------------------------------------------------------------------------------|---------------| -| `dataset_type` | Specifies the type of dataset (e.g., `xenium`, `merscope`). | `xenium` | -| `dataset_dir` | Path to the directory where raw data is stored. | None | -| `sample_tag` | Tag to identify the dataset, useful for version control. | None | -| `transcripts_file` | File path to the transcript data in Parquet format. | None | -| `boundaries_file` | File path to the nucleus or cell boundaries data in Parquet format. | None | -| `data_dir` | Directory to store processed datasets (used during model training). | None | -| `x_size`, `y_size` | Size of the tiles in the x and y directions. | `300` | -| `d_x`, `d_y` | Step size in the x and y directions for overlapping tiles. | `280` | -| `margin_x`, `margin_y`| Additional margins added to each tile in the x and y directions. | `10` | -| `r_tx` | Radius for computing the neighborhood graph for transcripts. | `5` | -| `k_tx` | Number of nearest neighbors for the neighborhood graph. | `5` | -| `val_prob` | Proportion of the dataset used for validation. | `0.1` | -| `test_prob` | Proportion of the dataset used for testing. 
| `0.2` | -| `compute_labels` | Flag to enable or disable the computation of labels for segmentation. | `True` | -| `neg_sampling_ratio` | Approximate ratio for negative sampling. | `5` | -| `sampling_rate` | Proportion of the dataset to sample (useful for large datasets). | `1` (no sampling) | -| `workers` | Number of CPU cores to use for parallel processing. | `1` | -| `gpu` | Whether to use a GPU for processing. | `False` | +| Parameter | Description | Default Value | +|----------------------|-----------------------------------------------------------------------------------------|---------------| +| `base_dir` | Directory containing the raw dataset (e.g., transcripts, boundaries). | None | +| `data_dir` | Directory to save the processed Segger dataset (in PyTorch Geometric format). | None | +| `sample_type` | The sample type of the raw data, e.g., "xenium" or "merscope". | `xenium` | +| `k_bd` | Number of nearest neighbors for boundary nodes. | `3` | +| `dist_bd` | Maximum distance for boundary neighbors. | `15.0` | +| `k_tx` | Number of nearest neighbors for transcript nodes. | `3` | +| `dist_tx` | Maximum distance for transcript neighbors. | `5.0` | +| `tile_width` | Width of the tiles in pixels (ignored if `tile_size` is provided). | `200` | +| `tile_height` | Height of the tiles in pixels (ignored if `tile_size` is provided). | `200` | +| `neg_sampling_ratio` | Ratio of negative samples. | `5.0` | +| `frac` | Fraction of the dataset to process. Useful for subsampling large datasets. | `1.0` | +| `val_prob` | Proportion of the dataset used for validation split. | `0.1` | +| `test_prob` | Proportion of the dataset used for testing split. | `0.2` | +| `n_workers` | Number of workers for parallel processing. | `1` | + ### Key Updates: -- **Bounding box options (`x_min`, `y_min`, etc.)** were removed. -- **`x_size`, `y_size`** now refer to tile sizes, not bounding boxes. -- **`MerscopeSample`** is added as a supported dataset type alongside `XeniumSample`. -- **`r_tx` and `k_tx`** refer to parameters for computing neighborhood graphs. -- **`neg_sampling_ratio`** is included for negative sampling. +- **Faster Dataset Creation** This method is way faster due to the use of ND-tree-based partitioning and parallel processing. !!! note "Customizing Your Dataset" - - **dataset_type**: Defines the type of spatial transcriptomics data. - - **x_size, y_size**: Bounding box dimensions are important for memory efficiency. - - **val_prob, test_prob**: Adjust these probabilities based on the need for model validation and testing. + - **dataset_type**: Defines the type of spatial transcriptomics data. Currently, **xenium** and **merscope** are supported and have been tested. + - **val_prob, test_prob**: Control the dataset portions for validation and testing. Adjust based on your dataset size and evaluation needs. + - **frac**: Specifies the fraction of the dataset to process. Reducing `frac` can be useful when working with very large datasets, allowing for faster dataset creation by only processing a subset of the data. + !!! tip "Faster Dataset Creation" - You can reduce the **sampling_rate** to process only a subset of your dataset, which is useful for large datasets. + Increasing the number of workers (`n_workers`) can significantly accelerate the dataset creation process, especially for large datasets, by taking advantage of parallel processing across multiple CPU cores. --- @@ -95,121 +64,160 @@ The `train` command initializes and trains a model using the dataset created. 
He ```console -// Example: Train the model using SLURM -$ segger train slurm \ - --data_dir data_tidy/pyg_datasets \ - --batch_size_train 32 \ - --batch_size_val 16 \ - --init_emb 128 \ - --hidden_channels 256 \ - --out_channels 3 \ - --heads 8 \ - --aggr mean \ - --accelerator gpu \ - --strategy ddp \ - --precision 16 \ +// Example: Training a segger model +$ python3 src/segger/cli/train_model.py \ + --dataset_dir /path/to/saved/processed_data \ + --models_dir /path/to/save/model/checkpoints \ + --sample_tag first_training \ + --init_emb 8 \ + --hidden_channels 32 \ + --num_tx_tokens 500 \ + --out_channels 8 \ + --heads 2 \ + --num_mid_layers 2 \ + --batch_size 4 \ + --num_workers 2 \ + --accelerator cuda \ + --max_epochs 200 \ --devices 4 \ - --epochs 100 \ - --model_dir /path/to/save/model/checkpoints + --strategy auto \ + --precision 16-mixed ``` #### Parameters -| Parameter | Description | Default Value | -|--------------------|----------------------------------------------------------------------------------------|---------------| -| `data_dir` | Directory containing the dataset to be used for training. | None | -| `batch_size_train` | Number of samples to process per training batch. | `32` | -| `batch_size_val` | Number of samples to process per validation batch. | `16` | -| `init_emb` | Size of the initial embedding for the input data. | `128` | -| `hidden_channels` | Number of hidden units in each layer of the neural network. | `256` | -| `out_channels` | Number of output channels. | `3` | -| `heads` | Number of attention heads used in graph attention layers. | `8` | -| `aggr` | Aggregation method for attention layers (e.g., `mean`, `sum`). | `mean` | -| `accelerator` | Device used for training (e.g., `gpu` or `cpu`). | `gpu` | -| `strategy` | Strategy for distributed training (e.g., `ddp` for Distributed Data Parallel). | `ddp` | -| `precision` | Floating-point precision for training (e.g., `16` for FP16). | `16` | -| `devices` | Number of devices (GPUs or CPUs) to use during training. | `4` | -| `epochs` | Number of training epochs. | `100` | -| `model_dir` | Directory to save model checkpoints. | None | - -!!! tip "Adjusting for Your Hardware" - - **batch_size_train**: For larger datasets, you might need to decrease this value based on your GPU memory. - - **epochs**: Increasing the number of epochs can lead to better model performance but will take longer to train. - -!!! warning "Ensure Correct GPU Setup" - Before using the `--accelerator gpu` flag, make sure your system supports GPU computation and that CUDA is properly installed. +| Parameter | Description | Default Value | +|--------------------|-----------------------------------------------------------------------------------------|---------------| +| `dataset_dir` | Directory containing the processed Segger dataset (in PyTorch Geometric format). | None | +| `models_dir` | Directory to save the trained model and training logs. | None | +| `sample_tag` | Tag used to identify the dataset during training. | None | +| `init_emb` | Size of the embedding layer for input data. | `8` | +| `hidden_channels` | Number of hidden units in each layer of the neural network. | `32` | +| `num_tx_tokens` | Number of transcript tokens used during training. | `500` | +| `out_channels` | Number of output channels from the model. | `8` | +| `heads` | Number of attention heads used in graph attention layers. | `2` | +| `num_mid_layers` | Number of mid layers in the model. 
| `2` | +| `batch_size` | Number of samples to process per training batch. | `4` | +| `num_workers` | Number of workers to use for parallel data loading. | `2` | +| `accelerator` | Device used for training (e.g., `cuda` for GPU or `cpu`). | `cuda` | +| `max_epochs` | Number of training epochs. | `200` | +| `devices` | Number of devices (GPUs) to use during training. | `4` | +| `strategy` | Strategy used for training (e.g., `ddp` for distributed training or `auto`). | `auto` | +| `precision` | Precision used for training (e.g., `16-mixed` for mixed precision training). | `16-mixed` | + +!!! tip "Optimizing training time" + - **devices**: Use multiple GPUs by increasing the `devices` parameter to further accelerate training. + - **batch_size**: A larger batch size can speed up training, but requires more memory. Adjust based on your hardware capabilities. + - **epochs**: Increasing the number of epochs can improve model performance by allowing more learning cycles, but it will also extend the overall training time. Balance this based on your time constraints and hardware capacity. + +!!! warning "Ensure Correct CUDA and PyTorch Setup" + Before using the `--accelerator cuda` flag, ensure your system has CUDA installed and configured correctly. Also, check that the installed CUDA version is compatible with your PyTorch and PyTorch Geometric versions. --- ### 3. Making Predictions -After training the model, use the `predict` command to make predictions on new data. +After training the model, use the `predict` command to make predictions on new data: ```console // Example: Make predictions using a trained model -$ segger predict \ - --dataset_path /path/to/new/dataset \ - --checkpoint_path /path/to/saved/checkpoint \ - --output_path /path/to/save/predictions.csv \ - --batch_size 16 \ - --workers 4 \ - --score_cut 0.5 \ - --use_cc true +$ python3 src/segger/cli/predict_fast.py \ + --segger_data_dir /path/to/saved/processed_data \ + --models_dir /path/to/saved/model/checkpoints \ + --benchmarks_dir /path/to/save/segmentation/results \ + --transcripts_file /path/to/raw_data/transcripts.parquet \ + --batch_size 1 \ + --num_workers 1 \ + --model_version 0 \ + --save_tag segger_embedding_1001 \ + --min_transcripts 5 \ + --cell_id_col segger_cell_id \ + --use_cc false \ + --knn_method cuda \ + --file_format anndata \ + --k_bd 4 \ + --dist_bd 12.0 \ + --k_tx 5 \ + --dist_tx 5.0 ``` #### Parameters -| Parameter | Description | Default Value | -|--------------------|----------------------------------------------------------------------------------------|---------------| -| `dataset_path` | Path to the dataset for which predictions will be made. | None | -| `checkpoint_path` | Path to the saved model checkpoint from training. | None | -| `output_path` | File where the predictions will be saved. | None | -| `batch_size` | Number of samples processed simultaneously during prediction. | `16` | -| `workers` | Number of CPU cores used for parallel processing during prediction. | `4` | -| `score_cut` | Cutoff threshold for confidence scores in predictions. | `0.5` | -| `use_cc` | Enable connected component analysis to refine predictions. | `true` | +| Parameter | Description | Default Value | +|-----------------------|------------------------------------------------------------------------------------------|---------------| +| `segger_data_dir` | Directory containing the processed Segger dataset (in PyTorch Geometric format). | None | +| `models_dir` | Directory containing the trained models. 
| None | +| `benchmarks_dir` | Directory to save the segmentation results, including cell boundaries and associations. | None | +| `transcripts_file` | Path to the transcripts.parquet file. | None | +| `batch_size` | Number of samples to process per batch during prediction. | `1` | +| `num_workers` | Number of workers for parallel data loading. | `1` | +| `model_version` | Model version number to load for predictions, corresponding to the version from training logs. | `0` | +| `save_tag` | Tag used to name and organize the segmentation results. | `segger_embedding_1001` | +| `min_transcripts` | Minimum number of transcripts required for segmentation. | `5` | +| `cell_id_col` | Column name for cell IDs in the output data. | `segger_cell_id` | +| `use_cc` | Whether to use connected components for grouping transcripts without direct nucleus association. | `False` | +| `knn_method` | Method for KNN computation (e.g., `cuda` for GPU-based computation). | `cuda` | +| `file_format` | Format for the output segmentation data (e.g., `anndata`). | `anndata` | +| `k_bd` | Number of nearest neighbors for boundary nodes. | `4` | +| `dist_bd` | Maximum distance for boundary nodes. | `12.0` | +| `k_tx` | Number of nearest neighbors for transcript nodes. | `5` | +| `dist_tx` | Maximum distance for transcript nodes. | `5.0` | + +!!! tip "Improving Prediction Pipeline" + - **batch_size**: A larger batch size can speed up training, but requires more memory. Adjust based on your hardware capabilities. + - **use_cc**: Enabling connected component analysis can improve the accuracy of transcript assignments. + +!!! warning "Ensure Correct CUDA, cuVS, and PyTorch Setup" + Before using the `knn_method cuda` flag, ensure your system has CUDA installed and configured properly. Also, verify that the installed CUDA version is compatible with your cuPy, cuVS, PyTorch, and PyTorch Geometric versions. + +--- -!!! tip "Improve Prediction Efficiency" - - **batch_size**: Adjust this based on the size of the dataset and available GPU memory. - - **use_cc**: Enabling connected component analysis can improve the accuracy of transcript assignments +### 4. Running the Entire Pipeline -. +The `submit_job.py` script allows you to run the complete Segger pipeline or specific stages like dataset creation, training, or prediction. The pipeline execution is determined by the configuration provided in a YAML file, supporting various environments like Docker, Singularity, and HPC systems (with LSF, Slurm support is planned). ---- +#### Selecting Pipelines +You can run the three stages—dataset creation, training, and prediction—sequentially or independently by specifying the pipelines in the YAML configuration file: -### 4. Utility Commands and Reports + - `1` for dataset creation + - `2` for model training + - `3` for prediction -Segger includes utility commands for checking dataset and model setup as well as generating reports. +This allows you to run the full pipeline or just specific steps. Set the desired stages under the pipelines field in your YAML file. 
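+
+For example, a minimal configuration that runs only dataset creation and training might look like the sketch below (the `use_lsf`, `use_singularity`, and `pipelines` fields follow `scripts/config.yaml`; the stage-specific sections with paths and resources are omitted here and must be filled in for your own setup):
+
+```yaml
+use_lsf: false         # run in the current environment instead of submitting LSF jobs
+use_singularity: false # set to true to run inside a Singularity image
+pipelines:
+  - 1 # Run data processing
+  - 2 # Run training
+```
+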
+#### Running the Pipeline + +Use the following command to run the pipeline: ```console -// Example: Check dataset and model setup -$ segger check \ - --dataset_dir data_raw/xenium \ - --model_dir /path/to/model/checkpoints - -// Example: Generate a report -$ segger report \ - --dataset_path /path/to/dataset \ - --output_path /path/to/report.html +$ python3 submit_job.py --config_file=filename.yaml ``` -#### Parameters for `check` +- If no `--config_file` is provided, the default `config.yaml` file will be used. -| Parameter | Description | -|------------------|------------------------------------------------------| -| `dataset_dir` | Path to the raw dataset. | -| `model_dir` | Path to the directory where model checkpoints are saved. | +#### Using Docker -#### Parameters for `report` +If you prefer using Docker, you can pull the Segger container with the following command: + +```console +$ docker pull danielunyi42/segger_dev:latest +``` + +To run the pipeline in a Conda environment or Docker, ensure that `use_singularity: false` and `use_lsf: false` are set in your YAML configuration file. Make sure you are in a terminal inside your Conda environment or inside Docker, and then type the command above. + +#### Using Singularity +For Singularity containerized environments, pull the Singularity image with: + +```console +$ singularity pull docker://danielunyi42/segger_dev:latest +``` -| Parameter | Description | -|------------------|------------------------------------------------------| -| `dataset_path` | Path to the dataset for which the report will be generated. | -| `output_path` | Path where the HTML report will be saved. | +Set `use_singularity: true` in the YAML file and specify the Singularity image (e.g., `segger_dev_latest.sif`) in the `singularity_image` field. -!!! info "Utility Commands" - - Use `check` to verify that your dataset and model are correctly set up. - - The `report` command provides a detailed HTML output of your model's performance. +!!! note "Containerization" + - Docker and Singularity provides a safe and portable environment with everything pre-installed. + - The Docker image currently supports CUDA 12.1. A CUDA 11.8 compatible version will be added soon. +#### Running on HPC with LSF +For HPC environments, the pipeline supports job submission via LSF. Set `use_lsf: true` in the YAML configuration to enable LSF job scheduling. The current setup is for LSF, but Slurm support is planned soon. 
diff --git a/scripts/config.yaml b/scripts/config.yaml index 4165861..1e79a03 100644 --- a/scripts/config.yaml +++ b/scripts/config.yaml @@ -2,8 +2,6 @@ use_lsf: true use_singularity: false use_debugpy: false -config_file: "config.yaml" - pipelines: - 1 # Run data processing - 2 # Run training diff --git a/scripts/submit_job.py b/scripts/submit_job.py index 9952ae3..28c392b 100644 --- a/scripts/submit_job.py +++ b/scripts/submit_job.py @@ -187,7 +187,7 @@ def run_prediction(): get_singularity_command(use_gpu=True) + get_python_command() + [ - f"{repo_dir}/src/segger/cli/predict.py", + f"{repo_dir}/src/segger/cli/predict_fast.py", "--segger_data_dir", config["prediction"]["segger_data_dir"], "--models_dir", From 010b65d98638bd4f023bed2ca1989665876c3732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Fri, 18 Oct 2024 19:15:41 +0200 Subject: [PATCH 125/156] Finalizing docs/cli.md --- docs/cli.md | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 5138607..6847a1c 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -44,7 +44,7 @@ python3 src/segger/cli/create_dataset_fast.py \ | `n_workers` | Number of workers for parallel processing. | `1` | -### Key Updates: +#### Key Updates - **Faster Dataset Creation** This method is way faster due to the use of ND-tree-based partitioning and parallel processing. !!! note "Customizing Your Dataset" @@ -189,35 +189,46 @@ This allows you to run the full pipeline or just specific steps. Set the desired #### Running the Pipeline Use the following command to run the pipeline: - + ```console -$ python3 submit_job.py --config_file=filename.yaml +python3 submit_job.py --config_file=filename.yaml ``` - If no `--config_file` is provided, the default `config.yaml` file will be used. +### 5. Containerization + +For users who want a portable, containerized environment, segger supports both Docker and Singularity containers. These containers provide a consistent runtime environment with all dependencies pre-installed. + #### Using Docker -If you prefer using Docker, you can pull the Segger container with the following command: - +You can pull the segger Docker image from Docker Hub with this command: + ```console -$ docker pull danielunyi42/segger_dev:latest +docker pull danielunyi42/segger_dev:latest ``` -To run the pipeline in a Conda environment or Docker, ensure that `use_singularity: false` and `use_lsf: false` are set in your YAML configuration file. Make sure you are in a terminal inside your Conda environment or inside Docker, and then type the command above. +To run the pipeline in Docker, make sure your YAML configuration includes the following settings: + +- `use_singularity`: false +- `use_lsf`: false + +Afterwards, run the pipeline inside the Docker container with the same `submit_job.py` command. #### Using Singularity -For Singularity containerized environments, pull the Singularity image with: - +For a Singularity environment, pull the image with: + ```console -$ singularity pull docker://danielunyi42/segger_dev:latest +singularity pull docker://danielunyi42/segger_dev:latest ``` -Set `use_singularity: true` in the YAML file and specify the Singularity image (e.g., `segger_dev_latest.sif`) in the `singularity_image` field. +Ensure `use_singularity: true` in the YAML file and specify the Singularity image file (e.g., `segger_dev_latest.sif`) in the `singularity_image` field. !!! 
note "Containerization" - - Docker and Singularity provides a safe and portable environment with everything pre-installed. - - The Docker image currently supports CUDA 12.1. A CUDA 11.8 compatible version will be added soon. + - The segger Docker image currently supports CUDA 12.1. A CUDA 11.8 compatible version will be added soon. + +### 6. HPC Environments + +Segger also supports HPC environments with LSF job scheduling. To run the pipeline on an HPC cluster using LSF, set `use_lsf: true` in your YAML configuration. -#### Running on HPC with LSF -For HPC environments, the pipeline supports job submission via LSF. Set `use_lsf: true` in the YAML configuration to enable LSF job scheduling. The current setup is for LSF, but Slurm support is planned soon. +If your HPC system supports Slurm, a similar setup is planned and will be introduced soon. From b6027a8711c719052419dca1e1248e6c2dc92662 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Sat, 19 Oct 2024 01:44:21 +0200 Subject: [PATCH 126/156] Update README with docker image and pyg dependency --- README.md | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index a077fa5..66a7bd0 100644 --- a/README.md +++ b/README.md @@ -51,24 +51,22 @@ segger tackles these with a **graph-based approach**, achieving superior segment ### Important: PyTorch Geometric Dependencies -Segger **highly depends** on PyTorch Geometric. One **must** install its dependencies (such as `torch-sparse` and `torch-scatter`) based on their system’s specifications, especially CUDA and PyTorch versions. +Segger **relies heavily** on PyTorch Geometric for its graph-based operations. One **must** install its dependencies (such as `torch-sparse` and `torch-scatter`) based on their system’s specifications, especially the **CUDA** and **PyTorch** versions. Please follow the official [PyTorch Geometric Installation Guide](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html) to install the correct versions of `torch-sparse`, `torch-scatter`, and other relevant libraries. -Here’s how to install them manually, e.g., for torch 2.0.0: +Below is a quick guide for installing PyTorch Geometric dependencies for **torch 2.4.0**: #### For CUDA 11.x: ```bash -pip install torch-scatter -f https://data.pyg.org/whl/torch-2.0.0+cu117.html -pip install torch-sparse -f https://data.pyg.org/whl/torch-2.0.0+cu117.html +pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.4.0+cu121.html ``` #### For CUDA 12.x: ```bash -pip install torch-scatter -f https://data.pyg.org/whl/torch-2.0.0+cu120.html -pip install torch-sparse -f https://data.pyg.org/whl/torch-2.0.0+cu120.html +pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.4.0+cu118.html ``` Afterwards choose the installation method that best suits your needs. @@ -112,6 +110,22 @@ For installations requiring RAPIDS and CUDA 12 support, run: pip install -e ".[rapids12]" ``` +### Docker Installation +Segger provides an easy-to-use Docker container for those who prefer a containerized environment. To pull the latest Docker image: + +```bash +docker pull danielunyi42/segger_dev:latest +``` + +The Docker image comes with all dependencies packaged, including RAPIDS. It currently supports only CUDA 12.2, and we will soon release a version that supports CUDA 11.8. 
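+
+As a rough sketch, a typical way to work inside the container is to expose the GPUs and mount a local data directory (the mount paths and the trailing `bash` command below are illustrative; adjust them to your setup):
+
+```bash
+# Illustrative invocation: mount your data under /workspace/data and start an interactive shell
+docker run --gpus all -it \
+    -v /path/to/your/data:/workspace/data \
+    danielunyi42/segger_dev:latest bash
+```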
+ +### Singularity Installation +For users who prefer Singularity, you can pull the Docker image as follows: + +```bash +singularity pull docker://danielunyi42/segger_dev:latest +``` + --- # Powered by From 4317080de59e450ca29f2741f2b228e943d58a28 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 18 Oct 2024 23:44:37 +0000 Subject: [PATCH 127/156] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 66a7bd0..df20333 100644 --- a/README.md +++ b/README.md @@ -111,6 +111,7 @@ pip install -e ".[rapids12]" ``` ### Docker Installation + Segger provides an easy-to-use Docker container for those who prefer a containerized environment. To pull the latest Docker image: ```bash @@ -120,6 +121,7 @@ docker pull danielunyi42/segger_dev:latest The Docker image comes with all dependencies packaged, including RAPIDS. It currently supports only CUDA 12.2, and we will soon release a version that supports CUDA 11.8. ### Singularity Installation + For users who prefer Singularity, you can pull the Docker image as follows: ```bash From 85c7f5960dfae2d5f793c33890135661c985562f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Sat, 19 Oct 2024 02:11:23 +0200 Subject: [PATCH 128/156] Containerization and improvement of installation.md --- docs/installation.md | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index ec3c284..df45ab5 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -6,8 +6,8 @@ Select the appropriate installation method based on your requirements. ```bash micromamba create -n segger-rapids --channel-priority 1 \ -c rapidsai -c conda-forge -c nvidia -c pytorch -c pyg \ - rapids=24.08 python=3.* 'cuda-version>=11.4,<=11.8' jupyterlab \ - 'pytorch=*=*cuda*' 'pyg=*=*cu118' pyg-lib pytorch-sparse + rapids=24.10 python=3.* 'cuda-version>=12.0,<=12.1' jupyterlab \ + 'pytorch=*=*cuda*' 'pyg=*=*cu121' pyg-lib pytorch-sparse micromamba install -n segger-rapids --channel-priority 1 --file mamba_environment.yml micromamba run -n segger-rapids pip install --no-deps ./ ``` @@ -16,36 +16,39 @@ micromamba run -n segger-rapids pip install --no-deps ./ ```bash conda create -n segger-env python=3.10 conda activate segger-env -conda install pytorch torchvision torchaudio cudatoolkit=11.7 -c pytorch +conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia conda install pyg -c pyg pip install . ``` -=== ":octocat: Github Installation" +=== ":whale: Docker Installation" ```bash -git clone https://github.com/EliHei2/segger_dev.git -cd segger_dev -pip install . +docker pull danielunyi42/segger_dev:latest ``` -=== ":package: Pip Installation (CUDA 11)" +The Docker image comes with all required packages pre-installed, including PyTorch, RAPIDS, and PyTorch Geometric. The current image supports CUDA 12.1, and we are working on another image that will support CUDA 11.8 soon. + +For users who prefer Singularity: + ```bash -pip install -e ".[cuda11]" +singularity pull docker://danielunyi42/segger_dev:latest ``` -=== ":package: Pip Installation (CUDA 12)" +=== ":octocat: Github Installation" ```bash -pip install -e ".[cuda12]" +git clone https://github.com/EliHei2/segger_dev.git +cd segger_dev +pip install -e "." 
``` -=== ":rocket: Pip Installation (RAPIDS and CUDA 11)" +=== ":rocket: Pip Installation (RAPIDS with CUDA 11)" ```bash -pip install "segger[cuda11,rapids11,cupy11,faiss]" +pip install "segger[rapids11]" ``` -=== ":rocket: Pip Installation (RAPIDS and CUDA 12)" +=== ":rocket: Pip Installation (RAPIDS with CUDA 12)" ```bash -pip install "segger[cuda12,rapids12,cupy12,faiss]" +pip install "segger[rapids12]" ``` !!! warning "Common Installation Issues" From ffae0333c77f2036e9b5dbe795c03d7b865e3f9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Sat, 19 Oct 2024 02:14:50 +0200 Subject: [PATCH 129/156] Small fix in installation.md --- docs/installation.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index df45ab5..470cba6 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -41,12 +41,12 @@ cd segger_dev pip install -e "." ``` -=== ":rocket: Pip Installation (RAPIDS with CUDA 11)" +=== ":rocket: Pip Installation of RAPIDS with CUDA 11" ```bash pip install "segger[rapids11]" ``` -=== ":rocket: Pip Installation (RAPIDS with CUDA 12)" +=== ":rocket: Pip Installation of RAPIDS with CUDA 12" ```bash pip install "segger[rapids12]" ``` From dee1b1b232bee45b8d10961f1f2967407cbc95d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Sat, 19 Oct 2024 16:19:20 +0200 Subject: [PATCH 130/156] Fix in submit_job --- scripts/submit_job.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/submit_job.py b/scripts/submit_job.py index 28c392b..938e51b 100644 --- a/scripts/submit_job.py +++ b/scripts/submit_job.py @@ -101,7 +101,7 @@ def run_data_processing(): "-R", f"rusage[mem={config['preprocessing']['memory']}]", "-q", - "medium", + "long", ] + command try: @@ -169,7 +169,7 @@ def run_training(): "-R", "tensorcore", "-gpu", - f"num={config['training']['gpus']}:j_exclusive=no:gmem={config['training']['gpu_memory']}", + f"num={config['training']['devices']}:j_exclusive=no:gmem={config['training']['gpu_memory']}", "-q", "gpu", ] + command From 40c007eba834f4da1453c2a506fc7c34addb6c33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Mon, 21 Oct 2024 16:11:48 +0200 Subject: [PATCH 131/156] Using transcript embeddings in fast prediction pipeline. --- .../cli/configs/create_dataset/default_fast.yaml | 8 ++++++++ src/segger/cli/create_dataset_fast.py | 15 +++++++++++++++ src/segger/data/parquet/sample.py | 9 ++++++++- 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/segger/cli/configs/create_dataset/default_fast.yaml b/src/segger/cli/configs/create_dataset/default_fast.yaml index 72f6294..2b959e8 100644 --- a/src/segger/cli/configs/create_dataset/default_fast.yaml +++ b/src/segger/cli/configs/create_dataset/default_fast.yaml @@ -10,6 +10,14 @@ sample_type: type: str default: null help: The sample type of the raw data, e.g., "xenium" or "merscope". +scrnaseq_file: + type: str + default: null + help: Name of the scRNA-seq file. +celltype_column: + type: str + default: null + help: Column name for cell type annotations in the scRNAseq file. 
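+# Note: scrnaseq_file and celltype_column are optional; when both are provided, a gene x
+# cell-type abundance embedding is computed from the scRNA-seq reference and passed to
+# STSampleParquet as transcript embedding weights (see create_dataset_fast.py below).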
k_bd: type: int default: 3 diff --git a/src/segger/cli/create_dataset_fast.py b/src/segger/cli/create_dataset_fast.py index 33a3a63..5e94d39 100644 --- a/src/segger/cli/create_dataset_fast.py +++ b/src/segger/cli/create_dataset_fast.py @@ -1,5 +1,7 @@ import click import os +import scanpy as sc +from segger.data.utils import calculate_gene_celltype_abundance_embedding from segger.cli.utils import add_options, CustomFormatter from pathlib import Path import logging @@ -22,6 +24,8 @@ @click.option( "--sample_type", type=str, default=None, help='The sample type of the raw data, e.g., "xenium" or "merscope".' ) +@click.option("--scrnaseq_file", type=Path, default=None, help="Path to the scRNAseq file.") +@click.option("--celltype_column", type=str, default=None, help="Column name for cell type annotations in the scRNAseq file.") @click.option("--k_bd", type=int, default=3, help="Number of nearest neighbors for boundary nodes.") @click.option("--dist_bd", type=float, default=15.0, help="Maximum distance for boundary neighbors.") @click.option("--k_tx", type=int, default=3, help="Number of nearest neighbors for transcript nodes.") @@ -51,12 +55,23 @@ def create_dataset(args: Namespace): ch.setFormatter(CustomFormatter()) logging.basicConfig(level=logging.INFO, handlers=[ch]) + # If scRNAseq file is provided, calculate gene-celltype embeddings + if args.scrnaseq_file: + logging.info("Calculating gene and celltype embeddings...") + scRNAseq = sc.read(args.scrnaseq_file) + sc.pp.subsample(scRNAseq, 0.1) + gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding( + scRNAseq, + args.celltype_column + ) + # Initialize the sample class logging.info("Initializing sample...") sample = STSampleParquet( base_dir=args.base_dir, n_workers=args.n_workers, sample_type=args.sample_type, + weights=gene_celltype_abundance_embedding, ) # Save Segger dataset diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index 729632a..8c89d8d 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -35,6 +35,7 @@ def __init__( base_dir: os.PathLike, n_workers: Optional[int] = 1, sample_type: str = None, + weights: pd.DataFrame = None, ): """ Initializes the STSampleParquet instance. 
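Together with the `weights` argument above, the new `--scrnaseq_file` and `--celltype_column` options let the fast dataset-creation CLI incorporate cell-type information from an scRNA-seq reference. A typical invocation might look like the sketch below (paths and the `celltype_minor` column name are placeholders; the remaining flags keep the defaults from `default_fast.yaml`):

```bash
python3 create_dataset_fast.py \
    --base_dir path/to/xenium_output \
    --data_dir path/to/segger_dataset \
    --sample_type xenium \
    --scrnaseq_file path/to/scrnaseq.h5ad \
    --celltype_column celltype_minor \
    --n_workers 8
```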
@@ -73,8 +74,10 @@ def __init__( self._boundaries_metadata = None # Setup default embedding for transcripts + if weights is not None: + self.emb_genes = weights.index.to_list() classes = self.transcripts_metadata["feature_names"] - self._transcript_embedding = TranscriptEmbedding(np.array(classes)) + self._transcript_embedding = TranscriptEmbedding(np.array(classes), weights) @classmethod def _get_parquet_metadata( @@ -167,6 +170,10 @@ def transcripts_metadata(self) -> dict: # Get filtered unique feature names table = pq.read_table(self._transcripts_filepath) names = pc.unique(table[self.settings.transcripts.label]) + if self.emb_genes is not None: + # Filter substring is extended with the genes missing in the embedding + missing_genes = list(set(names.to_pylist()) - set(self.emb_genes)) + self.settings.transcripts.filter_substrings.extend(missing_genes) pattern = "|".join(self.settings.transcripts.filter_substrings) mask = pc.invert(pc.match_substring_regex(names, pattern)) metadata["feature_names"] = pc.filter(names, mask).tolist() From 5efb4c5a9e239a5d175590ff489fe9c2451de6a7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 21 Oct 2024 14:12:54 +0000 Subject: [PATCH 132/156] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/segger/cli/create_dataset_fast.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/segger/cli/create_dataset_fast.py b/src/segger/cli/create_dataset_fast.py index 5e94d39..94ebe21 100644 --- a/src/segger/cli/create_dataset_fast.py +++ b/src/segger/cli/create_dataset_fast.py @@ -25,7 +25,9 @@ "--sample_type", type=str, default=None, help='The sample type of the raw data, e.g., "xenium" or "merscope".' ) @click.option("--scrnaseq_file", type=Path, default=None, help="Path to the scRNAseq file.") -@click.option("--celltype_column", type=str, default=None, help="Column name for cell type annotations in the scRNAseq file.") +@click.option( + "--celltype_column", type=str, default=None, help="Column name for cell type annotations in the scRNAseq file." 
+) @click.option("--k_bd", type=int, default=3, help="Number of nearest neighbors for boundary nodes.") @click.option("--dist_bd", type=float, default=15.0, help="Maximum distance for boundary neighbors.") @click.option("--k_tx", type=int, default=3, help="Number of nearest neighbors for transcript nodes.") @@ -60,10 +62,7 @@ def create_dataset(args: Namespace): logging.info("Calculating gene and celltype embeddings...") scRNAseq = sc.read(args.scrnaseq_file) sc.pp.subsample(scRNAseq, 0.1) - gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding( - scRNAseq, - args.celltype_column - ) + gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(scRNAseq, args.celltype_column) # Initialize the sample class logging.info("Initializing sample...") From ea41ed7844cb6d6e7e801adab1937f973b81d952 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Mon, 21 Oct 2024 16:31:35 +0200 Subject: [PATCH 133/156] CLI update --- scripts/config.yaml | 2 ++ scripts/submit_job.py | 24 +++++++++++++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/scripts/config.yaml b/scripts/config.yaml index 1e79a03..a2121e3 100644 --- a/scripts/config.yaml +++ b/scripts/config.yaml @@ -12,6 +12,8 @@ preprocessing: base_dir: "data_xenium" data_dir: "data_segger" sample_type: "xenium" + scrnaseq_file: "scrnaseq.h5ad" + celltype_column: "celltype_minor" k_bd: 3 dist_bd: 15.0 k_tx: 3 diff --git a/scripts/submit_job.py b/scripts/submit_job.py index 938e51b..35151ec 100644 --- a/scripts/submit_job.py +++ b/scripts/submit_job.py @@ -2,6 +2,7 @@ import subprocess import argparse import os +import time parser = argparse.ArgumentParser() parser.add_argument("--config", default="config.yaml", help="Path to the configuration YAML file") @@ -17,6 +18,7 @@ # Get the base directory repo_dir = config["container_dir"] if config.get("use_singularity", False) else config["local_repo_dir"] +time_stamp = time.strftime("%Y%m%d-%H%M%S") # Function to get Singularity command if enabled def get_singularity_command(use_gpu=False): @@ -61,6 +63,10 @@ def run_data_processing(): config["preprocessing"]["data_dir"], "--sample_type", config["preprocessing"]["sample_type"], + "--scrnaseq_file", + config["preprocessing"]["scrnaseq_file"], + "--celltype_column", + config["preprocessing"]["celltype_column"], "--k_bd", str(config["preprocessing"]["k_bd"]), "--dist_bd", @@ -93,7 +99,7 @@ def run_data_processing(): command = [ "bsub", "-J", - "job_data_processing", + f"job_data_processing_{time_stamp}", "-o", config["preprocessing"]["output_log"], "-n", @@ -157,9 +163,7 @@ def run_training(): command = [ "bsub", "-J", - "job_training", - "-w", - "done(job_data_processing)", + f"job_training_{time_stamp}", "-o", config["training"]["output_log"], "-n", @@ -173,6 +177,9 @@ def run_training(): "-q", "gpu", ] + command + # only run training after data_processing + if "1" in config["pipelines"]: + command[4:4] = ["-w", f"done(job_data_processing_{time_stamp})"] try: print(f"Running command: {command}") @@ -229,9 +236,7 @@ def run_prediction(): command = [ "bsub", "-J", - "job_prediction", - "-w", - "done(job_training)", + f"job_prediction_{time_stamp}", "-o", config["prediction"]["output_log"], "-n", @@ -245,6 +250,11 @@ def run_prediction(): "-q", "gpu", ] + command + # only run prediction after training/data_processing + if "2" in config["pipelines"]: + command[4:4] = ["-w", f"done(job_training_{time_stamp})"] + elif "1" in config["pipelines"]: + command[4:4] = ["-w", 
f"done(job_data_processing_{time_stamp})"] try: print(f"Running command: {command}") From 0427a9fbf40b8fc11cde4da426c0be7ab82c8efb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 21 Oct 2024 14:33:17 +0000 Subject: [PATCH 134/156] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/submit_job.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/submit_job.py b/scripts/submit_job.py index 35151ec..0949ab3 100644 --- a/scripts/submit_job.py +++ b/scripts/submit_job.py @@ -20,6 +20,7 @@ time_stamp = time.strftime("%Y%m%d-%H%M%S") + # Function to get Singularity command if enabled def get_singularity_command(use_gpu=False): if config.get("use_singularity", False): From 6cb46607516742b73d502a478ecc1d229a707dbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Mon, 21 Oct 2024 23:45:18 +0200 Subject: [PATCH 135/156] Python notebook tutorial for segger --- docs/notebooks/segger_tutorial.ipynb | 1939 ++++++++++++++------------ 1 file changed, 1032 insertions(+), 907 deletions(-) diff --git a/docs/notebooks/segger_tutorial.ipynb b/docs/notebooks/segger_tutorial.ipynb index d1daa90..680495e 100644 --- a/docs/notebooks/segger_tutorial.ipynb +++ b/docs/notebooks/segger_tutorial.ipynb @@ -1,917 +1,1042 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "21ed4db6-5234-46b1-9f38-b5883ac88946", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-11T22:22:55.404267Z", - "iopub.status.busy": "2024-09-11T22:22:55.403876Z", - "iopub.status.idle": "2024-09-11T22:22:58.089917Z", - "shell.execute_reply": "2024-09-11T22:22:58.089303Z", - "shell.execute_reply.started": "2024-09-11T22:22:55.404248Z" - } - }, - "source": [ - "# **Introduction to Segger**\n", - "\n", - "Segger is a cutting-edge cell segmentation model specifically designed for **single-molecule resolved spatial omics** datasets. It addresses the challenge of accurately segmenting individual cells in complex imaging datasets, leveraging a unique approach based on graph neural networks (GNNs). \n", - "\n", - "The core idea behind Segger is to model both **nuclei** and **transcripts** as graph nodes, with edges connecting them based on their spatial proximity. This allows the model to learn from the co-occurrence of nucleic and cytoplasmic molecules, resulting in more refined and accurate cell boundaries. By using spatial information and GNNs, Segger achieves state-of-the-art performance in segmenting single cells in datasets such as 10X Xenium and MERSCOPE, outperforming traditional methods like Baysor and Cellpose.\n", - "\n", - "Segger's workflow consists of:\n", - "1. **Dataset creation**: Converting raw transcriptomic data into a graph-based dataset.\n", - "2. **Training**: Training the Segger model on the graph to learn cell boundaries.\n", - "3. **Prediction**: Using the trained model to make predictions on new datasets.\n", - "\n", - "This tutorial will guide you through each step of the process, ensuring you can train and apply Segger for your own data." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "546982ee-2cee-4a66-9086-435d0cde0167", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-12T00:48:04.549084Z", - "iopub.status.busy": "2024-09-12T00:48:04.548821Z", - "iopub.status.idle": "2024-09-12T00:48:58.719362Z", - "shell.execute_reply": "2024-09-12T00:48:58.718889Z", - "shell.execute_reply.started": "2024-09-12T00:48:04.549070Z" - } - }, - "outputs": [], - "source": [ - "from segger.data.io import XeniumSample\n", - "from segger.training.train import LitSegger\n", - "from segger.training.segger_data_module import SeggerDataModule\n", - "from segger.prediction.predict import predict, load_model\n", - "from lightning.pytorch.loggers import CSVLogger\n", - "from pytorch_lightning import Trainer\n", - "from pathlib import Path\n", - "from lightning.pytorch.plugins.environments import LightningEnvironment\n", - "from matplotlib import pyplot as plt\n", - "import seaborn as sns" - ] - }, - { - "cell_type": "markdown", - "id": "db009015-c379-4f50-97ed-81dca9df28ac", - "metadata": {}, - "source": [ - "# **1. Create your Segger Dataset**\n", - "\n", - "In this step, we generate the dataset required for Segger's cell segmentation tasks. \n", - "\n", - "Segger relies on spatial transcriptomics data, combining **nuclei** and **transcripts** from single-cell resolved imaging datasets. These nuclei and transcript nodes are represented in a graph, and the spatial proximity of transcripts to nuclei is used to establish edges between them.\n", - "\n", - "To use Segger with a Xenium dataset, you need the **`transcripts.csv.gz`** and **`nucleus_boundaries.csv.gz`** files. The **transcripts** file contains spatial coordinates and information for each transcript, while the **nucleus boundaries** file defines the polygon boundaries of the nuclei. These files enable Segger to map transcripts to their respective nuclei and perform cell segmentation based on spatial relationships. Segger can also be extended to other platforms by modifying the column names or formats in the input files to match its expected structure, making it adaptable for various spatial transcriptomics technologies." - ] - }, - { - "cell_type": "markdown", - "id": "4e5df7f3-7f36-45b4-b7da-b301513efbce", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-11T22:56:41.967336Z", - "iopub.status.busy": "2024-09-11T22:56:41.966988Z" - } - }, - "source": [ - "To create the dataset, you need to specify the path to the **transcripts** file and the **nuclei boundaries** file. These are typically downloaded from a spatial transcriptomics dataset like the [Xenium Human Pancreatic Dataset](https://www.10xgenomics.com/products/xenium-human-pancreatic-dataset-explorer).\n", - "\n", - "- **`--transcripts_path`**: Path to the transcripts file, which contains single-cell transcriptomic data.\n", - "- **`--boundaries_path`**: Path to the boundaries file, most often representing the nuclei boundaries in the imaging dataset." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "edd35db3-56e4-4a3e-9309-f83133274d47", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-12T00:49:03.132088Z", - "iopub.status.busy": "2024-09-12T00:49:03.131930Z", - "iopub.status.idle": "2024-09-12T00:49:05.472827Z", - "shell.execute_reply": "2024-09-12T00:49:05.472394Z", - "shell.execute_reply.started": "2024-09-12T00:49:03.132072Z" - } - }, - "outputs": [], - "source": [ - "# Paths to Xenium sample data and where to store Segger data\n", - "xenium_data_dir = Path('path/to/tutorial/xenium_data')\n", - "segger_data_dir = Path('path/to/tutorial/segger_data/')\n", - "\n", - "# Setup Xenium sample to create dataset\n", - "xs = XeniumSample(verbose=False)\n", - "xs.set_file_paths(\n", - " transcripts_path=xenium_data_dir / 'transcripts.parquet',\n", - " boundaries_path=xenium_data_dir / 'nucleus_boundaries.parquet',\n", - ")\n", - "xs.set_metadata()" - ] - }, - { - "cell_type": "markdown", - "id": "33bd04f6-c4e3-42f8-81b2-c1e483d9faaf", - "metadata": {}, - "source": [ - "The following parameters are used to build a tiled Segger dataset:\n", - "\n", - "- **`--processed_dir`**: Directory where the processed dataset will be saved.\n", - "- **`--x_size`, `--y_size`**: These parameters specify the size of the tiles used to divide the image. The size of the tiles determines how the spatial region is partitioned for processing.\n", - "- **`--d_x`, `--d_y`**: These define the step size of the spatial grid used to bin transcripts and nuclei into tiles.\n", - "- **`--r_tx`**: Specifies the radius used for graph construction. A smaller radius will connect transcripts to nearby nuclei, while a larger radius might connect them to more distant neighbors.\n", - "- **`--scale_boundaries`**: The factor by which to scale the boundary polygons. Suggested to keep `=1` when boundaries refer to nuclei.\n", - "- **`--k_tx`**: Defines the number of nearest neighbors considered when building graphs for transcripts (`k_tx`).\n", - "- **`--val_prob` and `--test_prob`**: These control the proportion of the dataset that will be set aside for validation and testing. For instance, `--val_prob 0.1` means 10% of the data will be used for validation.\n", - "- **`--compute_labels`**: When set to `True`, this flag triggers the computation of labels (cell assignments) for each transcript. 
Use False if you just plan to perform prediction using a pre-existing model.\n", - "\n", - "Once the dataset is processed, the output will be ready for training the Segger model.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "c8cf7102-ad9c-4bd0-bbd7-61a1d73abccd", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-12T00:49:06.357990Z", - "iopub.status.busy": "2024-09-12T00:49:06.357793Z", - "iopub.status.idle": "2024-09-12T00:49:07.235307Z", - "shell.execute_reply": "2024-09-12T00:49:07.234925Z", - "shell.execute_reply.started": "2024-09-12T00:49:06.357975Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset already exists at ../../dev/tutorial/segger_data\n" - ] - } - ], - "source": [ - "try:\n", - " xs.save_dataset_for_segger(\n", - " processed_dir=segger_data_dir,\n", - " r_tx=5,\n", - " k_tx=15,\n", - " x_size=120,\n", - " y_size=120,\n", - " d_x=100,\n", - " d_y=100,\n", - " margin_x=10,\n", - " margin_y=10,\n", - " scale_boundaries=1,\n", - " num_workers=4, # change to you number of CPUs\n", - " )\n", - "except AssertionError as err:\n", - " print(f'Dataset already exists at {segger_data_dir}')" - ] - }, - { - "cell_type": "markdown", - "id": "9d2b090b", - "metadata": {}, - "source": [ - "### **1.2 Faster Dataset Creation with Segger**\n", - "\n", - "Segger introduces a faster, more efficient pipeline for processing spatial transcriptomics data. This method accelerates dataset creation, particularly for large datasets, by using **ND-tree-based spatial partitioning** and **parallel processing**. This results in a much faster preparation of the dataset, which is saved in PyTorch Geometric (PyG) format, similar to the previous method.\n", - "\n", - "**Note**: The previous dataset creation method will soon be deprecated in favor of this optimized pipeline.\n", - "\n", - "#### **Requirements for the Faster Pipeline**\n", - "The pipeline requires the following inputs:\n", - "\n", - "- **base_dir**: The directory containing the raw dataset.\n", - "- **data_dir**: The directory where the processed dataset (tiles in PyG format) will be saved.\n", - "\n", - "The core improvements in this method come from the use of **ND-tree partitioning**, which splits the data efficiently into spatial regions, and **parallel processing**, which speeds up the handling of these regions across multiple CPU cores. 
For example, using this pipeline, the Xenium Human Pancreatic Dataset can be processed in just a few minutes when running with 16 workers.\n", - "\n", - "#### **Running the Faster Dataset Creation Pipeline**\n", - "Below is an example of how to create a dataset using the faster Segger pipeline:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e933ebf3", - "metadata": {}, - "outputs": [], - "source": [ - "!python3 create_dataset_fast.py --base_dir path/to/raw_data \\\n", - "--data_dir path/to/save/processed_data \\\n", - "--sample_type xenium \\\n", - "--k_bd 3 \\\n", - "--dist_bd 15.0 \\\n", - "--k_tx 3 \\\n", - "--dist_tx 5.0 \\\n", - "--tile_width 200 \\\n", - "--tile_height 200 \\\n", - "--neg_sampling_ratio 5.0 \\\n", - "--frac 1.0 \\\n", - "--val_prob 0.1 \\\n", - "--test_prob 0.2 \\\n", - "--n_workers 12" - ] - }, - { - "cell_type": "markdown", - "id": "6ab27f9a", - "metadata": {}, - "source": [ - "#### **Parameters**\n", - "Here is a complete list of parameters you can use to control the dataset creation process:\n", - "\n", - "- **--base_dir**: Directory containing the raw spatial transcriptomics dataset.\n", - "- **--data_dir**: Directory where the processed Segger dataset (in PyG format) will be saved.\n", - "- **--sample_type**: (Optional) Specifies the type of dataset (e.g., \"xenium\" or \"merscope\"). Defaults to None.\n", - "- **--k_bd**: Number of nearest neighbors for boundary nodes (default: 3).\n", - "- **--dist_bd**: Maximum distance for boundary neighbors (default: 15.0).\n", - "- **--k_tx**: Number of nearest neighbors for transcript nodes (default: 3).\n", - "- **--dist_tx**: Maximum distance for transcript neighbors (default: 5.0).\n", - "- **--tile_size**: Specifies the size of the tile. If provided, it overrides both tile_width and tile_height.\n", - "- **--tile_width**: Width of the tiles in pixels (ignored if tile_size is provided).\n", - "- **--tile_height**: Height of the tiles in pixels (ignored if tile_size is provided).\n", - "- **--neg_sampling_ratio**: Ratio of negative samples (default: 5.0).\n", - "- **--frac**: Fraction of the dataset to process (default: 1.0).\n", - "- **--val_prob**: Proportion of data used for validation split (default: 0.1).\n", - "- **--test_prob**: Proportion of data used for testing split (default: 0.2).\n", - "- **--n_workers**: Number of workers for parallel processing (default: 1)." - ] - }, - { - "cell_type": "markdown", - "id": "9962e4b8-4028-4683-9b75-d674fa6fb01d", - "metadata": {}, - "source": [ - "# **2. Train your Segger Model**\n", - "\n", - "The Segger model training process begins after the dataset has been created. This model is a **heterogeneous graph neural network (GNN)** designed to segment single cells by leveraging both nuclei and transcript data. \n", - "\n", - "Segger uses graph attention layers to propagate information across nodes (nuclei and transcripts) and refine cell boundaries. The model architecture includes initial embedding layers, attention-based graph convolutions, and residual connections for stable learning.\n", - "\n", - "Segger leverages the **PyTorch Lightning** framework to streamline the training and evaluation of its graph neural network (GNN). PyTorch Lightning simplifies the training process by abstracting away much of the boilerplate code, allowing users to focus on model development and experimentation. 
It also supports multi-GPU training, mixed-precision, and efficient scaling, making it an ideal framework for training complex models like Segger.\n" - ] - }, - { - "cell_type": "markdown", - "id": "8cbf5be9-27f3-45c2-ab28-8d8ceb078745", - "metadata": {}, - "source": [ - "Key parameters for training:\n", - "- **`--data_dir`**: Directory containing the training data.\n", - "- **`--model_dir`**: Directory in which to store models.\n", - "- **`--epochs`**: Specifies the number of training epochs.\n", - "- **`--batch_size`**: Batch sizes for training and validation data.\n", - "- **`--learning_rate`**: The initial learning rate for the optimizer.\n", - "- **`--hidden_channels`**: Number of hidden channels in the GNN layers.\n", - "- **`--heads`**: Number of attention heads used in each graph convolutional layer.\n", - "- **`--init_emb`**: Sets the dimensionality of the initial embeddings applied to the input node features (e.g., transcripts). A higher embedding dimension may capture more feature complexity but also requires more computation.\n", - "- **`--out_channels`**: Specifies the number of output channels after the final graph attention layer, e.g. the final learned representations of the graph nodes.\n", - "\n", - "Additional Options for Training the Segger Model:\n", - "\n", - "- **`--aggr`**: This option controls the aggregation method used in the graph convolution layers.\n", - "- **`--accelerator`**: Controls the hardware used for training, such as `cuda` for GPU training. This enables Segger to leverage GPU resources for faster training, especially useful for large datasets.\n", - "- **`--strategy`**: Defines the distributed training strategy, with `auto` allowing PyTorch Lightning to automatically configure the best strategy based on the hardware setup.\n", - "- **`--precision`**: Enables mixed precision training (e.g., `16-mixed`), which can speed up training and reduce memory usage while maintaining accuracy." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4db89cb4-d0eb-426a-a71f-d127926fa412", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-12T00:49:07.236043Z", - "iopub.status.busy": "2024-09-12T00:49:07.235854Z", - "iopub.status.idle": "2024-09-12T00:49:08.351946Z", - "shell.execute_reply": "2024-09-12T00:49:08.351565Z", - "shell.execute_reply.started": "2024-09-12T00:49:07.236028Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/lilac/data/peer/moormana/.micromamba/envs/segger-rapids-11.8/lib/python3.11/site-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", - " torch.has_cuda,\n", - "/lilac/data/peer/moormana/.micromamba/envs/segger-rapids-11.8/lib/python3.11/site-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", - " torch.has_cudnn,\n", - "/lilac/data/peer/moormana/.micromamba/envs/segger-rapids-11.8/lib/python3.11/site-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", - " torch.has_mps,\n", - "/lilac/data/peer/moormana/.micromamba/envs/segger-rapids-11.8/lib/python3.11/site-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", - " torch.has_mkldnn,\n", - "Using 16bit Automatic Mixed Precision (AMP)\n", - "GPU available: True (cuda), used: True\n", - "TPU available: False, using: 0 TPU cores\n", - "HPU available: False, using: 0 HPUs\n" - ] - } - ], - "source": [ - "# Base directory to store Pytorch Lightning models\n", - "models_dir = Path('path/to/tutorial/models/')\n", - "\n", - "# Initialize the Lightning model\n", - "metadata = ([\"tx\", \"bd\"], [(\"tx\", \"belongs\", \"bd\"), (\"tx\", \"neighbors\", \"tx\")])\n", - "ls = LitSegger(\n", - " num_tx_tokens=500,\n", - " init_emb=8,\n", - " hidden_channels=32,\n", - " out_channels=8,\n", - " heads=2,\n", - " num_mid_layers=2,\n", - " aggr='sum',\n", - " metadata=metadata,\n", - ")\n", - "\n", - "# Initialize the Lightning data module\n", - "dm = SeggerDataModule(\n", - " data_dir=segger_data_dir,\n", - " batch_size=2, \n", - " num_workers=2, \n", - ")\n", - "\n", - "dm.setup()\n", - "\n", - "\n", - "# if you wish to use more than 1 device for training you should run this:\n", - "batch = dm.train[0]\n", - "ls.forward(batch)\n", - "\n", - "# Initialize the Lightning trainer\n", - "trainer = Trainer(\n", - " accelerator='cuda', \n", - " strategy='auto',\n", - " precision='16-mixed',\n", - " devices=1, \n", - " max_epochs=100, \n", - " default_root_dir=models_dir,\n", - " logger=CSVLogger(models_dir),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "207864b8-7e52-4add-a4a2-e95a4debdc06", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# Fit model\n", - "trainer.fit(\n", - " model=ls,\n", - " datamodule=dm\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "9a7d20c6-ca16-4beb-b627-afb41e3fb491", - "metadata": {}, - "source": [ - "### *Troubleshooting #1*\n", - "\n", - "In the cell below, we are visualizing key metrics from the model training and validation process. The plot displays **training loss**, **validation loss**, **F1 validation score**, and **AUROC validation score** over training steps. 
We expect to see the loss curves decreasing over time, signaling the model's improvement, and the F1 and AUROC scores increasing, reflecting improved segmentation performance as the model learns.\n", - "\n", - "If training is not working effectively, you might observe the following in the plot displaying **training loss**, **validation loss**, **F1 score**, and **AUROC**:\n", - "\n", - "- **Training loss not decreasing**: If the training loss remains high or fluctuates without a consistent downward trend, this indicates that the model is not learning effectively from the training data.\n", - "- **Validation loss decreases, then increases**: If validation loss decreases initially but starts to increase while training loss continues to drop, this could be a sign of **overfitting**, where the model is performing well on the training data but not generalizing to the validation data.\n", - "- **F1 score and AUROC not improving**: If these metrics remain flat or show inconsistent improvement, the model may be struggling to correctly segment cells or classify transcripts, indicating an issue with learning performance.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 88, - "id": "43a9c1a4-3898-407d-ac0f-f98b13694593", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-11T22:06:58.182616Z", - "iopub.status.busy": "2024-09-11T22:06:58.182357Z", - "iopub.status.idle": "2024-09-11T22:07:01.063645Z", - "shell.execute_reply": "2024-09-11T22:07:01.063184Z", - "shell.execute_reply.started": "2024-09-11T22:06:58.182599Z" - } - }, - "outputs": [ + "cells": [ { - "data": { - "text/plain": [ - "Text(0.5, 0, 'Step')" + "cell_type": "markdown", + "id": "21ed4db6-5234-46b1-9f38-b5883ac88946", + "metadata": { + "execution": { + "iopub.execute_input": "2024-09-11T22:22:55.404267Z", + "iopub.status.busy": "2024-09-11T22:22:55.403876Z", + "iopub.status.idle": "2024-09-11T22:22:58.089917Z", + "shell.execute_reply": "2024-09-11T22:22:58.089303Z", + "shell.execute_reply.started": "2024-09-11T22:22:55.404248Z" + }, + "id": "21ed4db6-5234-46b1-9f38-b5883ac88946" + }, + "source": [ + "# **Introduction to Segger**\n", + "\n", + "Segger is a cutting-edge cell segmentation model specifically designed for **single-molecule resolved spatial omics** datasets. It addresses the challenge of accurately segmenting individual cells in complex imaging datasets, leveraging a unique approach based on graph neural networks (GNNs).\n", + "\n", + "The core idea behind Segger is to model both **nuclei** and **transcripts** as graph nodes, with edges connecting them based on their spatial proximity. This allows the model to learn from the co-occurrence of nucleic and cytoplasmic molecules, resulting in more refined and accurate cell boundaries. By using spatial information and GNNs, Segger achieves state-of-the-art performance in segmenting single cells in datasets such as 10X Xenium and MERSCOPE, outperforming traditional methods like Baysor and Cellpose.\n", + "\n", + "Segger's workflow consists of:\n", + "1. **Dataset creation**: Converting raw transcriptomic data into a graph-based dataset.\n", + "2. **Training**: Training the Segger model on the graph to learn cell boundaries.\n", + "3. **Prediction**: Using the trained model to make predictions on new datasets.\n", + "\n", + "This tutorial will guide you through each step of the process, ensuring you can train and apply Segger for your own data." 
] - }, - "execution_count": 88, - "metadata": {}, - "output_type": "execute_result" }, { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAA2oAAAKACAYAAADtg4tbAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/TGe4hAAAACXBIWXMAAC4jAAAuIwF4pT92AAEAAElEQVR4nOzdeXhTZdoG8PtkbZrupaVAadn3TUBQ2TcBBRQRZRAH1HHDEcdlnM8ZRcXRcWQUV0YBBVFHUQRFUAREQDYRAdnBAqVAge57mvV8f4ScnNMkbdqmTdrev+tiJsl5z8mbFEuePM/7vIIoiiKIiIiIiIgoZKiCPQEiIiIiIiJSYqBGREREREQUYhioERERERERhRgGakRERERERCGGgRoREREREVGIYaBGREREREQUYhioERERERERhRgGakRERERERCGGgRoREREREVGIYaBGREREREQUYhioERERERERhRgGakRERERERCGGgRoREREREVGIYaBGREREREQUYhioERERERERhRgGakRERERERCGGgRoREREREVGIYaBGREREREQUYhioERERERERhRgGakRERERERCGGgRoREREREVGIYaBGREREREQUYhioERERERERhRgGakRERERERCGGgRoREREREVGIYaBGREREREQUYhioERERERERhRgGakRERERERCGGgRoREREREVGICdlA7cSJEwgLC0NycnJArvfZZ59h1KhRiIuLQ1hYGNq3b48HH3wQJ0+eDMj1iYiIiIiIAkUQRVEM9iQqMpvNGDlyJHbu3IlWrVrh/PnzNb6W1WrF9OnTsXLlSq/Hw8PD8eGHH+LWW2+t8XMQEREREREFUshl1Gw2G26//Xbs3LkzINf761//KgVpY8aMwddff42dO3fiP//5D2JiYlBWVoYZM2bg119/DcjzERERERER1VZIZdRycnIwbdo0/PDDD9JjtcmoHT58GL1794bD4cCUKVPwxRdfQBAE6fjx48dx7bXXoqCgAEOHDsXWrVtr/RqIiIiIiIhqK2QyauvWrUO/fv2kIE2lqv3U3n77bTgcDmi1WixYsEARpAFAly5d8PTTTwMAtm3bxqwaERERERGFhJAI1G6//XZMmDABGRkZEAQBTz/9NIYMGVLr63799dcAgKFDh6J169Zex/zxj3+Ubq9atarWz0lERERERFRbIRGo/fzzzwCANm3aYOPGjXjhhRdqfc3Tp0/j0qVLAJyBmi8JCQno3LkzALD0kYiIiIiIQkJIBGotW7bEK6+8guPHj2PUqFEBueaxY8ek2x06dKh0bNu2bT3OISIiIiIiChZNsCcAANu3bw/ImjS5zMxM6XZKSkqlY1u1agUAyMvLg9lshl6vr9VzZ2VlITs7u1rnmM1mnDt3DrGxsYiPj0f79u0RFhZWq3kQEREREVHDFBKBWqCDNADIz8+XbkdERFQ61mg0SrcLCwuRmJhYq+deuHAhnn/++Vpd4/Dhw+jevXutrkFERERERA1TSJQ+1gWz2SzdriozZTAYvJ5HREREREQUDI02UJNn6Sq25a9IvpVcXWT3iIiIiIiIqiMkSh/rgrzcsby8vNKx8uO1XZ8GALNnz8bUqVOrdU5aWhpuvvnmWj83ERERERE1fI02UIuMjJRul5aWVjrWdVwQBMTExNT6uRMTE2u9zo2IiIiIiJquRlvnl5qaKt2+cOFCpWNdxxMSEqDRNNrYlYiIiIiIGohGG6h17dpVun3q1KlKx54+fdrjHCIiIiIiomBptIFay5YtkZycDADYsWOHz3HZ2dk4efIkAOC6666rl7kRERERERFVptEGagAwefJkAMDGjRsVG2DLffjhh9JtNvMgIiIiIqJQ0KgDtfvuuw9qtRpmsxkPPPAA7Ha74viJEyfw4osvAgAGDhyIAQMGBGOaREREDZpos8N+uQDWQ2dh/uE3WH4+CdFq8zrWdvoSSt5eh5I319bzLImIGpYG3TkjPT0dbdu2BQAMGzYMW7ZsURzv0aMHZs+ejbfeegvffPMNhg4discffxwtWrTA7t278cILL6CgoABarRZvv/12EF4BERFR9TiKTbCdvAD76csQoo3QdEiCOiUBgkbtObaoDLaj52A9cg62I2fhyC6C4Y5hCBvXt9LnsKVdRPnq3Shfswf2ywXQ9kiFblAX6K7rCk2vVNh/vwjL3jRYf02D9bd0OC7lAw5RcQ0hOhz6Ub2hH9cXmjYJKN9wAOZ1e2E7dt55PCocxgfGQdA16I8iRER1ptH/dnz11VeRmZmJL7/8Ejt37sTOnTsVx/V6PT788EP0798/SDMkIqJAEe0OiAWlcOQUwZFbBEeJGaqIMAjR4VBFGyHEGCEY9RAEwfNchwP2jBzA7oC6bSIElbLoxFFUhvK1e2HZcRSCXgd1crz0R4gyAioBUAkQVAIcOcWwHjsP27FzsB07D/ulfKjiIqBOioWqRSzUzWMAQYBYboFYboVotgI2OyACV/7H+X8OB0SHCIiAWFoO28lMOC7ker5wrRrqNolQRYVDLCmHo8QEsaQcYmGZx1DLrhNQf/13aLuneLx3pk+3wfTZT7AdzlAcs+47Beu+Uyh9a53/P4vCMpSv2oXyVbu8Hy8qg2XHMehH9PT7mkRETUmjD9S0Wi1WrlyJFStWYMmSJdi/fz+KiorQvHlzjB49Gn/961/RrVu3YE+TiKjJEi02OArLIKgECLFGzwCpsAy23zNhS7sIx+UCOLIK4cguhCO7CI4SE1BudQY75RaIpeUemZ2KhEgD1KkJzj+tEyAWlsJ69DxsJy8AJotzTHwkdNd1gX5wV6gSY1C+Zg/Kv9sHlFtq/DrtucWw/36xxudXymqH/feLsFc9ErDZUfT4UsSt+YeUzRJFEcXPfALT/7bVahpCpAGqhCjYz+cCFu+ljwCg6ZIM/YT+0HRuVavnIyJqzARRFCv/F43qxZEjR9CjRw/p/uHDh9G9e/cgzoiIGiNRFL1mk1zH7GkXYb+QB1VSDNStm0FlDFOOMVshFpsgxEf6vI7rWo7MPNjSLsJ+6hJsZ7MgFpbBUWyCWGyCWGSCo6gMYmEpxFKz+0SdBurmMVC1iIWg1zqDs4v5AXntjYkQEQZ1+xYQC0pgP5dTaXAqRBqg6Z4CdWoCyldslx43zpmAiMduAgCULf0Bxc9/pjhP0yMFYTcPhLZPO1j3psGy4xgsv6Q5g1WdBtqeqdD27wBtvw7QdGwBdfMYCOF6AICjpByWLYdQvn4/LD8ehFhqhqZLK+hv7I+wG/pD0z6pDt4VIqLGhYFaiGCgRkSVcQU+oskCiCJEuwOwO6QqOWlcsQn2jCzY0rNhz8iC/XyeMzBylcKVmaFqEQttn7bQ9mkHbZ+2cBSWwbLlEMxbDsORmae4nhAbAXXzGIhlZjjyiiGWlAMAVEkxiHjsJoRNuQ
6C2p0Bsx7OQOnb62D56YgyAKtvOg1UzaIgRIRdKQEs9W8+ruDTxz+NQmwEwiZdDcEYBseFXNjP58J+IRdimfPn4ipVFAw6aDq1grZba2i6JkPdJhGOvBI4LuXDfjEfjqwCQBAghOkghGkhhOkA1xozQTaPK6WUEFSAVg1N2+bQdG4FVXK8FCiL5VbYzlyG/dRFiBabM6sVYYAQEQZVbARUreKksUV//8idNdOoEffV3+HIKULB3W9KwZ62T1tEzZ8FTceWHq9fNFthz8yDukUchDCtXz8K0WKDWG6BKircr/FEROTEQC1EMFAjIsAZkIm5xbCfy4Ht9CXYjpyD9UgGbEfPQSw2BXt6HjRdkhHxj6lQxUei9PU1MG84UCfPI0QaoOnUEupW8VAlRjv/JERDFWUAwnQQ9FpnwGMMkwK0ihk/0WqDI78U9nM5sGdkw342C/ZzORAiwqDpkgxt19bQdGoJ0WqDZdcJZwZp+1E4sgqhHdAJhqmDoB/VC4LevwAlFDlKypE79jlpnZu6XRIc2YXS3y1VyzjEff0PqBOigjlNIiICA7WQwUCNqOkQy8yw7Pkd9rNZcGQXwp5V6Fx3dTEf9nM5EMuCmImC88O6I6eo0jVGfl+reQw0HVpA3T4JqvhIqCINEK78UUUbnU0+Ypz/D5sdjksFsF/Mh/1SvrNcrt2VDFKL2EpLLcl/5h3HUHDHax6PC+F6xK78G7TdWgdhVkREVFGjbyZCRBRIot0BR04RVLERfrcVF6022H6/CMtPR2HZehiWvWk1D4J0GkClgqAWAJXKXSJ3haDXQp3SDOrURGezjJQEZyAUEQYhwgBBr4Ht94uwHjgD64HTzu5+GjV0g7pCP7wHdEO7Q50YDdHhgCOrEPbzuXBkFTrL6OIioYqLgGixofT1NSj/6mevU9R0aw3jnAnQXdel2uVuqmgjG0zUMf2grjDcMQymT7a6HxQERL3xJwZpREQhhBm1EMGMGlHoEUUR1l3HUf7tPtjTLztL5jLzAKsdQrge4feMRvh9Y6GKNCjP2X8alp+OwnYyE7bfM2E/cxmw+tWPTyJEGqBOSYCmSytou6dA0yMFmq6tFc8VbNaD6Sj+5xew7jkJwFkGafzLROiv7+PRuZFCS8USyIj/mwLjA+OCPCsiIpJjoBYiGKgR1T3R4YDtcAYcBaXOzYG1aggaNYRwPYS4SKhijRA0ajgKy1D+5U6UfbwV9tOXKr2mEBsB459vhH5UL5jX/QLTl7tgP325yrkIkQZo+7S9st4qBurEaKiaOzstqlOaQRVtDNTLrlOiKMK6Nw2w2qG9phMDtAbElp6F0oXfQtuzDQwzhrG0lIgoxDBQCxEM1Ijqjv18Lkxf7kT5yp3OVua+CAKEGKOzs2It9svySq2CpkcK9MOc5YXaPm2dwSIRERGRF1yjRkQNkqOwDCX/WgnbmcvOJhXNopx/ogwQS8vhKHLu1WVLvwzr7pM+260riCLE/BLlY4IA3fAe0A/v4cx2tW4GVWwEyj7agrLFG3w2/tAO7ATdgI7QdGwJdaeW0LRt3qC7BRIREVH9YqBGRAEjiqJzn6gzWXDkFUPbuy3UrZv5d67dAdux87AdzYD1cAZsRzJgP5sN3fAeiHzmNkUpoP1iHgpmvQnbiQvVn6RKgDq5GUS7HbDaIdrszr3BKjT3EGIjYLhtEAx3DIMmJcHjMhGPToJhxnCUvr3WuS+V1Q51agLCplwHw+Rr/H7dRERERN6w9DFEsPSRGip7ViHK3t8Iy7YjsKVnASZZyaBKgH58PxjvHwttrzZezxdtdpSv3o2S17+RGhtUpGoZh+j/3AXddV1gS7uI/D++7rExc1XUbRJhmDoIYVOuhTopVjkHUYRYemVD59xiiDY7tD3b+L2hrz2rEI68Ymg6t+I6HyIiIgoIBmohgoEaNTT2rAKUvfs9yj7ZCpitVY7XDuwEwy3XONvGt24GVfMYmDccQMmrX8F+qvKGHQAAQYBh2mCUf/srxMIy6WHDjOFQt4iFI6cIjtxiOIpNzk2Po9x7dWmv7gBt/w4MooiIiKjBYOkjEflNdDhg3X8G5V/thunzHT4DNFWzKECjguNSgfSY9eeTsP58UjZIABwVvidSq6DpkgxNjxRou6fAkV+C0rfWATY7IIowffqTe6wgIPK5aQifOTKAr5CIiIgoNDBQI6IqWQ+mo3zNHpR/+6vXkkNVsyiE33s9dNd2hrpNIlRR4RDtDpg3HkDZe9/Duv+050XlQZogIGzyQBj/MsljPZh+VC8UPrJEmXXTaRD9+p8QdkO/QL1EIiIiopDC0scQwdJHChbR4QDKrRDC9V6Pl773PUr+tdLrMVVCNMIfGIfw6UMgGLyfL4oirL+egmnFdtjTLsJ+LgeOnCLpuH5cX0Q8NgmaTq18z9FkRvG/voTp4y1QxUUg+q37obu2czVeJREREVHDwkAtRDBQo2Cw/nYGhY++D/uZLET841YY/3S98vjx88i78QXA7nA/qBKgu6Yz9BOvhmHyNRDCdNV+XtFkhv1CHoSocKgTo/0+z1FYBiFMyzb3RERE1Oix9JGoCRJFEablP6L4n58DVjsAoOTFldB2aw3ddV2dYxwOFP/9YylIUyXFwPjQjdCP6wt1QlStnl8w6KHp0KLa56miw2v1vEREREQNhSrYEyCi+uUoNqHwz4tQ/OynUpAGABBFFD7yvlSWaPpsO6z7TkmHo16eifA7h9c6SCMiIiKiqjFQI2pCrIfPIm/SP2Fet1d6TJUQDeicyXVHdiEKH/sA9qxClLz8pTRGf2N/6If38LgeEREREdUNBmpETYDocKB0yUbkTf4X7GeypMd1g7oi7tu5iPz7rdJjlm1HkDf5XxCLnHuVCZEGRM69vd7nTERERNSUcY0aUSPnyClC4RNLYdly2P2gIMA4ZwKMcyZAUKtgmDkSlp3HYd5wwHnOhVxpaMQTN0PdPKZ+J01ERETUxDGjRtSImXccQ+645xVBmqp5DGI/eRQRj06CoHb+ChAEAVGvzIKqVZzifE2vVBhmDK/PKRMRERERGKgRNUqiw4GSt9aiYMYCxZ5lutG9Ef/dXKmzo5wqxojoN+8FrgRvUAmIeulOKZgjIiIiovrD0keiBsxRWAr7uVyoW8dDFW10PlZQisJH34flx0PugToNIv8xFYY/joAgCD6vp+vXATHvzYbps58Qdsu10PZIreuXQEREREReMFAjaoDsWQUoXbgepv9tBSw2AM59zjSdWsF26pJijZk6JQHRCx+AtkeKX9fWj+4N/ejedTJvIiIiIvIPAzWiBsSeVYiyd9ej7JOtgNmqOOa4VADLpQLFY/rRvRH16t3cKJqIiIiogWGgRtRAmL7+GUX/txwwWZQHVALgED0ei/jrZITfPxaCimvMiIiIiBoaBmpEIU60O1DyyiqUvfe94nFVQjSMs8fD8IchcOSXwHYiE7aTF+C4mA/9hKuh69c+SDMmIiIiotpioEZUB0RRhCO7EKqYCAi6mv9n5igqQ+EjSxSNQYTocBgfnoDwGcMghOkAAOoWcVC3i
IN+eI9az52IiIiIgo+BGlGAOApLYdl+DOath2HZegSOywVQd2iB2E8eq9aG0Y6SctiOn4ft6DmULf8R9rSL0jFN51aIXvwQNCkJdfAKiIiIiChUMFAjqgHRbIX1cAZsh886///QWdhOXvBYK2ZPu4iCu99C7Od/hcoY5vt6dgfKFn0P02c/wX422+sY/Zg+iFpwD1QRvq9DRERERI0DAzUiAGKZGdBrq9zcWTRbUfbxVpS+sw5iXolf17YdyUDhQ+8hZsmfIWjUHsftlwtQ+MgSWHef8HkN48M3wvjoJDYGISIiImoiGKhRkyaKIgofXgzz2l8AQYAQa4QqLhKq+Eho2idB26cttL3bQt2uOcrX7EHJa2sUe5RVpGoWCd3Q7tAN6wHr/tMwLdsMALBsOYziZ/6HyJdmKDacNm89jMLHPoCYW6y4jhARBk2XZGi6JiPshv7QXdu5bt4AIiIiIgpJDNSoSbPuTXMGaQAgihDzSmDPK4E97SKsP5+E6X/bnMc0asBmV56s10LbIwWanqnQ9kiFtmcq1B1bSFmvsIlXw3G5AObv9gEATJ9ugxCmhTolAY6cItjOZruf23XJiVcj4omboW7djNkzIiIioiaMgRo1aWZZN8VKyYM0rRqGO4Yh4s83QtUsyucpgkqF6AX3IP9yIaz7TgEAypb+4H2wXovI56bBMG2IIuNGRERERE0TAzVq0uRt740P3wjdiJ7OrNqlfNiOZMB64AxsJzMBuwMAEDZpAIyP3wRNaqJf1xfCdIhZ/BDybvmXzyYh6vZJiH7nfmi7JNf+BRERERFRo8BAjZos+6V82I6dl+6H3XwNNO2TPMaJZWbYfs+EKjEa6hZx1X4eVXwkYj56FMVz/wdHbjFUzaKcf+IjoWmXhLAJ/SGE62v1WoiIiIiocWGgRk2WZeth6ba6dTOo2zX3Ok4I10Pbu22tnkuTkoDYZY/U6hpERERE1HSwWwE1WeYf3YGabkRPrg0jIiIiopDBjBo1aPbMPJi3HYEjpwiOnGKIecVwlJmh698BhjuH+9xkWrTYYNl+VLqvH9GzvqZMRERERFQlBmrUIIk2O8qWbETJa18DFpvHccum31C6eAOMs29A+B3DIIRpFcete9MglpQ77+i10F3TqT6mTURERETkF5Y+UoNjO3EBeVNeRsnLX3oN0lzE3GKUvLACOSP+AdMXOyCKonTMLFufprumMwQDm3kQERERUehgRo0alNJ316Pk1a8Aq3tfM1WzKGiv6QxVfARU8VEQi8pQ9slWwGQBADgu5qPor8vgyCuB8f6xAJT7p+lH9KjX10BEREREVBUGatRglK/f58yiyYTdci0i594OVYxR8Xj4/eNQtvBbZ8B2JetW8vKX0HRqCU3HlrCfzJTG6oZzfRoRERERhRaWPlKDYfpih3RblRSDmA8eRvRrd3sEaQCgTohC5LPT0GzzP6FKjnc+KIoonLMYZR9sco9r1xyaNv5tXk1EREREVF8YqFGD4Cg2wfKTu0tj1H/ugn5kryrPUyfHI2bJn6UNpcVikyJQ0zObRkREREQhiIEaNQjmzQelEkYhxgjdQP+7NGq7JCPqtbu9HtMN5/o0IiIiIgo9DNSoQTCv3yfd1o/pA0FbveWVYeP6wvjoJOWDBh10A9iWn4iIiIhCDwM1CnmiyQzzFnc7/bDxfWt0HePDN0IvO1c/rIfH/mpERERERKGAXR8p5Jm3HpFa7QuRBugGda3RdQSVCtGv3o3i5rFwXM5H5DO3B3KaREREREQBw0CNQp75u1+l2/qRPSHoa54FE8L1iHpuWiCmRURERERUZ1j6SPVGFEWIouj7uMMB25nLEG3uzaxFsxXmzbLNqcf3q9M5EhERERGFAmbUqF7YM/OQP2MBxKJShN81GuH3jIYQppOOW/acRPHcT2E7fh7qds0R8/7D0LRtDsuOYxCLTc5BBh30w7oH6RUQEREREdUfZtSoXpS8+hXspy/BkVOMkvmrkTPyGZhW7YL9cgEKH3sf+bfNh+34eQCA/fRl5E15Gdb9p1H+nazb44ieEAz6YL0EIiIiIqJ6w4wa1Tn75QKUr9mjeMyRmYeixz4AVALg8CyHFPNKkPeHVyGoBemxsHE16/ZIRERERNTQMKNGda5s2WbA6lx3JkQaAK3afVAWpGm6pyDiyVvcx8stEEvNzts6DXQje9XXlImIiIiIgooZNapTjpJymD7ZKt03zpkA/dirUPLvVTCv2wsAEKLCEfHXm2GYPgyCWgVtnzYouP+/7rVpAHRDu0MVEVbv8yciIiIiCgYGalSnyj/fDrGoDIAzm2aYNgSqSANi3rkf1gfHw3b0HPSjekEVHymdo7uuK2I/fxIFs96A43IBAMBw88BgTJ+IiIiIKCgYqFGdEW12lL6/SbrvCtJctD1SoO2R4vVcbddkxH39d5Qt2Qh1yzjob+xf5/MlIiIiIgoVDNSozpjX74PjQq7zjkaN8LtGVet8dVIsIp++rQ5mRkREREQU2thMhOqEKIooXbxBuh82oT/ULeOCOCMiIiIiooaDGTUKOHtWIczf/Qrbb+nSY+F/uj54EyIiIiIiamAYqFFA2M5cRtnSH2DZeRz2tIuKY7pBXX2uRSMiIiIiIk8M1KjWRKsN+XcugON8rtfj4bPH1/OMiIiIiIgaNgZqVGvm7/d7BGmqFrHQXdsZYROuhn5Q1yDNjIiIiIioYWKgRrVWtnyLdFs3sicin50GdUoCBEEI3qSIiIiIiBowBmpUK9bj52Hdc1K6b/zzjdCkJgZxRkREREREDR/b81OtmD7aIt3W9EiB9qp2QZsLEREREVFjwUCNasxRVIby1bul++F/HMFyRyIiIiKiAGCgRjVWvmoXxDIzAECIDkfYxKuDPCMiIiIiosYhZAI1k8mEl19+GX379kVERAQiIyPRq1cvzJs3DwUFBbW6dmlpKebPn4+BAwciOjoaer0ebdq0waxZs/Dbb78F5gU0MaIoouyjLdJ9w22DIRj0QZsPEREREVFjIoiiKAZ7EpmZmRgzZgyOHj3q9XibNm2wbt06dOvWrdrXTk9Px7hx43DixAmvx9VqNV5//XX8+c9/rva1A+nIkSPo0aOHdP/w4cPo3r17EGdUOfOOYyi44zXnHUFA/JZ/sokIEREREVGABD2jZrPZcNNNN+Ho0aMQBAEPPvggNm/ejC1btmDOnDlQqVRIT0/HpEmTUFRUVK1rW61WTJw4UQrSJkyYgJUrV2LLli3497//jejoaNjtdsyZMwdr1qypi5fXaJmW/yjd1g3rziCNiIiIiCiAgt6ef/Hixdi7dy8A4I033sDDDz8sHRs2bBiuvfZa/OEPf8CpU6fw6quv4vnnn/f72v/73/9w+PBhAMCDDz6IhQsXKq49ceJE9OvXDyaTCX/9618xadKkAL2qxs2eVQDzxgPS/fA/jgjeZIiIiIiIGqGgZ9TefPNNAEDXrl3x0EMPeRyfNm0aJkyYAAB46623YLPZ/L72+vXrAQCCIOCll17yON61a1fcddddAICTJ08iLS2t2vNvimxHzwMO
Z8WsKiEaumE9qjiDiIiIiIiqI6iB2vHjx3H8+HEAwPTp06FSeZ/OzJkzAQD5+fn48ccfvY7xJisrCwAQHx+PmJgYr2O6du0q3b506ZLf127KHIWl0m11qzgI6qDH+0REREREjUpQP2Hv3LlTuj106FCf4wYPHizd3rp1q9/Xb9GiBQAgJycHeXl5XsecOnVKut2yZUu/r92UiQXuQE2IjQjiTIiIiIiIGqegBmrHjh2Tbnfo0MHnuKSkJBgMBo9zquIqmQSAZ5991uP42bNn8cEHHwAA+vTpg3bt2vl97abMkV8i3VbFGIM4EyIiIiKiximogVpmZiYAQKPRSNkvX1zZLtc5/pg6dSpuvPFGAMDbb7+NyZMnY/Xq1fjpp5/wxhtvoH///igqKkJUVBQWLVpUw1fR9DgKyqTbQjQDNSIiIiKiQAtq18f8/HwAgNFohCAIlY41Gp0BQWFhod/XV6vVWLVqFV555RXMnz8fX331Fb766ivFmLFjx+KNN95A586dqzf5SmRlZSE7O7ta5zSkRiaibI2aKiY8iDMhIiIiImqcghqomc1mAEBYWFiVY12lj65z/HXixAns378fpaWlXo///PPPWLFiBf7+979DownM27Fw4cJqbSPQ0DgK5IEaM2pERERERIEW1NJHV5fHqrJpACCKouIcf2zfvh2DBw/GqlWrkJSUhI8++gi5ubkwmUzYs2cPpkyZgoKCAjz77LO44447YLfba/ZCmhhFMxEGakREREREARfUQC0iwtkxsLy8vMqxrjF6vd6va5eVlWHq1KkoKipC8+bNsXv3bsyYMQNxcXEICwvD1VdfjZUrV+LRRx8FAHz++ed45513avhKmhZHgbyZCLs+EhEREREFWlBLHyMjIwE4g6qquEoXY2Nj/br2V199Je2LNm/ePCQnJ3sd9/LLL+Pzzz/HhQsX8M4772DOnDl+Xb8ys2fPxtSpU6t1TlpaGm6++eZaP3d9kDcTYekjEREREVHgBTVQS01NBQBYLBZkZ2cjISHB51hXt0d/9zr75ZdfpNuuzo/e6HQ6jB8/HkuWLMHJkydRUFDgc3NsfyUmJiIxMbFW1whVosOhaCbC0kciIiIiosALaulj165dpdvyjacrunjxIkwmk8c5lZE3D6kq8JIHVUVFRX5dv6kSi8sBhyjdZ0aNiIiIiCjwghqoDRgwQLq9Y8cOn+O2b98u3b7uuuv8unazZs2k22fOnKl07Pnz5wE4m5rIzyNPDlk2DYIAIdIQvMkQERERETVSQQ3UOnTogJ49ewIAli9fLnV2rOjDDz8E4FyfNmzYML+uPXjwYOn2xx9/7HNcSUkJvv32WwDAVVddhfBw7gtWGTHf3UhEiA6HoA7qXyEiIiIiokYp6J+yH3zwQQDAwYMHMX/+fI/jK1aswLp16wAA9913n99dH8eMGYP27dsDABYsWIBt27Z5jLHZbPjTn/6EnJwcAMBDDz1Uo9fQlHAPNSIiIiKiuhf0QO2+++7DVVddBQD429/+hpkzZ2LTpk3Ytm0bHn30Udxxxx0AgDZt2uCpp55SnJueng5BECAIAoYPH644ptVqsXjxYmg0GlgsFowePRqzZ8/Gd999hz179uCjjz7CddddhxUrVgAArr/+esycObPuX3ADJw/UhGgGakREREREdSGoXR8BQK1WY+3atRg1ahSOHz+O5cuXY/ny5YoxrVq1wtq1axEdHV2ta48YMQJffvkl7rzzThQVFeG///0v/vvf/3qMmzhxIj755BOo1epavZamQCxka34iIiIioroW9Iwa4Gy5v3//fvz73/9G3759ERkZCb1ej65du+Kpp57CwYMH0b179xpde9KkSfj999/x9NNPo2/fvoiKioJOp0OrVq0wZcoUfPPNN1izZo20pxtVTln6yPV8RERERER1QRB9dfCgenXkyBH06NFDun/48OEaB6d1qfj5z1C29AcAgGHmCEQ9Pz3IMyIiIiIianxCIqNGDYe8Pb8qJiKIMyEiIiIiarwYqFG1sOsjEREREVHdY6BG1SIWuJuJCAzUiIiIiIjqBAM1qhZm1IiIiIiI6h4DNaoWR0GJdFuIZtdHIiIiIqK6wECN/CY6HBDlGbVYNhMhIiIiIqoLDNTIb2JJOeBw7+bA0kciIiIiorrBQI38Jspa8wOAEMXSRyIiIiKiusBAjfzmkHd8jAqHoOZfHyIiIiKiusBP2uQ3dnwkIqLaOr//AnJO5UIUxaoHExE1YZpgT4AaDjFf1vExhmWPRERUfVve2IHcM3kwNgvH9X8fiZR+ycGeEhFRSGJGjfzmKJRn1NjxkYiIqqc4qwS5Z/IAAKU5ZYhqHhnkGRERhS4GauQ3lj4SEVFtnP3lnHQ7ulUUYpKjgzgbIqLQxkCN/CbfQ01goEZE1CTYrfaAXevsz+5ALXVA64Bdl4ioMWKgRn5jRo2IqOkwl5jx6X1f4r83fIDjG0/W+noOmwMZv56X7qcOSKn1NYmIGjMGauQ3eaAmRDNQIyJqSOxWO/IzCiA6/Ou2uH/lIWSdyIbdYsfe/x2o9fNfOnYZlhILAECtVSG5T8taX5OIqDFj10fym6jIqLHrIxFRQ+GwO7D6ibW4cOAiOgxrhxvnXV/peGu5Fb+tOizdLzhfCIfdAVUt9s88u8edTWvZqwV04doaX4uIqClgRo38pih9jGXXRyKihiJ9dwYuHLgIAEjbehrmEnOl449+ewLlheXSfbvFjpKskkrOqNrZPRnSba5PIyKqGjNq5DexkKWPRER1zWF34LdVh1FwoRAdh7dHq94tIAhCra554MtDivsl2aXQR+i9P7/NgX0rfvN4PP98IaJaRFX6PJmHL2HrmzsAAOPnjpa6OpYVmHD5RLY0juvTiIiqxkCN/CKKIhwFZdJ9NhMhIqobPy/biz3L9wEADq4+gsTOCeg3rTc6DG0HlcZ3IYwoihAdokd5Yu6ZPJz79YLisZLsUsS3jfN6nd+3nkbRpWKPx/MzCpB6tfdMmCiK2LfiN+xY9DNEu3MN3IZ/bcbUt26GoBKQ8ct54MrSuIgEI+Lbxvp8HURE5MTSR/KLWFIO2NwtmhmoEREF3ukd6VKQ5pJ1IhvfPb8JH97xKc4fyPR63uUT2Vh004dY9of/Ifv3HMWxA7K1Zi4l2aUejwHOgOvX/+33eqzgXKHXx8uLyvHN39dj+393S0EaAFw8fBlHvzsBADi7x92WP+Xq1rXOEBIRNQUM1Mgv8kYiACBEs5kIEYW23Ut/wYoHVyP954yqB4eAgguF+P6lzdL9is02ii4V45unvoO52HN92ba3dqC8sBzFl0uwbu4GaQ1aeVE5jn/v2Vq/JNv7erOMveeRnZYr3e92Qxfpdv75Ao/x+RkF+N+fVuLMzrPuB2Ux2Pb3dsNUYFJsdN2G69OIiPzCQI384pCvT4s0QNCogzgbIqLK5abn4edlv+LS0cv49tkNKLpYFOwpVcpabsW6uRuk9vVagwa3//cWTF9yK7qM6SiVM1rKrDi89pji3KyT2cg8dEm6X5hZhM2vboMoijiy7jhsZpv
H8/nKqP366QHpdurA1ugwrK10P/9cgcf4rW/tQPFld9DX5poUTF9yK7QG58qK8sJyrH36e5jyTQAAQSWgdf/kyt4KIiK6gmvUyC/c7JqIGpLc03nSbavJhh/+sw03/+fGoJfcWcosyDx4CZePZ0FQqxAWpYchKgxp284gR5bJGv3kcMS1ca7jGvv0KEQ2j8AvHztLEg98eQh9pvaE+soXZge+9CxtPLn5FFr1aYnfVruPhccZUJbnDJiKvWTULp/IVqxl6/+HPohIcHf4Lb5cApvZBo3e+dHBYXPgwm8XpePX3NUfA/7YD4JKwMBZ/bH9v7sBQBFEJnVLRFik9yYmRESkxECN/CIvfRQYqBFRiKvYDCNj73kc/e4EustK+epL4cUiHP7mGM7vv4DLJ7IV67i86TO1JzqN7KB4rNfkHvj1s9/gsDlQkl2KtC2n0Xl0R5Tlm3Dyh9+lceFx4SjLczZ++nHBT1IDD5Vahf53XIVtb+0EAJTmlKGiExvd12neNRGt+rSEaBeh0qjgsDkAESi4UIRm7ZxNSPLO5kvZOpVahX5/6ANB5QyE+9zaE8fWn0DumXzFc7DbIxGR/1j6SH5x5DOjRkQNR+FFz66FP72zE6W53kv+6oq13IoV96/C3k/249LRrCqDtJY9kzD4gWs8Ho9oZkTnUe7gbd/nByGKIg6vPQq71QEAMMaH4/aFN8MQHeYcJHuqDsPaIqlLonTf2xo1eWljx+HtIAgCVBoVolu6W/IXyNapydvtx7eNlTJtAKDWqDHi0SEez8H904iI/MdAjfzi4B5qRNSAeFuTZi6x4MfXfoIo+g6W7FY7bBa7z+PVlXUyBybZxtEAoI/Qoe11qeg4oj1a92uFhI7NEJkUgdQBrXHD89dLJY0VXTW1l/u6J7Jxfn8mDn51RHqs503dEdUiCtf/faTHub2n9EREoruMsbzIDGu5VTGmMNP9nsW0ipZux7aOkW7nZxQo5uCS2DnB4zlb9W6JrmM7SfcN0WFo7mUcERF5x9JH8ovINWpE1IAUyTJqba9LlboSntqejrQtp9FxRHuPc4qzSvD57NUwFZZj/NzRaD+krceY6io4725pH5sSg3HPjEKz9vEee535I6FjMyT3bYXz+5zryNa/8INU5qjWqtBjYlcAzoYe/ab1xq+fOTetTuycgBbdm8Nhd0BQCRAdzkC1JKcUsckxAADRISrKRaNkWbTY1u6gLV/Wov9yFYEaAAx+8Fpkp+Ui51QuBs7qL5VGEhFR1RiokV8UzURiGagRUegSHSKKL7uDjuv+NADWMqu0B9mWN7ej7aA20OiUmauTm9OkbogbXv4RMzonIFKWhQKc+4zlpedDUAkwxBgQFqmvNPiQB2qJnROQ2Kl2GaW+t/WSAjVXkAYAnUZ2gDHOvW3KtfcOgFqnRt7ZAgy6fyAEQYBao0Z4nEFan1aa7Q7USnNLYZdlEqNbREq3Y2QZNVfpo91mR84pd/MTX5my8FgDbn/3FjhsdujCdTV70URETRQDNfKLopkISx+JKISV5JRK67YAIKpFFEY9OQwfz1wBu9WBsjwTck7lIKlrc8V58tI/S4kFP8zfipteuUHqFGm32vHdcxtxanu6NE5QCQiL0iP5qlYY83/DoQ1T7n0mD9RikqNRW20GpiA2JUZRggg4Sxvl1Bo1rr1ngMf5Ec2MUqAmb9FfmOkObA0xYYqgSp5Rc216nXcmXwrsVBoV4tvF+5yzRqcGdNzShYiourhGjfzC9vxE1FDIyx6dQYcWMa2iFZmh4kuezTTk+4EBwNk953Dk2+MAnK3o17/wgyJIA5zZO1NBOX7/8RSOfnfC45oFF2SBWqvaB2qCSlCsVQOAFj2a+732S9FuXx6oydb0yZuHAMqMmqmwHOVF5Yqyx2bt4jyyk0REVHsM1MgvDNSIKJQ4bA5cPHwJRZc9uzvKG4lEtXAHHVFJ7nK+iu37ASjKJV1+ensnCi8WYcO/NiNt62n3AS/VjvLmGoCzTLLwQmAzagDQ5fqOCHN1dgTQ+5aelYxWikhw//4uVWTUvL9ngLN8UWd0Z9jyzxVW2UiEiIhqj6WP5Bfuo0ZEdaXocjF+33wKANBrcneP8kG5snwTDq89hkNrjqIkqwRqrQp3LL1N0ZlQkR2SrbWKai4L1CoEZaIookiWZRPUAkS7CEuZFZ/+aSXMJRbpWJtrUnDjC2NhN9tweN0xaWPn3DN5imuW5pbBarJJ92MDFKhpw7QYP3c0flq4C616t0AnL41RfJEHavIW/UWZvjNqgiAgJjlaCs4KzhUoG4nUct1dXRNFEQ6Ho+qBRERXqFQqqew9mBioUZVEUVS052dGjYiqw2FzIOv3bAACwmMNMMSEQaVR4czODBxeexRn95yT9vyyW+0Y8Md+HtcovFiE3e//gt+3nFKsP7NbHUjbdgZX33GV9Ji89FGRUWvhO6NmLrHAanK3qx98/zX4aeEu6ZhL636tcOO866HRqaHRqdGqdwvpWG56PkSHKDUXka9PM0SHQR+pr/yNqoaU/sm444Op1T5PHqgV+8ioRVfIqAHOdWquQC3nTJ6ykUiX0ArURFGE2WxGUVERiouLYbFYqj6JiKgCg8GA2NhYREZGQqUKThEiAzWqklhmBqzubmAM1IioOtY+873UHt9FpVbBYffMcpw/kOkRqImiiK+f/NajgYZL9u85ivvyza7l5Y6RzWXrsyqsUSuWBW4avQZX3dYLF367iNM70qXHW/ZMwsQXxyk2do5LjZNu28ptKLpcLAU6gW4kEgjyNWolvtaotfIWqMVIt09tOwOHzfmzU2tViG8b5zE+WERRRHZ2NnJzc6seTERUCZPJBJPJBL1ej9TUVKjV9b8Wl2vUqEpifqnivhAd7mMkEZFScVaJR5AGwGuQBgC5Z/I9HivNLVMEacb4cLQb3Ea6n52mDNSUa9RkpY+yoK34crFi4+viLHfgFtk8AoIgYOTjQ6UMVFK3REz69w3QGpRlmbpwreK6uafd5Y+hGai5v2gryy+D3WaH1WRFWZ5Jerxi6SOgbCgiz77Ft4uHWhsajUQYpBFRXTCbzcjMzFT8m1FfmFGjKskbiQgRYRC0/GtD1FBYyizQGrRBq7W/ePiS+44AqcQRcGZj2g9ph/ZD2uC75zcBcO4NZiowwRBjkMbJg5/wuHDc9fkdKLpUjNNXOjAWnC+EpcwKXbgWNosdJTnu31nyMr5IWUBlKbPCXGxGWJSzKYe8FNKVeTPGh2P6kluReyYPLXu2gErj/bvN+LZx0vm5Z/LQblAbaV4uIROoNZNVRIjOINhS6i75VGtVMMZ7fhknb9Ev52+3yfpgNpsVQZpOp0NsbCyMRiM0Gv67RUT+EUURFosFeXl5KC52/m4vKSlBcXExoqI8v8iqS/zNRVWSr0/jHmpEoU8URZz6KR27P9iD3DP56DquM8b83/CgBGuZh9yBWpfRHTH6/4ajvLAcpkIzIhON0EfoIYoiflzwE8qLzACca72S+7gDNfl6qIQO8VBr1IhpGQ2tQetcVyYCOadz0bJHEoqzit3BoK
AsdzREh0Gj18Bmdjb4KLpUIgVq8tb88k2uDTEGJF/VqtLXGN8uDmd2ObOG8oYioRioafQahEWHobywHICz/NF1G3AGsyq1Z0Aac2Vj7IoSQ2h9WlGRO9On0+mQmprKAI2IakSj0cBgMODcuXMoLXV+Di4tLa33QI2lj1Qlka35iRqMc/suYMWDq7Hume+lMsJj60/g9x9PBWU+F49clm636JkEtUYNY7wRzdrFQR/hbK4hCIJinVPF7onyQK1ZO+c4QSWgWQf3JsuudWryRiKRiRGKsjxBEHy26Jfflo/xR3zbWNncne+56BCVe6j5CHSCQdH5MaukykYigLPE09jMM9PWPIQ6Prq++QaA2NhYBmlEVCuCICA62v0lmytgq08M1KhKij3UYhmoEYUim8WOb/6+Hqse/QaXj2V5HP9p4S5FV0NvRFHErvd/wVdPrkPWyZxKx/rDUmZVrB9r0SPJ59i4NrJg53SFQE12v1l7d3CW2KGZdNtboCZfn+aiaCgia9GvXKNW3UDNHWTmZ+TDYXOgJKcUdou7CVNMcv1+C1sZRaCWU6oM1LysT3OJrRBsqnVqxMmC1GBylSq5GI38t4qIas9gcFd3WK3Wel+nxkCNqiQWyPYWYukjUUg6te20okMh4NzvS611/povyS7F3v/tr/Qa5/dnYs/yX3H253PY9s4On+NMheVS17/KXD6eBdHu/EdNZ9Qhvo3vD/WKjFq6u6GI3WZH3ln3/XhZoJbQURaopTmzbvLuhVFJnkGHr4yaPGiLSopAdcS2joGgdpaV2q0OFFwoVJQ9hseFQxeu83V6vavY+VHR8bGSQC0mJUZx31WGGgoq7pPGbBoRBULFTo/1vScjAzWqkmVPmnRb3TJ02jATkdvZX85Lt5u1j8dt79yMm/59A666rbf0+K+f/abInlQkL1PMOpHt9ZvDw2uPYfHNH2L5nZ/BUlZ5hk7eSCSpW6LXtU/SnNspSx9dz52fUSgFhSq1CnGyYCGhoztoyz2dC7vNrsioRXvLqMk7P14J1Gxmm6LroXyNmj80eg1iWrnLY3LP5IXk+jQXeUORkuxSFGXK3rNKM2rK1xHqG10TETV0DNSoUo7CMlh2HpPu60f2CuJsiJoG0SHip4W7sGTKchxYeajq8aKIc/suSPf7/aGPVGZ49Yy+0toiu8UubeLsjXwtmNVkU+yz5bL/84MQHSIKM4twZmd6pfPKlAVqLSspewSAuDbuQM1cbEZpbpnHnGJTYhRrzuLaxEnBn93qQP7Zggqt+b1k1GSlj0VXGogUZ8uqBlQCjAnVrxyouMYupAO1imvU5O9ZZRk1WYt+AEgMoY6P5HT48OGgPG96ejoEQYAgCHj66aeDMofKuOY2Y8aMYE+FqFoYqFGlzD/8Jm12LcRHQjugY5BnRNT47V62F/tW/IbSnDJsW7gT5cXmSscXnC9EiWyNVet+7i6FunAtBj9wrXT/1E9ncPaXc16vIw+KAChKDgHnOrj88wXSffnG0hWJDhGX5I1EqgjUDNFhirbwroYiOadljUTaKzP6Gp1asbYt+/cc5WbXXjJq3kof5ZtfG5sZa1TOV7GhSEMJ1LLTchVr6bxlIV1iU5SvI5Ra8zd1RUVFePDBBzFp0qRgT4WIAoiBGlXK/N0+6XbY9X0gVFK6RES1l7btNPZ8+Kt0X7SLOPfr+UrOADL2uo/Ht4uDMU7Zna/z6A5o0dMdKG17ayfsNrtijM1sUwQXAJB/tkBxv+BcgbTmDFCu66oo72w+zCXO5g6CSkBSt8RKXwNQoaGIK1CTd3yUrU9zkZc/Xjh4UdFqvqrSR3OxGeZSC4oue+6hVl0NKaMWKVuj5tqqAAAMMWGVrqWLbhGF2Culp7EpMYhLDY1GIgQ89thjePfdd+t9/QwR1S1+6iafHCXlMG91l1Hox/cL4myIGr+c03nY8NJmj8fP7vGeAXORlz3Ks2kugiBg+JxBzg2n4QyiMn5RBn+5Z/IgOpRr0vIy8j3GyMn3HqtIvn9as3ZxfjXTiFesU3M+t7wDZLN23gI1d0MReTMVtc65DUBFxrhwqcEK4FynJs+oRQUgUCs4X4jCzNAN1HyVdla2Pg1wrhG85bUJGPXXYZj86gSfG4BT/bPZbFUPqkNt2rSBKIoQRRH//Oc/gzoXosaEv2XJJ8uPhwCL85e/EB0O3bWdgzwjosarvKgca/+xHlaT5weus7+c99kS2GF34LwsUEvpl+x1XGKnBLS7ro10/8LBi4rj2RXKHgHPjJq8GyNQeaB28Yg7UKuq7NGlYlaqvKhcsU5OHsi5JMha9JsKZBs3N4+AoPLc4FtQCYhIVK5TK86SZ9Sq15rfJaZVtBQAig4RdqtDdix0WvMDgN6ogy5c6/G4tzV9FUUkRKDHhK7VbrhCRETVx0CNfCpf7y6/0o/uDUHLdsdEdcFhd+C75ze5OzIKwOgnh0vHS7JKPIIml6yTOVKJoUqtQqveLXw+j/zYRVnGCwByvQRqeRnK58yrkFErulzsM4CUZ9T8DdSayQK1vPQ8qeU+AOgj9Yq1VS4JHTyzbIDvjZsB5Tq14ssVMmrV3OzaRaVRIdZLKWBEghHaMM+gKNjkLfpdqsqoERFR/WKgRl6J5RZYfnSXPYax7JGozpzcfEqxzuy6Pw1A9xu7KIIQXw1A5GWPzbslVlpiKA+YLp/Ihk3WRMJbRq0srwxmWSOTihk1W7lNsSZMOi/fhMIL7k6CLXs09zknOfkaNavJhjM7z0r3m7WPgyB4Zsj0Efoqm4ZUdqzoUrFyjVotMkXyjKCLvG1/KPEW9DJQa3iee+45CIKADz/8EABw9uxZqcPhc889B8Dd8XDZsmX49ddfMWjQIBgMBsTHx2PEiBE4d879u8XhcODzzz/HH/7wB7Rr1w6RkZHQ6/VISkrCmDFjsHDhQpjNns2NKuv6OGvWLGf59fDhAID9+/dj5syZSElJka596623YuvWrXXzJvlp/fr1mDp1Klq3bg29Xo/4+HgMGjQI8+fPR2mpZwdcuR07duCPf/wj2rZtC71ej+joaHTr1g2zZ8/GwYMHfZ5nsVjw3nvvYdSoUYiPj4dOp0NiYiKGDh2KV155BUVFvrdToaaBgRp5Zd56BGKZ85exYNRDN7hbkGdE1Hgd/PqIdLvd4Dbof8dVAICUAa2lx32tU5M3Gknxsj5NLrFTM6h1zo6Gdosd2SezATjb++ecyvN6jiurZjPbvO7BVuSl/FG+f5qxWbiigUdldOE6RMo2mz75o3sPR2/r01zk69RcvAVvLvLyxsLMIkV5ZWQ1N7uW87ahd6itT3MxNmOg1tQcP34cw4cPx86dO1FeXo68vDycOnUKrVo5f29cunQJAwYMwO23347PPvsMZ86cQUlJCSwWCy5fvoxNmzbhoYcewjXXXIOCgoIazWHJkiUYOHAgli9fjnPnzknX/vLLLzF8+HC89NJLAXzF/ikuLsbNN9+M8ePHY+XKlTh//jwsFgvy8vKwc+dOPPnkk+jcuTN+/fVXr+c/99xzGDx4MD766COkp6fDYrGgqKgIx
m0bqougT951Psq5ajhIahi41HF98QXYNGfsdi7NmXvMBAbbsBDAaMg67BfNNt6Bs00toZWrbG828HVVWxr11J/sQvXfvfBQQS9Np76Js0pywUvQHTdaO1dW3gCr6sv/yIZ81bpW49FIPrI4AuNg42uc7bVi1H36gJ+rbtteymqqrY5s0i7+tP3K+n6PPWicJy70MYr7yKUEXB+uuP5E/6ChwO1PQ0rPNmYR51K46jh1EWzmGE0wlzwdasIeZhI8v02oQQoraRQE1UC9WpMuOJ2ZzYfpI+D/Sgy80dq+R5c1M9AqvIwDL3C/HIqOWczcVudWDLteL0KNMdHBlEWP1QTmw56dNfMmpCVJw+Np7gT/+vQn1N/Qeh6A04T5/ENPyGMvVRLBb08Q19zhtatcXQqmxT9ZSAAAKfepmC6T+jb9EK8w23oqtbr/R+ioKxRx8M3XrhPHgAJTLKa7pjWZhH3Yqi02FdMt83O3eO3qOIiGdA6tiykZwtG8FoRN+6HYZOl2HfvgXH5g3FPp/xikFYHvk3utAw9xhuuQNnRjrWaT8BUDDtJ0zDR5E/8Uv3msS4BpiGDC/XaxNCiNpEAjVRLU7tOs3xc/vrbJyytcoCtRyPTWUD65QjUKsb7Nqq6NysoeyUbGx57vLUBrMBU7CJsCJrOAp5VoAUQlQtY58rqud5e/fH2Lt/hfoqOh36ZhXba0wxGl1r5m66DWfyKWwrl2Fb9BeOvbu0NnqPgNPQvTdM/ApsHjk3mw3Hts04tm32vrklAF10XdDpUELDMV13A6YrBvsdh3nUrVhn/Qo2G+qZFPInfIh9xVL39bse0LJ6QgghfMnfkKJaZHlMD8xNy8Vhd6A36C/483pl1MoRqOmNeoIiA7Wpjlmns3FY3etOgqICURSF0Pp+AjWl+MIlQghxIenqxmAeeTPmkTfjOHII29KFoKqYbxyrtdE3TCDkuynYVizBvnkD9h1bfdbiARh69SPg0ae8NlQv8bmjojEOHopt3iwArHNnup+zeSuMfQec56sTQohLmwRqolrkemS2UCEvLY/g6ODiO1SSHI9ALTAyoFx9Q+qGeARqWTid7r2FgiJda2z8ZdTMQSavza+FEKI66Bs1Rj/uPr/XdPVjMd84FvONY1Htdhz7dmPfshH7lg2QnY1p9BiMVw4u9wbV5ptuc2154HR6nbfc+9AF2fRdCCEuJRKoiWrhGTC5HldNoOYZIJYnowYQGhPMqV2nAcg8ne31gSU4ynUvf4GaTHsUQtQkisGAoU17DG3aw5g7z+te+rgGGPsNcGXyzjF0uRxDl27nOUohhLj0yddZolp4bhTt7/GFYMu3ee2jVp5iIlCk8uOpLK891IKiXBm1gHALxgDv7z+kkIgQojYz33y712PLPQ9X00iEEKJmkUBNVIvcIhm1oo8vzHN6TLdUICCsnFMfPSo/ZiVnewWXwecCNX/r1KQ0vxCiNtM3a4nlgUfRNWqM5eEn/e5/J4QQwpdMfRTVoujUx6oI1DyfMyA8AJ2hfN9TeGbUMk9lYfbYMNszOxcWG8rZg6naY8moCSFqO/PoMZhHj6nuYQghRI0igZqoFkUDs6qY+ljRio+FQj0yatnJ2djy7O77Rbk37A0rklGTQE0IIYQQQpSXTH0UVc5pd5Kblud1rmiG7ULwDAYD65Rv2iNASIw7o+awOb0DvyjvjJonmfoohBBCCCHKSwI1UeXyMvK0jaMLVfUatYpk1MxBJkzB/vdDC6rjzqiFxoZ4XZNATQghhBBClJcEaqLK+ZvmWNVTHwMrEKgBhNYL8TlnCjJhCjRqj32mPoZKoCaEEEIIIcpHAjVR5fxNc8xNzUNVVT+tL8zzVmTqI3hXfixUtMx/aEwIeOwJKxk1IYQQQghRXhKoiSrnL3tmL7BjzbVd0Oc932Ii4D+jFhwd5PXYYDZ4tavocwkhhBBCiNpLAjVR5Ypbj5Z7gac/5nisUavo1Ef/GbUgn3Pd7+qKKchEk94JxLStV6HnEkIIIYQQtZeU5xdVrrj1aDmpuUQ0DL8gz6mqaqVk1PwGalG+92ozpCWtBjdHp5fvQoQQQgghRPnJp0hR5TyrL3qfv3AZtYKsApx2p/Y4MLKCUx9j/Ex99JNRAyRIE0IIIYQQFSafJEWVyzmbU8z5CxeoeRYS0Rt1mIsps1+aED9r1Pxl1IQQQgghhDgfEqiJKueZUQuu655KeCE3vc4tsj5NUZQSWhcvMCIAvdH71yYoyn9GTQghhBBCiIqSQE1UKVVVvTJndZtHaccXspiIZxavooVEABSd4hVcAgRXcBqlEEIIIYQQxZFATVQpa64Ne4FdexzVLFI7royMWubpLNZOhoGUrgAAJ49JREFU2kDSjlNe5z0zaudbLr/oOrXAYtaoCSGEEEIIUVESqIkq5Zk1U/QKUY3ruK8VU2SkrJx2J7/9ay5rJm5g5j/nkJvuvl9lbHZdKMQjo2YJs2Aw6c/rfkIIIYQQQhQlgZqoUl4BU0QAQR6bRRdXZKSs9i1JJO1oOgD2fDsnt7uzap4BYtB5TlUM8cioBcv6NCGEEEIIcQFIoCaqlNdeZpFBXtMQ8zLycdgdFbqv6lTZ8OMmr3PJ+89ox94ZtfML1KKbuqdr1kmIOK97CSGEEEII4Y9seC2qlGchkcA6Ad7TEFXIS8+vUJbq4MrDnD2U5nUuxSNQq8w1ak16J9D5pg5kJmXS4+6u53UvIYQQQggh/JFATVQpz8xWUJ1AjBYjpiAT1hyr6/rZ3HIHaqqqsv6HTT7nvQO1ysuoKTqFfg/3Oq97CCGEEEIIURKZ+iiqlL+1Yp5ZtdwKVH48uuE4p/ek+JzPTskhLz0Ph91BXka++3nPs5iIEEIIIYQQF5oEaqJK+Vsr5lncoyIl+j2zaQ0vjycg3KI9Tt5/hrw072qS55tRE0IIIYQQ4kKTQE1UqRw/GTXPNWPl3fQ6aftJTmw5qT3udnsXoj020U7Zf4Ycj/VppmATBrPM+BVCCCGEEBc3CdRElcotskYNvDNc5c2orf9hs3Yc2z6GuI6xRQK1s36fUwghhBBCiIuZBGqiyjjsDvLS3WvFAiN9pz6WZ9PrtOPpHF5zVHt8+e1dAKjrGagdOFOpm10LIYQQQghRFSRQE1Wm6Foxvxm1cmx6fXi1O0irkxBBo24NAIhu5g7U0o6lk3Ei0+c5hRBCCCGEuJhJoCaqjOf6NM+1Yl5r1MqRUTu81h2oNe7VCEVRAAiPD8MYcG4dmuqqCllICokIIYQQQoiaQAI1UWVyitl0OtCz6uPZXFRVLfVetnwbJ7a6i4gkdG+oHSs6haimkdrj5L3u0v0SqAkhhBBCiJpAAjVRZXI9pjV6Bmqex/YCO9ZcW6n3Or4lCYfVAYApyET9dvW8rnsWFPEke6gJIYQQQoiaQAI1UWU8M2qema2AMAuKXtEel2XT6yNrjmnHDS6LQ2/Qe10vLlCTjJoQQgghhKgJLppALS8vj3feeYcuXboQHBxMSEgIHTp04LXXXiM9Pf287z9lyhQGDhxInTp1sFgsNG3alPHjx7Nv37
7zH7woE89CIZ6VHhWdQmBEgEe70gO1w+vc69M8pz0WqltsRk0CNSGEEEIIcfG7KHb+TUpKYvDgwezatcvr/Pbt29m+fTsTJ05k7ty5tGnTptz3ttlsjBkzhmnTpnmdP3jwIF9++SWTJ0/m+++/Z/To0ef1GkTpcr0yat5TEIPqBJJzJvdcu5IDtbTj3pUcC6s9eqqTUAedXofT4fQ6Lxk1IYQQQghRE1R7Rs1utzNixAh27dqFoiiMHz+exYsXs3TpUh599FF0Oh2HDx/muuuuIzMzs/QbFvHvf/9bC9IGDx7MrFmzWLVqFe+//z7h4eHk5uZy2223sXHjxsp+abWKvcDO4TVHycvIL7aN18bTkUFe17wKipQSqHlOe4xsXIeQusE+bQwmPXUSIrzOKTqFgHBLifcWQgghhBDiYlDtGbVvvvmGDRs2APDxxx/zyCOPaNf69+9Pz549ufXWW0lMTOSDDz7g1VdfLfO9d+zYwaeffgrAqFGj+PXXX7US7j179uTaa6+lZ8+epKen8+STT7Js2bJKfGW1hzXXxrRHfiPlwFlC6gVzy9ejCAz3LdrhOaXRX0bNXzt/PMvyJ3T3zaYVim4eyZnEs9rjgPAAdPpq/25CCCGEEEKIUlX7p9ZPPvkEgNatW/Pwww/7XL/lllsYNmwYAJ9++il2u73M9/7ss89wOp0YjUY+/PBDLUgr1KpVK1544QUAli9fLlm1ClBVlYX/WUrKAVdAlHU6m4XvLPUpsa+qqlemzCej5rWXWvGBmr3AzvEtSdrjRn7WpxUqWlBEKj4KIYQQQoiaoloDtT179rBnzx4AxowZg07nfzjjxo0DIC0tjSVLlpT5/rNmzQKgX79+NGjgP/Nyxx13aMczZswo872Fy+ZftrF/SaLXuUOrj7B9lvd6Q2u2VSunD75Bk2dxkZI2vfYsy28MMBLbPqbYtkULisj6NCGEEEIIUVNUa6C2atUq7bhfv37FtuvTp492XNbpiQcPHuTUqVOl3js6OpqWLVuW697C5dimE6z4co32WG90/3Fa/vkqzh5O1R57ZtN0Bh2WUO+1YoFlnPp4xGPaY8PL4tAb9cW2jWomgZoQQgghhKiZqjVQ2717t3bcrFmzYtvFxMQQEBDg06cy7g3QuHHjct1bQObpLP54ZQGq0zXFMaReMLd+M1or1uGwOvjz9UXYz2W/vNanRQSg6LynoXpm2EoqJnJ4rbuQSEnTHgHMQSbC4kLdzxEpgZoQQgghhKgZqrWYSFKSa62RwWCgfv36JbaNjY0lMTFR61PWewM0bFjyB/q4uDgAUlNTKSgowGw2l+k5ipOcnExKSkq5+hTdmuDAgQPnNYZCZxLPkHYs47zvo6pQkF1AdnI22WdyST+WTn5WAQB6o55Od7TlVO5JGt0Ux5IP/wbg5NYkMh9LI6JhBKlH0jiZ43pPIuvWYefOnV73zzqTrV0nF6Z//BuoKqpTxZ5vpyDHSn5mAYf2HtH65IXn+NynqILIPE7uc933jCOu1PZCCCFqvqZNm2KxSJVfIUTNVq2BWlpaGgBBQUE+hT6KCgpyFZ/IyChb0FF4b4DgYN/y7f7uXXj/unXrluk5ijNhwoRyVaf05/rrrz+v/lXtreGv+L/grz7LRnhqmp/znjaU/pzv9n+99EaljUUIIcQlZ8eOHbRt27a6hyGEEOelWqc+FhS4MjJl+darcOpjYZ+y3rss9y+8d3nuL4QQQgghhBAXSrUGaoVVHkvLpgFauffiKkMWd++y3N+zlHxZ7y+EEEIIIYQQF0q1Tn0snJKYn59fatvCNmVdP+Y53bG0+3teP9/1aQAPPfQQN954Y7n6ZGZmsmHDBkJDQwkPD6dBgwYVGsuBAwe8pk3+9ttvpRZTETWLvMe1g7zPtYO8zxdG06ZNq3sIQghx3qo1UAsJCQEgN7f4Kn+FcnJyAIiIiCjXvT37lnZvRVEIDw8v0/1LUrdu3Qqtc+vZs+d5P3dRzZo1k3n6lzh5j2sHeZ9rB3mfhRBCFKrWeX6NGjUCwGq1llolsbCKY2xsbLnuDXDixIkS2xZej46OxmCo1thVCCGEEEIIIao3UGvdurV2nJiYWGy7kydPkpeX59OnMu4Nrs2xy3NvIYQQQgghhLiQqjVQ69atm3a8cuXKYtutWLFCO+7Vq1eZ7h0bG0t8fHyp905JSWHfvn3lurcQQgghhBBCXEjVGqg1a9aM9u3bAzB58mSv6ouevv/+e8C1Pq1///5lvv/IkSMBWLBgQbEbZRfeG2re3mVCCCGEEEKIS1O116IfP348ANu2beO9997zuT516lTmzp0LwP3331+uSoj3338/er2egoICHnzwQRwOh9f1vXv38uabbwLQvXt3rwyfEEIIIYQQQlSXag/U7r//fjp37gzA008/zbhx41i4cCHLly/niSeeYOzYsQAkJCTw7LPPevU9fPgwiqKgKApXXHGFz73btWvHQw89BMDs2bPp168fM2bMYPXq1Xz44Yf07NmT9PR0jEYjn3322YV9oUIIIYQQQghRRtVe4lCv1zNnzhwGDhzInj17mDx5MpMnT/ZqExcXx5w5cwgLCyv3/T/44AOSkpKYPn06q1atYtWqVV7XzWYz33//PV27dj2v1yGEEEIIIYQQlaXaM2rgKvyxefNm3n33Xbp06UJISAhms5nWrVvz7LPPsm3btgrvK2M0Gpk2bRpTpkxh0KBBREZGYjQaiY+P584772TTpk3cfPPNlfyKhBBCCCGEEKLiqj2jVshisfDUU0/x1FNPlblPQkJCsQVIirr55pslIBNCCCGEEELUCBdFRk0IIYQQQgghhJsEakIIIYQQQghxkblopj6KyhEdHc3LL7/s9VhcWuQ9rh3kfa4d5H0WQghRHEUt6yIvIYQQQgghhBBVQqY+CiGEEEIIIcRFRgI1IYQQQgghhLjISKAmhBBCCCGEEBcZCdSEEEIIIYQQ4iIjgZoQQgghhBBCXGQkUBNCCCGEEEKIi4wEakIIIYQQQghxkZFATQghhBBCCCEuMhKoCSGEEEIIIcRFRgI1IYQQQgghhLjISKAmhBBCCCGEEBcZCdSEEEIIIYQQ4iIjgZoQQgghhBBCXGQkULtE5OXl8c4779ClSxeCg4MJCQmhQ4cOvPbaa6Snp1f38GqNAwcO8Nhjj9GuXTtCQkKwWCw0btyYO+64g/Xr15faf8qUKQwcOJA6depgsVho2rQp48ePZ9++fWV6/rVr13LTTTcRGxuLyWQiLi6OG264gUWLFpWp/7Fjx3jkkUdo3rw5FouFqKgo+vbty3fffYfT6SzTPWqj//znPyiKgqIo2O32EtvKe1wzzJ8/n5tuuokGDRpgNpuJjo5m6NChzJ49u9S+8h4LIYSoFKqo8U6cOKG2adNGBfz+l5CQoO7cubO6h3nJ++qrr1STyVTs+wCozz33nN++VqtVHT16dLH9AgMD1V9//bXE53///fdVnU5X7D3+/e9/l9h/6dKlalhYWLH9BwwYoGZlZVX453OpOnDggBoQEKD9nGw2m9928h7XDDabTb3jj
jtK/D2+//77VafT6dNX3mMhhBCVSQK1Gs5ms6ldu3ZVAVVRFHX8+PHq4sWL1aVLl6qPPvqo9g9+06ZN1YyMjOoe7iXrt99+UxVFUQG1Tp066uuvv64uW7ZMXbVqlfrxxx+r8fHx2gel999/36f/Y489pl0fPHiwOmvWLHXVqlXq+++/r4aHh6uAajab1Q0bNvh9/pkzZ2r9mzdvrk6aNEldvXq1OnnyZK8g/rPPPvPb//Dhw9qHu5CQEPXdd99VV65cqc6ePVsdOnSo1n/06NGV+nO7FAwcONDrg3BxgZq8xzXDXXfdpf0sOnfurP7www/q6tWr1e+//15t3ry5du29997z6SvvsRBCiMokgVoNN2HCBO0f308++cTn+s8//6xdf+mll6phhJc+u92uJiQkqIAaFRWlHjhwwKdNSkqK9iEvMDBQPX36tHZt+/btWkA9atQon2/qd+/erX3I69evn8+98/Pz1QYNGmgBeWpqqtf17Oxs9bLLLlMBNSwszOe6qqrqTTfdpAKqyWRS169f73P9wQcf1P4cLV68uMw/m0vdd99955Ox8BeoyXtcMyxcuNAr0MrPz/e6npaWpv2uh4aGqrm5udo1eY+FEEJUNgnUarhWrVqpgNq6dWvV4XD4bTNs2DAVUCMiIor9tl9U3KJFi0r9pltVVXXWrFlauy+//FI7/8ADD6iAajQa1aNHj/rt+/7772t9i34b7xmMT5kyxW//DRs2FJvRO378uKrX61VAffDBB/32z83NVevWrasC6rBhw4p9jbXJqVOn1IiICC1ALylQk/e4ZujXr58WCJ09e9Zvm6+++kr7Of/+++/aeXmPhRBCVDYpJlKD7dmzhz179gAwZswYdDr/b+e4ceMASEtLY8mSJVU2vtpi1apV2vHw4cOLbTdgwADtePv27drxrFmzAOjXrx8NGjTw2/eOO+7QjmfMmOF17bfffgMgMDCQkSNH+u1/2WWX0bZtW7/9Z8+ejcPhAGDs2LF++wcEBHDjjTcCsGDBArKysvy2q00eeeQR0tLSGDBgANdee22JbeU9vvglJSXx999/A/DEE09Qp04dv+2uu+467rjjDp588kmioqK08/IeCyGEqGwSqNVgngFCv379im3Xp08f7XjZsmUXdEy1Uc+ePXnmmWe47bbbiIuLK7adZ7W1goICAA4ePMipU6eAkt/D6OhoWrZsCfi+h4V/Drp3747JZCr2HoV/DtatW0deXp5Pf7PZTPfu3UvtX1BQwNq1a4ttVxvMnj2bX3/9FYvFwpdfflliW3mPa4YFCxagqioAo0ePLrZdTEwM33//PR988AE9e/YE5D0WQghxYUigVoPt3r1bO27WrFmx7WJiYggICPDpIyrHwIEDefvtt/nf//6HXq8vtp3nB7PCb9zL+h4CNG7c2KdPTk4Ox44dK1d/u93OgQMHtPOF92vYsCFGo7HU/kXHUNtkZWXx0EMPAfDCCy/QvHnzEtvLe1wzFGa5AwICaN26tXY+Ly+P/fv3c+zYMS2QK0reYyGEEBeCBGo1WFJSEgAGg4H69euX2DY2Ntarj6h67733nnZ81VVXAd7vR8OGDUvsX5itS01N1TJyFelftF/hcUX71zZPP/00x48fp23btjz11FOltpf3uGYoDFri4uLQ6XRs2bKF4cOHExoaSosWLWjYsCFxcXG88cYb2ntTSN5jIYQQF4IEajVYWloaAEFBQSiKUmLboKAgADIyMi74uISvzz//XFv/0q1bN3r06AG430OA4ODgEu9R+B6C+3083/6e96ho/9pk5cqVfPnllyiKwtdff11i5qKQvMc1w9mzZwEIDw/np59+olu3bsyZM8drA/OTJ0/y4osvMmjQIDIzM7Xz8h4LIYS4ECRQq8EKv421WCylti2c+lj0m2Bx4c2fP58nnngCcGU/P/74Y+2a5/tR2vtY+B569jvf/p7HFe1fW1itVu677z5UVeWBBx6gV69eZeon73HNkJ2dDcDRo0e56667CAgI4JNPPuHkyZPk5eWxatUqrrzySgBWrFjBvffeq/WV91gIIcSFIIFaDVZY5bG0bBqgra0orjKkuDCWLl3KDTfcgM1mA+DNN9/Usmng/X6U9j56ro8p7He+/QFtXV1F+9cWb775Jrt376Z+/fq88847Ze4n73HNUFiYIzk5GUVRWLhwIY888ggxMTFYLBZ69uzJ/PnzGTRoEAC//vqrVsBD3mMhhBAXgvwtXYMVTnHJz88vtW1hG7PZfEHHJNzmzZvH0KFDycnJAWD8+PE+a5o8pymV9j56Xi98H8+3P7inQlW0f22wa9cuLTj75JNPCAsLK3NfeY9rBs9M1AMPPMDll1/u08ZgMPDhhx9qj3/66SdA3mMhhBAXhgRqNVhISAgAubm5pbYtDBYiIiIu6JiEy6RJkxgxYoT2Lf19993H559/7tOu8D0E93tUnMLriqIQHh5e4f7g/eeg8B4V7X+pczqd3HvvvVitVoYNG1Zi6XZ/5D2uGTx/ztddd12x7dq1a6dVbV2/fr1PX3mPhRBCVBZDdQ9AVFyjRo0A19qZlJQUoqOji21bWN2rsPqjuHDeeustnn/+ee3xo48+ykcffeR3SlLhewhw4sSJEu9beD06OhqDwfWrGx8fj16vx+FwlLk/eP85aNSoEceOHatw/0vdV199xerVq9Hr9dx3331s2bLFp01qaqp2vHXrVvR6PSaTiTZt2sh7XEPExMRox6W99vj4eI4dO8aZM2cA+T0WQghxYUhGrQbz3OsnMTGx2HaFi+GL9hGV7+mnn/YK0l555RU+/vjjYteNlPU9BNemukX7mEwmmjRpUq7+RqPRa6+mwvsdPnwYh8NRav+iY7jUFW4K7HA4GDFiBJ07d/b5b/bs2Vr7rl270rlzZ4YOHQrIe1xTtGvXTjv2rMLoj9VqBdAyYvIeCyGEuBAkUKvBunXrph2vXLmy2HYrVqzQjstaqU6U36uvvsp//vMfwLVI/8svv+Tll18usU9sbCzx8fFAye9hSkoK+/btA3zfw8I/B2vWrMHpdBZ7j8I/B127dvUqK1/YPzc312+2qGh/o9Hod/2O8E/e45rB8+/TdevWFdvO6XRq71NCQgIg77EQQogLRBU1Wvv27VVA7dChg+p0Ov22ufbaa1VAjYiIUPPz86t4hLXDzJkzVUAFVIPBoP78889l7vvII4+ogGo2m9UTJ074bfPee+9p91+7dq3XtenTp2vXpk+f7rf/+vXrtTbvvvuu17Xk5GRVr9ergPrII4/47Z+bm6vWrVtXBdRrrrmmzK+tthg3bpz287XZbD7X5T2++OXn56thYWEqoLZr10612+1+2/3222/az3nSpEnaeXmPhRBCVDYJ1Gq4CRMmFPsPt6qq6pQpU7TrTz/9dDWM8NJ35swZNTo6Wvs5f/XVV+Xqv337du0D1vDhw30+IO7Zs0cNDw9XAbV79+4+/fPy8tSGDRuqgJqQkKCeOnXK63p2drbatWtXFVCDg4PV5ORkn3vcdNNNKqCaTCZ1+fLlPtfHjx+vvb4//vij
XK+vNigtUJP3uGZ47rnntJ/BP//5T5/rSUlJaqNGjVRAjY6OVjMzM7Vr8h4LIYSobBKo1XB2u13t3Lmz9o/vHXfcoS5YsEBdtmyZ+vjjj2sfHBISEtT09PTqHu4l6cUXX9R+/t27d1c3b95c6n/79+/3ukfht/GA2qtXL3X69OnqqlWr1P/+979qRESECqhGo1Fdv3693zFMmzZN69+gQQP1q6++UlevXq3+8MMPatu2bbVrH3/8sd/+hw8fVoODg1VAtVgs6iuvvKKuWLFCnTt3rpaRBdSRI0dW+s/vUlBaoKaq8h7XBLm5uWq7du20n8WgQYPUadOmqWvWrFEnTJigxsXFademTp3q01/eYyGEEJVJArVLwIkTJ9RWrVpp/wgX/S8uLk7dsWNHdQ/zkuX54a2s//Xv39/rHlarVR01alSx7c1mszplypQSx/Hee++pOp2u2Hs8/vjjJfZfvHixGhoaWmz/vn37emUQhFtZAjV5j2uG5ORktXv37sX+jPR6vfrZZ5/57SvvsRBCiMokgdolIi8vT3333XfVLl26qCEhIarZbFZbt26tPvvss+rZs2ere3iXrJSUlHIHaf4CtUJTpkxRBw0apEZGRqpGo1GNj49X77zzTnXnzp1lGs/atWvVW2+9VY2Li1ONRqMaGRmpDh06tMzTnE6cOKE+9thjavPmzVWLxaIGBwerPXr0UCdMmFDsmh1RtkCtkLzHFz+Hw6FOnDhRHThwoBodHa1aLBa1WbNm6j333KNu27at1P7yHgshhKgMiqqqKkIIIYQQQgghLhpSnl8IIYQQQgghLjISqAkhhBBCCCHERUYCNSGEEEIIIYS4yEigJoQQQgghhBAXGQnUhBBCCCGEEOIiI4GaEEIIIYQQQlxkJFATQgghhBBCiIuMBGpCCCGEEEIIcZGRQE0IIYQQQgghLjISqAkhhBBCCCHERUYCNSGEEEIIIYS4yEigJoQQQgghhBAXGQnUhBBCCCGEEOIiI4GaEEIIIYQQQlxkJFATQgghhBBCiIuMBGpCCCGEEEIIcZGRQE0IIYQQQgghLjISqAkhhBBCCCHERUYCNSGEEEIIIYS4yEigJoQQQgghhBAXGQnUhBBCCCGEEOIiI4GaEEIIIYQQQlxkDNU9ACGEKI99+/Yxd+5c/vrrLw4dOkRycjJ5eXlERkYSHR1Nu3btuPrqqxkyZAjR0dFlvu/vv/9OQEAAgwcPvoCjF0IIIYQoG0VVVbW6ByGEEKU5ePAgjz76KHPnzi1Te5PJxN13381bb71FREREse2OHj3Ko48+yqxZs5g4cSJ33nlnJY1YCCGEEKLiJKMmhLjorVixgmuvvZbM/2/v/mOqrB44jn8uVySHaBo/A0kQpcLVpEaEV4Vcu1BQblquTU1ksYphyaK2lFWImyv/CLHUrVRInVqmVI4kU8ryR5S1aZhAgS0pQCVQVBDw+0frGVcul4v6ree292tje87hnPOchz/YPjvPc05bm1FntVoVGRmpoKAg+fj46Ny5c6qpqVFLS4skqbOzU2vWrFFZWZl2796t6Ohop2NPnz5dtbW1/8hzAAAAuItv1ACYWkNDg2bMmGGEtMDAQK1du1ZnzpxRdXW19u/frz179ujw4cM6e/asKisrNXPmTKP/yZMnlZKSotbWVqfjX758+R95DgAAgMEgqAEwtVdffVVnzpyRJIWEhKiyslKZmZkaOXKk0/b33nuvPvjgAy1dutSoq6ur07Jly/6R+QIAANwIBDUAptXZ2anNmzcb5aVLlyo8PNytvkuWLJHNZjPK77zzjrq7u2/4HAEAAP4fCGoATOv48eNqb283yvHx8YPq/8wzzxjXLS0tqqqqumFzAwAA+H8iqAEwrUuXLjmUGxsbB9XfZrNp/Pjxuv/++5Wamiovr7/+5VVUVMhischisejkyZNG+/T0dKM+MTGx33E7OztVXFysWbNmKTIyUr6+vvLz81NUVJTmzJmjDz/8UANtqFtfX2/c6/bbbzfqS0tLlZqaqjFjxmjo0KHy9/dXYmKi3nzzTZ0/f35Qzw8AADwX2/MDMK3GxkYFBwcb5QcffFBlZWWyWq3XNW5FRYWSkpJctpk2bZoqKir61JeXl+vpp59WXV2dy/6xsbEqKSlRTEyM09/X19crIiJCkhQdHa0ffvhB6enp2rJlS79jBgUFacOGDUpOTnZ5bwAA4PlYUQNgWkFBQbr77ruN8meffabExETt3bt3wBUrV0aPHi273S673a6bbrrJqP/7sGy73a64uLg+/datW6eHHnrIIaQFBATIZrPJZrPJ39/fqD9y5IgSEhK0f/9+t+bUO6RZrVbFxsZqypQpuuWWW4w2jY2NSk1N1Y4dOwb9zAAAwLOwogbA1LZt26bZs2f3qQ8NDVVaWpqSkpI0depUh5W3wRg7dqzx+qOrA6+/+uorJSUlqaurS5IUFRWloqIi2e12WSwWSVJPT48++eQTLVy40BgzMDBQ33//vW699VaH8XqvqPU2c+ZMFRYWKjQ0VJLU1dWl4uJiPf/888arj35+fjp27JjbG6sAAADPw4oaAFN7/PHH9dJLL/WpP3XqlNasWaPZs2crJCRE48eP14IFC7Rx48ZBf8s2kJ6eHi1YsMAIaRMnTlRlZaWSk5ONkCZJXl5eeuSRR3Tw4EEjhDU1NTmdvzMZGRl6//33jZAmSUOGDFFGRobKysrk7e0tSTp37pxee+21G/V4AADAhAhqAExv+fLlWr9+vUaNGtVvm9raWq1fv15z585VSEiIJk+erHXr1t2QA61LS0tVU1MjSbJYLNq4caNuvvnmftuHhISoqKjIKG/ZskWnTp1yeY+oqCitWrXKIfj1ZrPZtGjRIqO8devWPputAACA/w6CGgCPMH/+fFVXVys/P18TJkxw2fbKlSs6cOCAMjIyFBMTo6+//vq67r19+3bj2mazOXw315+UlBTj+7Kuri6Vl5e7bJ+dne3wvZwzWVlZxnV7e7s+//zzAecBAAA8E0ENgMfw9/dXXl6eTpw4oaqqKhUWFmrGjBkOG25craamRg888MB1bcDRO+jdd999bvXx8vLSpEmTnI7hzMMPPzzgmOHh4Q7ftX355ZduzQUAAHgeghoAj3THHXdo4cKF2rFjh5qbm3X06FGtXLlSaWlpGjZsmEPbzs5OzZkzRz///POg79Pd3e1w1tqKFSuM888G+tmzZ4/Rz9Wrj8OHD9e4cePcmk/vM9caGhoG/TwAAMAzENQAeDyLxaKJEycqOztbH330kf744w8VFBTI19fXaHPhwgUtW7Zs0GO3tLRc11EAfzt79my/vxs9erTb4/T+Nu5Gb5oCAADMY8i/PQEAuNFGjBihxYsXKzk5WdOmTVN7e7skaefOnXr33Xf73bDDmas3I4mNjVVAQMCg5+Rqxezv3Rzd0d3dbVxzugoAAP9dBDUAplRdXa2cnBw1NjaqqalJO3fudPjmyx333HOPsrKy9Prrr0v6a3Xs9OnTgwpaV692ZWdn93vW2rVqa2tzu+2ff/5pXLv6Ng8AAHg2Xn0EYEr
d3d3atWuXvv32W/366686ePDgNY0THx/vUO7p6RlUfx8fH4ew9uOPP17TPFxpbm52CGCuVFVVGdcD7X4JAAA8F0ENgClNmDBBfn5+Rrm4uPiaxmlpaTGu/fz8rum1xYSEBON6165dbvfLzc3Viy++qLfffluVlZUu27pzhEBNTY1+++03ozxlyhS35wIAADwLQQ2AKVmtVs2aNcsof/PNN9q0adOgx+ndx263y8vL8d/e1WVnUlNTjevjx4+rtLR0wD4HDhzQihUr9MYbbygrK2vArfTXrl074Ji9D9EOCAhQYmLigH0AAIBnIqgBMK3FixfLx8fHKKenp+u9995zu39BQYH27t0r6a+dIXNzc/u06X3I9KVLl5yOM2/ePAUGBhrlzMxMl1v9t7W16amnnjLKw4cP19y5c13O9eOPP3b5bOXl5XrrrbeMclZW1qA2IQEAAJ6FoAbAtMaNG6fCwkKjfPnyZc2bN09Tp07Vtm3bnG7CcfHiRX366adKSkpSXl6eUZ+Tk6O4uLg+7XtvyFFWVuZ0J8Vhw4Zp1apVRrmpqUnx8fHasGGDOjs7HdqWl5dr8uTJDt+S5eXlOQS9/qSnp6ugoEAXLlww6jo6OrRy5Uo9+uijxvd1UVFReuGFFwYcDwAAeC7LFfZ3BmByRUVFysnJUVdXl0O9t7e3wsPDFRwcrCFDhuj06dOqra1VR0eHQ7vMzEytXr3a6WuOzz77rFavXm2Uo6OjFRYWppEjR2r79u0ObfPz8/XKK6841Pn5+SkmJkZWq1W1tbV9zjZ74okntGnTpj5HAtTX1ysiIsIoh4aGGodi+/r66q677pLVatXRo0fV2tpqtAsICNC+ffsUExPT798LAAB4PoIaAI/w3XffKTc3V/v27XO7T2RkpJYvX67HHnus3za//PKLJk2a1Gd1ztvbW+fPn9fQoUMd6rdu3apFixbp999/d3lvHx8fvfzyy1qyZInTgHh1UDt8+LCee+45HTp0qN8xbTabSkpKHPoBAID/JoIaAI/y008/qbS0VIcOHVJ1dbUaGhrU3t6unp4ejRgxQhEREYqLi1NaWprsdrusVuuAY544cUL5+fn64osv1NzcLG9vb4WFhWn37t267bbb+rS/ePGiNm/erLKyMh05ckTNzc3q6OjQqFGjdOedd2r69OmaP3++wsLC+r3n1UGtrq5OY8aMUUlJiUpKSnTs2DG1trYqODhYCQkJevLJJ5WSknJtfzQAAOBxCGoA8C9wFtTGjh37700IAACYCpuJAAAAAIDJENQAAAAAwGQIagAAAABgMgQ1AAAAADAZghoAAAAAmAxBDQAAAABMhu35AQAAAMBkWFEDAAAAAJMhqAEAAACAyRDUAAAAAMBkCGoAAAAAYDIENQAAAAAwGYIaAAAAAJgMQQ0AAAAATIagBgAAAAAmQ1ADAAAAAJMhqAEAAACAyRDUAAAAAMBkCGoAAAAAYDIENQAAAAAwGYIaAAAAAJgMQQ0AAAAATIagBgAAAAAmQ1ADAAAAAJMhqAEAAACAyRDUAAAAAMBkCGoAAAAAYDIENQAAAAAwGYIaAAAAAJgMQQ0AAAAATIagBgAAAAAmQ1ADAAAAAJMhqAEAAACAyRDUAAAAAMBkCGoAAAAAYDL/A0RNUrDduAHDAAAAAElFTkSuQmCC", - "text/plain": [ - "
" + "cell_type": "markdown", + "source": [ + "Installing segger from the GitHub repository:" + ], + "metadata": { + "id": "XEY6CTzK0648" + }, + "id": "XEY6CTzK0648" + }, + { + "cell_type": "code", + "source": [ + "!git clone https://github.com/EliHei2/segger_dev.git\n", + "%cd segger_dev\n", + "!pip install \".[rapids12]\" -q" + ], + "metadata": { + "id": "TIQnPzfx08Zr" + }, + "id": "TIQnPzfx08Zr", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Downloading the [Xenium Human Pancreatic Dataset](https://www.10xgenomics.com/products/xenium-human-pancreatic-dataset-explorer):" + ], + "metadata": { + "id": "q3SNnImS09_N" + }, + "id": "q3SNnImS09_N" + }, + { + "cell_type": "code", + "source": [ + "!mkdir data_xenium\n", + "%cd data_xenium\n", + "!wget https://cf.10xgenomics.com/samples/xenium/1.6.0/Xenium_V1_hPancreas_Cancer_Add_on_FFPE/Xenium_V1_hPancreas_Cancer_Add_on_FFPE_outs.zip\n", + "!unzip Xenium_V1_hPancreas_Cancer_Add_on_FFPE_outs.zip\n", + "%cd .." + ], + "metadata": { + "id": "Qjdt3f-U0_i9" + }, + "id": "Qjdt3f-U0_i9", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from segger.data.io import XeniumSample\n", + "from segger.training.train import LitSegger\n", + "from segger.training.segger_data_module import SeggerDataModule\n", + "from segger.prediction.predict import predict, load_model\n", + "from lightning.pytorch.loggers import CSVLogger\n", + "from pytorch_lightning import Trainer\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "from matplotlib import pyplot as plt\n", + "import seaborn as sns" + ], + "metadata": { + "id": "trM8h-Ek16sJ" + }, + "id": "trM8h-Ek16sJ", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "id": "db009015-c379-4f50-97ed-81dca9df28ac", + "metadata": { + "id": "db009015-c379-4f50-97ed-81dca9df28ac" + }, + "source": [ + "# **1. Create your Segger Dataset**\n", + "\n", + "In this step, we generate the dataset required for Segger's cell segmentation tasks.\n", + "\n", + "Segger relies on spatial transcriptomics data, combining **nuclei** and **transcripts** from single-cell resolved imaging datasets. These nuclei and transcript nodes are represented in a graph, and the spatial proximity of transcripts to nuclei is used to establish edges between them.\n", + "\n", + "To use Segger with a Xenium dataset, you need the **`transcripts.csv.gz`** and **`nucleus_boundaries.csv.gz`** files. The **transcripts** file contains spatial coordinates and information for each transcript, while the **nucleus boundaries** file defines the polygon boundaries of the nuclei. These files enable Segger to map transcripts to their respective nuclei and perform cell segmentation based on spatial relationships. Segger can also be extended to other platforms by modifying the column names or formats in the input files to match its expected structure, making it adaptable for various spatial transcriptomics technologies." 
] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Evaluate results\n", - "model_version = 44034 # 'v_num' from training output above\n", - "model_path = models_dir / 'lightning_logs' / f'version_{model_version}'\n", - "metrics = pd.read_csv(model_path / 'metrics.csv', index_col=1)\n", - "\n", - "fig, ax = plt.subplots(1,1, figsize=(2,2))\n", - "\n", - "for col in metrics.columns.difference(['epoch']):\n", - " metric = metrics[col].dropna()\n", - " ax.plot(metric.index, metric.values, label=col)\n", - "\n", - "ax.legend(loc=(1, 0.33))\n", - "ax.set_ylim(0, 1)\n", - "ax.set_xlabel('Step')" - ] - }, - { - "cell_type": "markdown", - "id": "e73687e1-ee8f-46e9-8bd2-1ddc571ef94b", - "metadata": {}, - "source": [ - "# **3. Make Predictions**\n", - "\n", - "Once the Segger model is trained, it can be used to make predictions on seen or unseen data. This step involves using a trained checkpoint to predict cell boundaries and refine transcript-nuclei associations.\n", - "\n", - "Key parameters for making predictions:\n", - "- **`--checkpoint_path`**: Path to the trained model checkpoint, which stores the learned weights.\n", - "- **`--batch_size`**: Batch size used during inference.\n", - "- **`--score_cut`**: Defines the score threshold for classifying predictions. Higher values of `score_cut` make the model more conservative in associating transcripts with nuclei.\n", - "- **`--receptive_field`**: These parameters once again define the nearest neighbors for nuclei (`k_bd`) and transcripts (`k_tx`) and their distances (`dist_bd` and `dist_tx`) during the prediction stage.\n", - "- **`--use_cc`**: Used when some **transcripts are not directly associated with any nucleus**—a common scenario when a nucleus isn't captured on the slide or within the field of view. In these cases, Segger uses **connected components (CC)** to group such \"nucleus-less\" transcripts into distinct cells. Even though these transcripts lack a directly associated nucleus, they likely still represent a real cell, and grouping them together ensures that these cells are not discarded.\n", - "\n", - "The predictions can be saved and visualized to assess the segmentation quality.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d4279c71-4660-46fc-a9e5-834e25d31f53", - "metadata": {}, - "outputs": [], - "source": [ - "# Checkpoint directory for Lightning model above\n", - "model_version = 44034\n", - "\n", - "# Load in latest checkpoint\n", - "model_path = models_dir / 'lightning_logs' / f'version_{model_version}'\n", - "model = load_model(model_path / 'checkpoints')\n", - "dm.setup()\n", - "\n", - "receptive_field = {'k_bd': 4, 'dist_bd': 12,'k_tx': 15, 'dist_tx': 3}\n", - "\n", - "# Perform segmentation (predictions)\n", - "segmentation = predict(\n", - " model,\n", - " dm.train_dataloader(),\n", - " score_cut=0.33, \n", - " receptive_field=receptive_field,\n", - " use_cc=False,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "9807abf3", - "metadata": {}, - "source": [ - "### **3.2 Faster Prediction with Segger**\n", - "We introduce a faster and more efficient pipeline for making predictions using a segger model. 
This new method accelerates the segmentation process by using CUDA-accelerated **nearest neighbors search** using [CAGRA](https://docs.rapids.ai/api/cuvs/stable/python_api/neighbors_cagra/) and **parallel processing**.\n", - "\n", - "**Note**: The previous prediction method will soon be deprecated in favor of this optimized pipeline.\n", - "\n", - "#### **Requirements for the Faster Prediction Pipeline**\n", - "The pipeline requires the following inputs:\n", - "\n", - "- **segger_data_dir**: The directory containing the processed Segger dataset (in PyG format).\n", - "- **models_dir**: The directory containing the trained Segger model checkpoints.\n", - "- **benchmarks_dir**: The directory where the segmentation results will be saved.\n", - "- **transcripts_file**: Path to the file containing the transcript data for prediction.\n", - "\n", - "#### **Running the Faster Prediction Pipeline**\n", - "Below is an example of how to run the faster Segger prediction pipeline using the command line:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3e802c3f", - "metadata": {}, - "outputs": [], - "source": [ - "!python3 predict_fast.py --segger_data_dir path/to/processed_data \\\n", - "--models_dir path/to/models_dir \\\n", - "--benchmarks_dir path/to/save/results \\\n", - "--transcripts_file path/to/transcripts_file \\\n", - "--batch_size 1 \\\n", - "--num_workers 12 \\\n", - "--model_version 0 \\\n", - "--save_tag segger_embedding_1001 \\\n", - "--min_transcripts 5 \\\n", - "--cell_id_col segger_cell_id \\\n", - "--use_cc False \\\n", - "--knn_method cuda \\\n", - "--file_format anndata \\\n", - "--k_bd 4 \\\n", - "--dist_bd 12.0 \\\n", - "--k_tx 5 \\\n", - "--dist_tx 5.0" - ] - }, - { - "cell_type": "markdown", - "id": "0a823035", - "metadata": {}, - "source": [ - "#### **Parameters**\n", - "Here is a detailed explanation of each parameter used in the faster prediction pipeline:\n", - "\n", - "- **--segger_data_dir**: The directory containing the processed Segger dataset, saved as PyTorch Geometric data objects, that will be used for prediction.\n", - "- **--models_dir**: The directory containing the trained Segger model checkpoints. These checkpoints store the learned weights required for making predictions.\n", - "- **--benchmarks_dir**: The directory where the segmentation results will be saved.\n", - "- **--transcripts_file**: Path to the *transcripts.parquet* file.\n", - "- **--batch_size**: Specifies the batch size for processing during prediction. Larger batch sizes speed up inference but use more memory (default: 1).\n", - "- **--num_workers**: Number of workers to use for parallel data loading (default: 1).\n", - "- **--model_version**: Version of the trained model to load for predictions, based on the version number from the training logs (default: 0).\n", - "- **--save_tag**: A tag used to name and organize the segmentation results (default: segger_embedding).\n", - "- **--min_transcripts**: The minimum number of transcripts required for segmentation (default: 5).\n", - "- **--cell_id_col**: The name of the column that stores the cell IDs (default: segger_cell_id).\n", - "- **--use_cc**: Enables the use of connected components (CC) for grouping transcripts that are not associated with any nucleus (default: False).\n", - "- **--knn_method**: Method for KNN (K-Nearest Neighbors) computation. Only option is \"cuda\" for this pipeline (default: cuda).\n", - "- **--file_format**: The format for saving the output segmentation data. 
Only option is \"anndata\" for this pipeline (default: anndata).\n", - "- **--k_bd**: Number of nearest neighbors for boundary nodes during segmentation (default: 4).\n", - "- **--dist_bd**: Maximum distance for boundary nodes during segmentation (default: 12.0).\n", - "- **--k_tx**: Number of nearest neighbors for transcript nodes during segmentation (default: 5).\n", - "- **--dist_tx**: Maximum distance for transcript nodes during segmentation (default: 5.0)." - ] - }, - { - "cell_type": "markdown", - "id": "b0917be9-4e82-4ba5-869d-5a9203721699", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-11T23:06:23.977884Z", - "iopub.status.busy": "2024-09-11T23:06:23.977517Z" - } - }, - "source": [ - "### *Troubleshooting #2*\n", - "\n", - "In the cell below, we are visualizing the distribution of **Segger similarity scores** using a histogram. The **Segger similarity score** reflects how closely transcripts are associated with their respective nuclei in the segmentation process. **Higher scores** indicate stronger associations between transcripts and their nuclei, suggesting more accurate cell boundaries. **Lower scores** might indicate weaker associations, which could highlight potential segmentation errors or challenging regions in the data. We expect to see a large number of the scores clustering toward higher values, which would indicate strong overall performance of the model in associating transcripts with nuclei.\n", - "\n", - "The following would indicate potential issues with the model's predictions:\n", - "\n", - "- **A very large portion of scores near zero**: If many scores are concentrated at the lower end of the scale, this suggests that the model is frequently failing to associate transcripts with their corresponding nuclei, indicating poor segmentation quality.\n", - "- **No clear peak in the distribution**: If the histogram is flat or shows a wide, spread-out distribution, this could indicate that the model is struggling to consistently assign similarity scores, which may be a sign that the training process did not optimize the model correctly.\n", - "\n", - "Both cases would suggest that the model requires further tuning, such as adjusting hyperparameters, data preprocessing, or the training procedure (see below)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "id": "a450d3ca-2876-4f48-be89-761147b17387", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-11T22:07:04.216273Z", - "iopub.status.busy": "2024-09-11T22:07:04.215965Z", - "iopub.status.idle": "2024-09-11T22:07:08.177601Z", - "shell.execute_reply": "2024-09-11T22:07:08.177158Z", - "shell.execute_reply.started": "2024-09-11T22:07:04.216257Z" - } - }, - "outputs": [ + }, { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAApEAAAJ1CAYAAACB/qtfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/TGe4hAAAACXBIWXMAAC4jAAAuIwF4pT92AABZpUlEQVR4nO3dd3hT5d/H8U/a0pYyy96UpRQqq2yQIlNkiANRxIeKC7fy0597II8DUFyg/hS1oqKgqCAoIihlKVJBhbL33rtAoe39/MHT82uapskpSdPxfl1XrivJuc+db3Jykk/OyTm3wxhjBAAAANgQFOgCAAAAUPgQIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGBbSKALQNF39uxZbd682brdoEEDhYeHB7AiAABwsQiR8LvNmzcrJibGur169Wo1bdo0gBUBAICLxe5sAAAA2EaIBAAAgG2ESAAAANhGiAQAAIBthEgAAADYRogEAACAbYRIAAAA2EaIBAAAgG2cbBw+k5CQoISEBJf7U1JS8r8YAADgV4RI+My2bduUmJgY6DIAAEA+IETCZ6KiohQXF+dyf0pKipKSkgJQEQAA8BeHMcYEuggUbcnJyYydDQBAEcOBNQAAALCNEAkAAADbCJEAAACwjRAJAAAA2wiRAAAAsI0QCQAAANsIkQAAALCNEAkAAADbCJEAAACwjWEPUahd3qaNDuzbm2ubKtWqa9Hy5flUEQAAxQMhEoXagX179efgAbm2iZ06M5+qAQCg+GB3NgAAAGwjRAIAAMA2QiQAAABsI0QCAADANkIkAAAAbCNEAgAAwDZCJAAAAGwjRAIAAMA2QiQAAABsI0QCAADANkIkAAAAbCNEAgAAwDZCJAAAAGwjRAIAAMA2QiQAAABsI0QCAADANkIkAAAAbCNEAgAAwDZCJAAAAGwjRAIAAMA2QiQAAABsI0QCAADANkIkAAAAbCNEAgAAwDZCJAAAAGwjRAIAAMA2QiQAAABsI0QCAADANkIkAAAAbCNEAgAAwDZCJAAAAGwjRAIAAMA2QiQAAABsI0QCAADANkIkAAAAbCNEAgAAwDZCJAAAAGwjRAIAAMA2QiQAAABsI0QCAADANkIkAAAAbCNEAgAAwDZCJAAAAGwjRAIAAMA2QiQAAABsI0QCAADANkIkAAAAbCNEAgAAwDZCJAAAAGwjRAIAAMA2QiQAAABsI0QCAADANkIkAAAAbCNEAgAAwDZCJAAAAGwjRAIAAMA2QiQAAABsI0QCAADANkIkAAAAbAsJdAEoOhISEpSQkOByf0pKSv4XAwAA/IoQCZ/Ztm2bEhMTA10GAADIB4RI+ExUVJTi4uJc7k9JSVFSUlIAKgIAAP5CiITPxMfHKz4+3uX+5ORkxcTE5H9BAADAbziwBgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2hQS6AHcWLlwoSapQoYJiYmLy1MfPP/+s3377TampqXrxxRd9WR4AAECxVmBDZNeuXeVwONS7d2/98MMPeerjhRde0NKlS1WlShVCJAAAgA8V6d3ZISEhMsboyJEjgS4FAACgSCmyIXLFihVavny5JKlUqVIBrgYAAKBoCeju7FGjRumTTz7JtU1iYqLq169vq9+zZ8/qwIEDysjIkMPhUMOGDS+mTAAAAGQT0BB5//33a8KECW53NxtjdObMGW3fvt1Wv8YYSZLD4ZAkDRs27OIKBQAAgJOA7s6uUKGCxo4dK2OMyyWrnKbndsnqnnvu0b333pufTwsAAKDIC/jR2bfeeqvCwsJ0/vx56z5jjIYPHy6Hw6GYmBiNHDnS6/6CgoIUHh6uihUrKiYmRlWqVPFH2QXW008/7fZI9NjYWCUlJeVzRQAAoCgKeIiUpCFDhrjcN3z4cElSzZo12R1twz///KPg4GA9/fTTLtNq1KgRgIoAAEBRVCBCZE66dOkih8OhZs2aBbqUQuXvv//WJZdcoueffz7QpQAAgCKswIbIBQsWBLqEQufYsWPasWOHBg8eHOhSAABAEVdkzxNZHP3zzz+SxNZbAADgd4TIIuTvv/+WdGGLZJ8+fVSxYkWVLVtWffr0sU68DgAA4AsFPkSeP39e//nPf9S7d2/VqFFDERERCgkJUXBwsNeXkJACu9fepzK3RI4fP16hoaG6/fbb1a1bN82dO1edO3fWnDlzAlwhAAAoKgp0utq5c6d69eqlDRs2SJLLOSDhLDg4WLVq1dKHH36oXr16Wff/8ssv6tWrl4YNG6atW7cqIiIigFUCAICioMBuiczIyFD//v21fv16wqOX3nvvPSt4Z9WtWzcNGTJEBw4c0Lx58wJUHQAAKEoK7JbIL774Qv/8848cDoeMMapbt65uvfVWNW3aVOXLl1dwcHCgSyxUYmNj9emnn2rLli2BLgUAABQBBTZETps2zbreo0cPff/99woLCwtgRQXb+fPn9ddffyk9PV3t27d3mX769GlJUnh4eH6XBgAAiqACGyJXrlwp6cIwhpMmTSJAenD+/Hl17NhREREROnTokEqUKOE0feHChZ
KkNm3aBKI8AABQxBTY/0QeOHDAGju7Tp06gS6nwIuIiFDfvn114sQJjR492mnaZ599pjlz5qhDhw6KjY0NUIUAAKAoKbAhsnz58pKkihUrBrYQH1m/fr3Cw8NVq1Ytr9qfOXNGr7zyilq1aqXSpUurTJkyatasmV544QUdO3Ysx3neeOMNVatWTaNHj1bXrl31yCOPqE+fPrrllltUrVo1TZ482YfPCAAAFGcFdnd2vXr1dODAAe3YsSPQpVy01NRUDR8+XKmpqV6137Nnj3r27Kk1a9Y43b9q1SqtWrVKH3/8sWbPnq0mTZo4TY+KitKff/6p559/XrNnz9aSJUtUtWpV3XXXXXruuedUvXp1nz0nAABQvBXYEHn11Vdr2bJl2rx5s9auXavo6OhAl5QnaWlpGjx4sJYuXep1+6uvvlpr1qyRw+HQiBEjNGjQIAUFBembb77RhAkTtG3bNg0YMEArVqxQ2bJlneavUaOG3n//fX88FQAAAEuBDZG33nqrxo0bp2PHjmnkyJH68ccfA12SbYcOHdKNN96o+fPnez3PBx98oKSkJEnSm2++qfvvv9+aFhcXpw4dOuimm27S5s2b9dprr2nUqFE+rzs3Bw4c0MGDB23Ns2nTJj9VAwAAAsVhCvCZvKdNm6YhQ4bIGKNrrrlGr7/+umrXrh3osrwye/Zs3XPPPdbu+KCgIGVkZKhmzZratWuX2/mio6O1bt06RUdHa/Xq1QoKcv3bav/+/TVr1ixFRkbqwIED+Tqs4/PPP3/RwXX16tVq2rSpT+q5tHYt/Tl4QK5tYqfO1Pqd7l9zAABgX4HdErlmzRrFxMTo2Wef1fPPP69vv/1WM2bMUGxsrJo1a6aKFSuqZMmSXvf37LPP+rFaZ4MHD7bOc+lwOPTUU09p0aJFSkxMzHW+devWad26dZKkIUOG5BggJWnYsGGaNWuWjh49ql9//VU9e/b07RMAAADwoMCGyJiYGDkcDkmyRq1JT0/X8uXLtXz5ctv95WeIXLZsmaQLB7pMmjRJ3bt3V9euXT3Ol/V/k126dHHbrnPnztb1xMREQiQAAMh3BTZESspxzOy87H3PDKP5pUaNGrr33nv1wAMP2DpJ+tq1a63rDRs2dNuuWrVqKlmypM6cOeM0T3645557NGjQIFvzbNq0SQMHDvRPQQAAICAKbIjs0qVLvoc/X1m8eLHbXdG52bNnjyQpJCTE4+l4atSooc2bN1vz5JcqVaqoSpUq+fqYAACg4CmwIXLBggWBLiHP8hIgJeno0aOSpFKlSnkM0KVKlZIkHT9+PE+PBQAAcDEK7Ig1xVHmycjDw8M9ts08qMjbE5gDAAD4EiGyAMncgunNbvzM/4bmdasnAADAxSCBFCClS5eWJJ09e9Zj28w2dg7cAQAA8BVCZAFSpkwZSdLp06c9tk1JSZEkRUZG+rUmAACAnBTYA2uCg4N91pfD4VBaWprP+vOXunXrSpLOnTungwcPqnLlym7bZh6VXaNGjXypDQAAIKsCuyUy8z9/xhifXAqD6Oho6/rmzZvdttu7d6/OnDnjMg8AAEB+KbAhUsr7icUdDodCQkJUtWpV61IYtG3b1rq+ZMkSt+0WL15sXe/YsaNfawIAAMhJgd2d/euvv3rd9uzZszp06JBWr16t6dOna9OmTUpPT9cDDzygJ554wo9V+lbDhg112WWXadWqVZo8ebJGjhyZ45Han3zyiaQL/4eMi4vL7zIBAAAKbojMazgaPXq0nn32Wb3yyit6+umnFRERoQcffNDH1fnP3XffrXvuuUf//POPxo0bp3//+99O06dOnarZs2dLku68806OzgYAAAFRoHdn50VISIheeuklDR06VMYYPfHEE9qyZUugy/LanXfeqZYtW0qSHnvsMQ0bNkzz5s3TwoUL9fDDD+vmm2+WJEVFRRWqrawAAKBoKXIhMtOoUaMUFBSk1NRUffjhh4Eux2vBwcGaNWuWGjduLEmaPHmyevbsqbi4OL3xxhtKT09XzZo1NWvWLJUrVy7A1QIAgOKqyIbIevXqqVGjRjLGWLt/C4saNWpo5cqVGjNmjFq1aqUyZcooLCxM0dHReuKJJ/TPP/+oadOmgS4TAAAUYwX2P5G+UK1aNa1fv147duwIdClasGCBrfbh4eH697//7fKfSAAAgIKgyG6JlKSdO3dKks6fPx/gSgAAAIqWIrslcsmSJdqyZYscDgejuuSThIQEJSQkuNyfOUQjAAAoOopkiExOTtaQIUOs21dccUUAqyk+tm3bpsTExECXAQAA8kGBDZEvvPCC122NMTp//rxOnDihNWvWaMGCBU6j3dx6663+KBHZREVF5Xh+z5SUFCUlJQWgIgAA4C8FNkQ+//zzOY7W4g1jjDVvfHy82rVr58vS4EZ8fLzi4+Nd7k9OTlZMTEz+FwQAAPymwIZIKW9jZ2dyOBy6/fbbNXHiRB9WBAAAAKkAh8guXbrY2hLpcDgUFhamihUrqkWLFho4cKAaNmzoxwoBAACKrwIbIu2eVxEAAAD5p0ifJxIAAAD+QYgEAACAbQV2d7Y7x44d05IlS7Rnzx4dPnxYwcHBKlOmjKKiotSyZUtVrVo10CUCAAAUeYUmRP7www8aM2aMlixZkutR2y1bttRDDz2koUOH5mN1AAAAxUuB35195swZDRkyRP3799fixYuVkZFhhUhjjMv1FStWaNiwYerZs6eOHTsWwMoBAACKrgIdItPS0tSnTx9NnTrVZetjSEiIKleurMjISAUF/fdpOBwOGWP0yy+/qHfv3jp79mx+lw0AAFDkFegQ+dhjj2nhwoXW7QYNGuitt97SunXrlJqaqv379+vw4cNKTU3V6tWr9dprr6lBgwaSLmyZTEpK0pNPPhmo8gEAAIqsAhsit23bpgkTJlgnHB8+fLhWrVql++67T5dccolT26CgIDVp0kQPP/ywVq1apdtvv13ShSA5ceJE7dy5M9/rBwAAKMoKbIj85JNPdP78eUlSnz59NGnSJIWFhXmcLywsTO+//7769esn6cIu8Y8//tivtQIAABQ3BTZE/vzzz9b1119/3fb8r732mnV93rx5PqkJAAAAFxTYELllyxY5HA41atRIjRo1sj1/o0aNdOmll8oYozVr1vihQgAAgOKrwIbIw4cPS5Jq1aqV5z5q1KghSTpx4oRPagIAAMAFBTZEli5dWpJ09OjRPPeReZ7IcuXK+aIkAAAA/L8CGyJr164tY4xWr16dp5OGHz16VKtWrZLD4bC2SAIAAMA3Cuywh5dffrn++ecfpaWlady4cXrxxRdtzT9u3DilpaXJ4XCoS5cufqoSWSUkJCghIcHl/pSUlPwvBgAA+FWBDZFDhw7VxIkTJUljx45VixYtNGjQIK/mnTp1qsaOHWvdvuGGG/xSI5xt27ZNiYmJgS4DAADkgwIbItu1a6devXpp7ty5Sk9P14033qi5c+fq4YcfVpMmTXKcJzk5WePHj1dCQoKMMXI4HOrWrZsuv/zyfK6+eIqKilJcXJzL/SkpKUpKSgpARQAAwF8cJ
vug1AXInj171KJFCx0+fNgKhZJUvXp1NWnSRJGRkZIu/P8xOTlZ+/btkyRrnO2qVatq+fLlF3WENy5ecnKyYmJirNurV69W06ZNfdL3pbVr6c/BA3JtEzt1ptbv3OWTxwMAABcU2C2R0oVT9CxatEi9e/fWjh07JF0IiHv27NHevXud2mYGx8ygWbduXX333XcESAAAAD8osEdnZ7r00ku1cuVKPfLIIwoPD7fuN8Y4XTKVKlVKDz74oFasWKHmzZsHomQAAIAir0BvicwUGRmpsWPH6rnnntOCBQu0aNEi7dmzR4cPH1ZGRoYqVKig2rVrq1OnToqLi1PZsmUDXTIAAECRVihCZKZSpUqpb9++6tu3b6BLAQAAKNYK7O7sjIwMr9t+++23Wr16tR+rAQAAQFYFLkTOmTNHvXr10j333OP1PA888ICaN2+u1q1ba/bs2X6sDgAAAFIBCpHHjh3T1Vdfrb59+2r+/PlasGCBV/Nt3LhRu3fvliStWLFCAwYM0LXXXqvjx4/7sVoAAIDirUCEyL1796pdu3aaNWuWdbT1xo0bdejQIY/z/vrrr5JknUfSGKMZM2aoU6dOOnLkiL9LBwAAKJYCHiJTU1PVv39/bdy40bovODhY/fr1U2pqqsf5hw0bph9//FE333yzQkJCrCC5Zs0aDRgwwNZ/KwEAAOCdgIfIsWPHasWKFVb469atm1avXq0ZM2aoZs2aHucPCwtT79699emnn2r16tVq3769Ne23337TG2+84cfqAQAAiqeAhsiTJ09qzJgx1igzd9xxh37++WddeumleeqvUaNG+vXXX9W7d29JF3Zxv/jii15t0QQAAID3Ahoip06dqtOnT0uSYmNj9d5771mBMq9CQ0P15ZdfWsMdHjt2TNOmTbvoWgEAAPBfAQ2R8+fPt66PGjXqogNkpnLlyumJJ56wbs+bN88n/QIAAOCCgIbIlStXSpIiIiJ05ZVX+rTvm2++WcHBwZKkpKQkn/YNAABQ3AU0RB44cEAOh0NNmjTx2VbITGXLllXjxo1ljNH+/ft92jcAAEBxF/ADa6QLu5/9oUqVKpKkEydO+KV/AACA4iqgIbJ06dKSpFOnTvml/7S0NElSyZIl/dI/AABAcRXQEFmtWjUZY7R9+3a/9L9161ZJUvny5f3SPwAAQHEV0BB52WWXSZL27dunLVu2+LTvrVu3ateuXXI4HLrkkkt82jcAAEBxF9AQ2b17d+v6J5984tO+P/roI+t6bGysT/sGAAAo7gIaIgcOHKgSJUrIGKPXX39du3fv9km/e/bs0dtvv23dHjBggE/6ReF06NAhXVq7ltvL5W3aBLpEAAAKnZBAPnjVqlU1ePBgffbZZ0pJSdG1116rX375RaVKlcpzn6dPn9YNN9ygEydOyOFwqGnTpurYsaMPq4Y7CQkJSkhIcLk/JSUl/4vJwmRk6M/B7n9IxE6dmY/VAABQNAQ0RErS6NGj9c033+jMmTNKSkpSly5d9MUXX+Tpf4wbNmzQzTffrBUrVlj3vfzyy74sF7nYtm2bEhMTA10GAADIBwEPkXXr1tVrr72mu+++Ww6HQytXrlTz5s01dOhQ3XzzzerYsaNCQ0Pdzn/69GktWLBAX331laZMmaK0tDQZY+RwODRixAj17ds3H59N8RYVFaW4uDiX+1NSUhg1CACAIibgIVKS7rrrLm3fvl2vvPKKHA6HUlNT9dFHH+mjjz5SWFiY2rRpo7p166py5cqKiIjQ4cOHdejQIe3evVt//vmnzp8/L0lWeJSkG2+80el/kfC/+Ph4xcfHu9yfnJysmJiY/C8IAAD4TYEIkZL00ksvqW7duho5cqTOnDkj6UIoPHv2rBYvXqzFixfnOJ8xRpKs8FiiRAmNHj1ajz76aP4UDgAAUAwF9Ojs7O666y4lJydr0KBBCgryXFpmgJSk8PBwDR8+XOvWrSNAAgAA+FmB2RKZKSoqSlOnTtWuXbs0ffp0zZ07V3///bf27Nnj1C40NFQNGjRQmzZt1L17dw0cOFBlypQJUNUAAADFS4ELkZlq1aqlBx98UA8++KAkKTU1VSdOnNC5c+cUERGh8uXLW7uwAQAAkL8KbIjMLiwsTJUrVw50GQAAAFAB+08kAAAACgdCJAAAAGwjRAIAAMA2QiQAAABsI0QCAADANkIkAAAAbCNEAgAAwDZCJAAAAGwjRAIAAMA2QiQAAABsI0QCAADANkIkAAAAbCNEAgAAwDZCJAAAAGwjRAIAAMA2QiQAAABsI0QCAADANkIkAAAAbCNEAgAAwLaQQBeAoiMhIUEJCQku96ekpOR/MQAAwK8IkfCZbdu2KTExMdBlAACAfECIhM9ERUUpLi7O5f6UlBQlJSUFoCIAAOAvhEj4THx8vOLj413uT05OVkxMTP4XBAAA/IYQCQAAAu7yNm10YN/eXNtUqVZdi5Yvz6eK4AkhEgAA+J2nkHj48GFtu+fWXPuInTrT12XhIhAiAQCA3x3Yt1d/Dh7gdnqdtyflYzXwBUIkAAC4KN7sij58+HA+VYP8QogEAAAXxdNWRsk3WxoPHTqkS2vXcjud/0zmL0IkAAAoFExGRq5hlf9M5i+GPQQAAIBtbIkEAABFgqfd3RK7vH2JEAkAAIoET7u7JXZ5+xIhEgAA5Mqbczyi+CFEAgCAXHGOR+SEA2sAAABgG1siAQAo5thdjbwgRAIAUMyxuxp5we5sAAAA2EaIBAAAgG2ESAAAANhGiAQAAIBthEgAAADYxtHZAACg2PA0vjZja3uPEAkAQBHm6RyQUvE6D6Sn8bUZW9t7hEgUe55+lUr8MgVQeHk6B6TEeSCRN4RIFHuefpVK/DIFACA7DqwBAACAbYRIAAAA2EaIBAAAgG2ESAAAANjGgTUAABRink7hU5xO34P8RYgEAKAQ83QKH07fA38hRMJnEhISlJCQ4HJ/SkpK/hcDAAD8ihAJn9m2bZsSExMDXQYAAMgHhEj4TFRUlOLi4lzuT0lJUVJSUgAqAgAA/kKIhM/Ex8crPj7e5f7k5GTFxMTkf0EAAMBvOMUPAAAAbCNEAgAAwDZCJAAAAGwjRAIAAMA2DqwBAKCA8jQajcSINAgcQiQAAAWUp9FoJEakQeCwOxsAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALZxdDYAAAHi6RQ+nL4HBRkhEgCAAPF0Ch9O35P/Dh06pEtr18q1TZVq1bVo+fJ8qqjgIkQCAAD8P5OR4fHcnLFTZ+ZTNQUb/4kEAACAbYRIAAAA2EaIBAAAgG2ESAAAANjGgTUAAPiBp9P3SJzCB4UbIRIAAD/wdPoeiVP4oHAjRAJe8HTeMM4ZBgAobgiRgBc8nTeMc4YBAIobDqwBAACAbWyJBAAgDxj3GsUdIRIAgDxg3GsUd+zOBgAAgG2ESAAAANhGiAQAAIBthEgAAADYRogEAACAbYRIAAAA2MYpfgAAyMbTOSAlzgMJECIBAMjG0zkgJc4DCbA7GwAAALYRIgEAAGAbIRIAAAC2ESIBAABgGwfWwGcSEhKU
kJDgcn9KSkr+FwMAufB09DVHXgOeESLhM9u2bVNiYmKgywAAjzwdfc2R14BnhEj4TFRUlOLi4lzuT0lJUVJSUgAqAgAA/kKIhM/Ex8crPj7e5f7k5GTFxMTkf0EAAMBvCJGADxw6dEiX1q6Va5sq1apr0fLl+VQRAAD+RYgEfMBkZHgc3SJ26sx8qgYAAP/jFD8AAACwjS2RAIAixdPpeyRO4QP4AiESAFCkeDp9j8QpfABfYHc2AAAAbCNEAgAAwDZCJAAAAGzjP5EAgEKFca+BgoEQCQAoVBj3GigYCJEAAAA2eBqlrLiMUEaIBAAAsMHTKGXFZYQyDqwBAACAbYRIAAAA2EaIBAAAgG38JxIAUKBwCh+gcCBEAgAKFE7hAxQO7M4GAACAbYRIAAAA2MbubCCfcHJaAEBRQogE8gknpwUAFCWESABAvvF05LXE0ddAYUGIBADkG09HXkscfQ0UFhxYAwAAANsIkQAAALCNEAkAAADb+E8kAMBnGLIQKD4IkQAAn2HIQqD4IEQCBYSnk5FLnJAcAFBwECKBAsLTycglTkgOACg4CJEAAK9wonAAWREiAQBe4UThALLiFD8AAACwjRAJAAAA2wiRAAAAsI0QCQAAANs4sAYAIInRZgDYQ4gEAEhitBkA9rA7GwAAALaxJRIoRDwNjciwiACA/EKIBAoRT0MjMiwi3GG0GQC+RogEgGKA0WYA+Br/iQQAAIBthEgAAADYxu5soAjhwJvii3M8AshvhEigCOHAm+KLczwCyG/szgYAAIBthEgAAADYRogEAACAbfwnEj6TkJCghIQEl/tTUlLyvxgAAOBXhEj4zLZt25SYmBjoMpALT0dvSxzBXRAx2gyAgogQCZ+JiopSXFycy/0pKSlKSkoKQEXIztPR2xJHcBdEjDYDoCAiRMJn4uPjFR8f73J/cnKyYmJi8r8gAADgN4RIAAgwThQOoDAiRAKAH3n7f8Zt99zqdjq7qgEURIRIAE48HXxz7PhxlS9XLtc+ODjnv/g/I4CiihAJwImng2/qvD2Jg3OyYFc0gOKKEAkAufAmJLIrGkBxRIgE4HOedokXpt3dnnZHExIBFFeESAA+52mXeN2JH130Sc+9OWDFF32wOxoAckaIBJDvvDnpuaeg6Wk3sq/6YEsjAOSMEAmgQPLmAJ/86AMAkLOgQBcAAACAwocQCQAAANsIkQAAALCNEAkAAADbCJEAAACwjRAJAAAA2wiRAAAAsI0QCQAAANsIkQAAALCNEAkAAADbCJEAAACwjRAJAAAA2wiRAAAAsI0QCQAAANsIkQAAALCNEAkAAADbCJEAAACwLSTQBaDoS01Ndbq9adMm3/V9/rzWHjqSa5v0DJNrG0/Ti1IfhaXOotRHYamzKPVRWOosSn0Uljrzq4/U8+eVnJycax951aBBA4WHh/ulb7scxhgT6CJQtM2YMUMDBw4MdBkAABR6q1evVtOmTQNdhiR2ZwMAACAPCJEAAACwjd3Z8Ltjx44pMTHRul27dm2FhYVdVJ+bNm1y2kX+3XffqWHDhhfVJ3yP5VQ4sJwKPpZR4ZAfy6kg/SeSA2vgd+XLl9fVV1/t18do2LBhgfmPCNxjORUOLKeCj2VUOBT15cTubAAAANhGiAQAAIBthEgAAADYRogEAACAbYRIAAAA2EaIBAAAgG2ESAAAANhGiAQAAIBthEgAAADYRogEAACAbYRIAAAA2MbY2SiUKleurOeee87pNgoellPhwHIq+FhGhUNxW04OY4wJdBEAAAAoXNidDQAAANsIkQAAALCNEAkAAADbCJEAAACwjRAJAAAA2wiRAAAAsI0QCQAAANsIkQAAALCNEAkAAADbCJEAAACwjRAJAAAA2wiRAAAAsI0QCQAAANsIkfCrM2fO6JVXXlGrVq1UunRplSlTRs2aNdMLL7ygY8eOXXT/X375pbp3764KFSooPDxcDRo00N13360NGzZ4Nf+yZct0ww03qEaNGgoNDVXNmjV17bXXav78+RddW2Hiz+V09OhRvfTSS+rQoYMqVKig0NBQVatWTf369dNXX30lY0yu8zds2FAOh8PjZejQoRdVZ2Hgr+W0a9cur15jh8OhSZMmue2H9cn3yyghIcHrZZP1sm3bNpe+WJfcW79+vcLDw1WrVi2f9FdsvpsM4Ce7d+82TZo0MZJyvERFRZnk5OQ89X3u3Dlz/fXXu+07IiLCfPXVV7n28eqrr5qgoCC3fTz66KN5qq2w8edyWrp0qalevbrbviWZK6+80pw4cSLH+U+ePGkcDkeu82debr755ot5GQo8fy6nWbNmefUaSzIffPBBjn2wPvlnGX388cdeL5usl127djn1w7rk3tmzZ03Hjh2NJFOzZs2L6qu4fTcRIuEX58+fN61btzaSjMPhMHfffbf55ZdfzIIFC8wDDzxgrSANGjQwx48ft93/gw8+aK1QPXv2NDNmzDBLly41r776qilfvryRZMLCwkxSUlKO83/77bfW/I0aNTIJCQnmt99+M5MnT3b6EpgwYcLFvhQFmj+X07Zt20xkZKSRZEJCQsyIESPMjz/+aJYtW2Y+//xz0759e+t17t+/f459LFmyxGrz+eefm5UrV7q9bN++3RcvSYHk7/XpxRdfNJJMyZIlc32NV65caQ4fPuwyP+uT/5bR4cOHPS6TlStXOn0m5hQyWJdydv78eXP11Vdbr83Fhsji9t1EiIRfvPPOO9ab/a233nKZ/sUXX1jTn332WVt9r1q1yvpAvu6660xGRobT9LVr11ora5cuXVzmP3v2rKldu7b1gX7kyBGn6adOnTKxsbFGkilXrpzL9KLEn8tp6NCh1hfqt99+6zI9LS3N6Rf7rFmz3NYXFBRkTp06ZevxixJ/LidjjLnhhhuMJNO2bVvb87I+XeDvZZSbVatWmfDwcCPJtGvXzqSlpbmtr7ivS1kdPHjQdO/e3Wkr38WEyOL43USIhF80btzYSDLR0dEmPT09xzb9+vUzkkxkZKQ5f/68133fddddRpIpUaKE2bFjR45tXn31VetDIfsvvqwf5l9++WWO8yclJVltXn31Va9rK2z8tZxOnTplSpYsaSSZ66+/3m27gwcPmtDQUCPJ3HjjjS7TM5d1o0aNvHtCRZQ/1ydjjLn00kuNJHPHHXfYro316QJ/LyN30tLSrC2gYWFhZv369Tm2Y11yNmvWLFOnTh3rfZkZ/i4mRBbH7yZCJHxu7dq11pt89OjRbtt99dVXVru5c+d63X+1atWMJNO9e3e3bQ4cOGD1/eSTTzpNGzx4sPXflNTUVLd9NG3a1EgyHTt29Lq2wsSfy2nRokXWPJ988kmubTN/WTdt2tRlWuYu70GDBnn1uEWRv9enlJQU6wt04sSJtutjffL/MsrNW2+9ZfX53HPPuW3HuvRfmVveM/eUPP300yYuLu6iQ2Rx/G7i6Gz43NKlS63rXbp0cduuc+fO1vXExESv+t6yZYv27dvnse/KlSvr0ksvzbHvzPratWun0NBQj/X98ccfOnPmjFf1FSb+XE5Vq1b
Vc889p9tuu03NmzfPtW1GRoYkKTU11eX+VatWSZLHPooyfy4nSVq9erW1DPLyOrM++X8ZuXP06FE999xzkqRatWrpsccey7Ed65KzZcuWSZKioqL0888/a/To0RfdZ3H9biJEwufWrl1rXW/YsKHbdtWqVVPJkiVd5vFF35JUr149l3lSUlK0c+dOW/OnpaVp06ZNXtVXmPhzOTVq1EjPP/+8Jk2alOuX1pEjR7R69WpJUu3atZ2mbdq0SSkpKVZ/r7/+ujp06KCyZcsqIiJCTZs21ZNPPqnDhw97VVNh5c/lJEl//fWXJMnhcCgyMlL//ve/1bRpU4WHh6t8+fLq1KmT3nnnHZ0/f95lXtanC/y9jNwZN26cjh49KkkaNWqU1Xd2rEvOatSoobFjx2rdunXq3r27T/osrt9NIQF9dBRJe/bskSSFhISoevXqubatUaOGNm/ebM3jbd+SVKdOnVzb1qxZU9KFoJKamqqwsLA8zZ/5uJdddplXNRYW/lxO3nr99detcNKrVy+naX///bd1/fbbb9fJkyedpq9Zs0Zr1qzR+++/r++++85pK09R4u/llPk6BwcHq3Xr1k5bNlJTU7V06VItXbpUkyZN0qxZs1SjRg2X2qTivT4FYl06ceKEJkyYIOnCD7D/+Z//cduWdcnZ4sWLFRTk221oxfW7iS2R8LnMX8alSpWSw+HItW2pUqUkScePH7fVtySVLl3aq76z9n+x8xcl/lxO3lixYoXGjRtn9X/bbbc5Tc/6xXfy5Eldf/31+uabb/T777/rq6++Uv/+/SVJhw8f1pVXXumTLTsFkb+XU+brnJaWJofDoYcfflg//fSTfvvtN33wwQfWF9TKlSvVu3dvnT592qU2qXivT4FYlxISEqww+MADDygkxP02IdYlZ74OkFLx/W4iRMLnMv/bFh4e7rFt5u6X7P+H89S3N/1n3bWTOd/Fzl+U+HM5ebJz504NHDjQ6m/UqFGqXLmyU5vML76goCBNmTJFX331la655hq1a9dO119/vWbOnKlXXnlF0oVdQXfccYdPaito/LmcjDH6559/JF34r9ayZcs0fvx49erVS+3bt9ftt9+upKQkDRgwQNKF/0++9NJLLrV5U19RXp/ye10yxmjixImSpDJlynh877Mu+V9x/W4iRMLnMn/lefpFLska8s7bX4ZZ23nq32QZTi9zvoudvyjx53LKze7du9WjRw/r/z9XXXWVRo4c6dJu8uTJSkpK0sKFC3XTTTfl2Ndjjz2mrl27SpKWLFmiFStWXHR9BY0/l5PD4dC6deu0aNEiJSYmKiYmxqVNaGioJk+erHLlykmSJk6cqPT0dJfHKc7rU36vS3PnzrWGzxs+fLi1bNxhXfK/4vrdVLTWZBQImZviz54967FtZpuwsDBbfXvTf9bpmf1f7PxFiT+XkztbtmzR5Zdfbn0Btm3bVlOnTs3xQzMyMlKxsbHq1KlTrn0OHz7cuv7LL79cVH0Fkb+XU40aNdS5c2dFR0e7bVOuXDldd911kqRjx45p5cqVTrV5U19RXp/ye12aNm2add2bca5Zl/yvuH43ESLhc2XKlJEkp/9OuZN5xGBkZKStvrPO66lvh8Oh8uXL53l+O/UVJv5cTjn5888/1aFDB23dulWS1Lp1a/30008e///jSbNmzazrO3bsuKi+CqL8Xk7u5PQ6sz5dkJ/LyBij2bNnS7pwAEbr1q3z1E9Oivq65E/F9buJEAmfq1u3riTp3LlzOnjwYK5tM49Iy3rEpzd9Sxd2i+Ymc3rlypWtP53XqlVLwcHBtua3U19h4s/llN3cuXPVtWtXHThwQNKF85zNmzfP+gC9GBEREdb1c+fOXXR/BU1+Lqfc5PQ6sz5dkJ/L6I8//tD+/fslSddcc02e+nCnqK9L/lRcv5sIkfC5rLvFNm/e7Lbd3r17rdOJ5LYrLS99Sxd2nWafJzQ0VPXr17c1f4kSJTyet6sw8udyymrGjBnq37+/Tp06JenCfyDnzp2b6/+4Tp48qTlz5mjy5MnWiYHdyfqlXaVKFdv1FXT+XE47d+7UzJkzNWnSJO3atSvXtjm9zqxPF+TXuiRJP/zwg3X92muv9diedSl/FNfvJkIkfK5t27bW9SVLlrhtt3jxYut6x44dveq7Ro0aqlWrlse+Dx48aP3vLnvfmfX9/vvv1kgdudXXunVrlShRwqv6ChN/LqdMP//8s2644QZrq8awYcM0Y8YMtydFznTo0CH16dNHw4YN0xtvvJFr26yjhcTGxtqqrzDw53KaN2+err76at1xxx3WLlJ3Ml9nh8OhVq1audRXnNen/FiXsvcRGhqq9u3be2zPupQ/iu13U2BGW0RRd9lllxlJplmzZiYjIyPHNn379jWSTGRkpDl79qzXfd9///1GkgkLCzO7d+/Osc24ceOs8UmXLVvmNG369OnWtOnTp+c4//Lly602Y8aM8bq2wsafy2n79u2mXLly1ut43333uX2M7DIyMkzt2rWNJFO2bFlz7NixHNulpqaaRo0aGUmmfPnyJiUlxev6ChN/LaeNGzday6dLly5u223YsMEEBwcbSaZXr15O01ifLvDnupRV2bJljSTTtm1br9qzLnnHF2NnF8fvJkIk/OKdd97J9Y3+5ZdfWtMfe+wxW32vWrXK+kLr37+/SUtLc5q+bt06U758eSPJtGvXzmX+M2fOmDp16hhJJioqyuzbt89p+qlTp0zr1q2NJFO6dGlz4MABW/UVJv5cTt26dbPmvemmm2zXNnr0aGv+//mf/3H5Ys7IyDC33Xab1Wb06NG2H6Ow8Ody6t69uzXvpEmTXKYfPXrUtGrVykgyDofDLFy40Gk669MF/lxGmbZv3271ce+993o9H+uSZ74IkcXxu4kQCb9IS0szLVu2dPrg+vnnn01iYqJ56KGHrBUtKirK5Zfx1q1brfni4uJy7D/zF58k07FjRzN9+nSzdOlSM378eBMZGWkkmRIlSpjly5fnOP/XX39tzV+7dm3zn//8x/z222/ms88+M02bNrWmvfnmm75+aQoUfy2n+fPnW9MiIyPNwoULzcqVKz1esjp9+rSJiYmx+unatav5+uuvzbJly8wXX3xhOnXqZE3r1KmTSU1N9fOrFTj+XJ+Sk5NNmTJlrJB42223mblz55rffvvNTJgwwfpSk2QeffTRHOtjffL/Z54xxsybN89qN378eK9rY13yzJsQyXeTK0Ik/Gb37t2mcePG1ps++6VmzZpm9erVLvN5s6KeO3fOXHfddW77DgsLM19++WWu9Y0bN84EBQW57eOhhx7yxctQ4PljOQ0ZMsRtf7ldstu1a5eJjY3NdZ4ePXq43UVXlPhzfUpMTDSVK1d227fD4TCPPPJIrn9HYH3y7zIyxphPPvnEajdt2jRbtbEu5c5XIbK4fTcRIuFXZ86cMWPGjDGtWrUyZcqUMWFhYSY6Oto88cQT5vDhwznO4+0HqjEXdhH16NHDVKxY0ZQoUcLUqlXLxMfHm+TkZK/qW7ZsmbnppptMzZo1TYkSJUzFih
XNVVddZX788Ue7T7VQ8/VyyvqL+WJCpDEXPpQnTZpkunfvbi3n6tWrm759+5qpU6f6+qUo0Py5Ph06dMiMGjXKxMbGWn1HRUWZ+Ph48/vvv3tVH+uTf5fRhAkTrHZLly61XRvrknu+CpGZist3k8OYLOPnAAAAAF7gFD8AAACwjRAJAAAA2wiRAAAAsI0QCQAAANsIkQAAALCNEAkAAADbCJEAAACwjRAJAAAA2wiRAAAAsI0QCQAAANsIkQAAALCNEAkAAADbCJEAAACwjRAJAAAA2wiRAAAAsI0QCQAAANsIkQAAALCNEAkAAADbCJEAAACwjRAJAAAA2wiRAAAAsI0QCQAAANsIkYAbGzZs0Ouvv64rr7xSl156qSIjIxUeHq6aNWuqRYsWGjp0qD799FMdPHgw0KXCz06cOKFp06YpPj5eLVq0UPXq1RUWFqZKlSopOjpavXr10pgxY/T333973afD4bAuCxYs8F/xefD8889btXXt2tVlekJCgjU9Kioq3+vzVteuXa06n3/+ebftjDH6+OOPtXLlyvwrLg/OnDmj77//Xvfcc49iY2NVq1YthYeHKzIyUpdccom6du2q559/Xr/99puMMYEuF8WBAeBk8+bNpm/fvkaSV5fQ0FAzYsQIc+TIkUCXDh87d+6cGTVqlClTpozX74e2bduapUuXeuw76zy//vqr/5+MDc8995xVW1xcnMv0jz/+2Jpet27dfK/PW3FxcVadzz33XI5tVq9ebTp37lwgl0OmjIwMM2HCBFO5cmWv34fR0dHm+++/D3TpKOJC/BdPgcJn8eLF6tu3r06cOGHdFxwcrPr166tq1aoKCwvTyZMntXHjRh09elSSdO7cOb333nv68ccf9dNPP+nSSy8NVPnwoVOnTqlfv35KTEx0ur9y5cqqX7++SpcurdTUVO3Zs0dbt261tvz88ccf6ty5s95++23dc889gSgdNrRo0UJpaWmBLsOt9PR0DRkyRNOmTXO6v3z58mrUqJHKli2rc+fO6eDBg9qwYYMyMjIkSWvXrlX//v312GOP6ZVXXglE6SgG2J0N/L89e/Zo4MCBVoCsUqWK/vOf/+jw4cPasGGDFi1apHnz5mnZsmU6cuSIli9fruuuu86af/v27erTp4+OHz8eqKcAH7rzzjutAOlwODRixAitXbtWBw4c0O+//6558+Zp0aJF2rx5s/bt26eXX35ZpUuXliRlZGTovvvu08yZMwP5FOCFghwgJemZZ55xCpCDBg1SUlKSjh49qj/++EPz5s3TwoULtXbtWh09elTvvvuuKleubLUfM2aM3n777UCUjuIg0JtCgYLijjvusHYFVa9e3Wzfvt2r+UaPHu20G+nRRx/1c6XwtyVLljgt0/fff9+r+f7++28TGRlpzVezZk2Tmprq52rzX2HZne2NrMu5oO3O3rp1qylRooRV35NPPunVfDt37jR169a15itVqpTZv3+/n6tFccSWSEAXdklPmTLFuj169GjVqVPHq3mffvppde7c2bo9adIkpaen+7xG5J+PP/7Yut65c2fdcccdXs3XrFkzjR071rq9e/duzZo1y+f1oXj4/PPPdf78eUlS3bp1NXr0aK/mq1WrliZNmmTdTklJ0WeffeaXGlG8ESIBXfj/UEpKinW7ffv2tua/++67retHjx7VmjVrfFYb8l9SUpJ13e57YejQoSpTpox1e8mSJT6rC8VL1vdhmzZtFBTk/Vd2jx491KhRI+s270P4AyESkHT27Fmn2/v377c1f+fOndWoUSN16NBB/fr18+rDfvv27XrhhRfUqVMn65QxlStXVuvWrfX4449r7dq1tmpITU3VBx98oF69eqlq1aoKDQ1VtWrV1KNHD3344YfWf7+uvPJK67QnCQkJufZ54sQJjR8/XnFxcapYsaJCQ0NVq1YtDRgwQF9//bXVrnHjxrZOVzN37lzdeeedio6Otk6dVKdOHV111VV65513nAK9O5mPV6lSJUnSrl27FB8frypVqigiIkL169fXTTfd5PRF7K2s7we774Xw8HBdddVVatmypXr27Kl69erlWr+71yzr6WnWrVsn6cKWzVGjRqlly5aqUKGCSpcurejoaN15551atWqVSx8LFizQkCFDVL9+fYWHh6tChQq6/PLL9fbbb7u857PydIofu9auXav//d//tV6PsmXLKiwsTFWqVNFll12mu+66Sz/++GOufSxYsMCq6frrr5ckLV26VL169VLZsmVVrlw5xcTE6N5777VOu+XuFD9ZT1GU1RVXXGHdHx8fL0m67LLLrPv69u3r9XO+5pprrPm83ZKd3cW8DyWpf//+at68ubp166amTZt6bJ+WlqYvv/xSgwYNUr169RQeHq6IiAhFRUXpuuuu09SpU73+D+nBgwc1ZswY9ejRw/p8i4yMVHR0tO644w7NmTPHYx9Zl9MjjzwiSZo9e7Y6deqkUqVKqUKFCmrVqpUeeeQRpaamusyfkZGh6dOn65ZbbrEOQoqIiFC9evV03XXXafLkydaWXuRRoPenAwXBvn37nP4b1bNnT5OWluaXx0pLSzOPP/64CQsLy/UUHcHBwebuu+82Z8+e9djnsmXLTP369XPtr1WrVmbz5s2md+/e1n0ff/yx2z5nz55tqlSpkmuf3bt3N4cOHTKXXnqpV/8r27x5s+nSpYvH05NUq1bNTJ8+PdfnnNm2YsWKZvfu3aZ27do59vXnn396fP2yu/LKK635IyIizLp162z34UnWGnN6zbKenmbt2rUmISEh11MNhYSEmEmTJhljjDl9+rS59dZbc32NY2NjzaFDh3KszVen+Dl48KC57rrrjMPh8Oq0NO3btze7d+/Osa9ff/3VanfdddeZefPmmdDQUJc+KlasaM6dO+fyGmY9xU/W+t1dhg0bZowxZty4cU6v8YEDB9w+30yHDx92qm3JkiUe58nJiBEjrD4cDodZsGBBnvrxxty5c029evU8vi7NmjUzycnJbvtJT083L774ogkPD/dqeee2bmVdTv/617/MJ598kuN76bLLLnOZNykpyTRr1sxjDQ0bNjSJiYk+eQ2LI0Ik8P+aN2/u9OHSuXNnM3/+fJORkeGzxzh79qxTQMm8NG7c2FxxxRWmVatWTn+kz6zj1KlTbvtcvHixKVmypNM8FSpUMJ07dzZt27Z1Cqt16tQxl112mccQ+dVXX5mgoCCXYBcXF2datmxpgoODrftbtGhhatWq5TFELl++3OU8dxEREaZdu3ama9euLl9gDofDvP76626fd9bn2rVr1xy/IKKjo20snf8aP368SzB5++23zYkTJ/LUX271exMi//3vfzu9Lpnvl+yvWVBQkElKSjIDBgyw7qtUqZK5/PLLTevWrU1ISIhT+2uuuSbH2nwRIvfu3WsaNWrk9HilSpUysbGxpkePHqZTp045/khp1qyZOX/+vEt/WUNkly5d3J4z8e67787xNcwaIufOnWt69+7t9INKkmnTpo11/6uvvmqMufADM+vr9vbbb7tfqP/vnXfesdpfcsklH
tu7880337isL6NHj/YqyNrxySefuKzv5cqVM+3btzft2rUzZcuWdZpWqVIls379epd+0tLSzDXXXOOyTOrWrWvi4uJM69atXcJl+fLlzcKFC3OsK+v77KqrrnIbTMeMGeM03+zZs01ERITL8+nYsaPp0qWLqVmzptO00NBQM23aNJ++psUFIRL4f1OnTs3xA6pmzZpmxIgRZurUqWbv3r0X9Rh33XWXU99DhgwxW7dudWpz9OhR89RTTzkFtZtvvjnH/o4cOWJq1Kjh9CXz7rvvWltijDHm+PHj5tFHH83xF3xOIXLz5s2mVKlSTl8Y06ZNM+np6Vabffv2mWHDhuX4euUUiA4ePOgUNEuXLm0mTpxozpw549Ru5cqVTlsqg4KCzE8//ZTjc8/+uBEREWbixInm0KFD5sCBA2bq1KlmypQpbpZE7k6cOGGqVq3q8hjh4eGmX79+5q233jJ//fWX02til50QmXnp27evy5f3lClTnN4rmV/45cqVM59//rlTjTt27DAdOnRw6jOnLUG+CJE33nij1SYsLMy8++67OW5VX7hwodMPG0nmq6++cmmXNURmXurXr29mzZplTp48abZv325ee+01py3P3pxs3NNyMMaYfv36WW3at2+fY5us2rdvb7V/+eWXPbZ3Jz093TRt2tTleYeEhJhu3bqZsWPHmmXLluUYur21YsUKp62mlSpVMpMnT3b6DElNTTWvvvqqU5hu27atS1/33XefU52dOnUySUlJTm1OnjxpXn75ZZfH3LFjh0t/OW0xbtWqlUlMTDQpKSlm48aN5tlnnzW7du2y5tmwYYPTFvuqVauaKVOmuOxZSkxMdHrflSxZ0vz99995fh2LK0IkkMVjjz2WYzDKemnYsKG59dZbzaeffmr27dvndd+LFi1y6uepp57Ktf0XX3zh1H7evHkubZ544glrenBwsPnll1/c9jdx4kSvQuRNN91kTS9TpoxZvXq12z4fffRRr0Lkbbfd5tTnihUr3PZ57tw5079/f6t9VFRUjn8tyP64nnZ/27V8+XKnMJ3TpXz58qZPnz5m7NixuT6nnNgNkf3793f7F4usr68kU6JECfP777/n2Hb37t1OW6cnTpzo0uZiQ+TWrVud6nnnnXdyfS327t3rtOXo9ttvd2mTPUSWLl3a42m4fBUiv/76a6d2mzZtcvuYGzdudFonswacvNiyZYupVq1aru/DUqVKmW7duplRo0aZJUuW2PorTvfu3Z3ezzltYcz07rvvOj3u3LlzrWmrVq1y2prZv39/pyCa3c8//+wUSq+77jqXNtlDZO3atc3Ro0e9fj41atTI9T1y4sQJ07ZtW6t9586dc+0brgiRQDYff/yx07n+crs4HA7TsWNH8+GHH+b6gWmMcdrFGBsb69Vu8uuvv96ap0+fPk7T0tPTnbaWjRw50mN/2XelZw+Rx44dc9qd/tZbb+Xa3/nz5122ImX/It67d69TaPHUpzHGHDhwwGkX/dSpU13aZH3MJk2aeOwzL9auXeuy5S63S82aNc0jjzxidu7c6bFvOyEyKCjIbN682W1fs2bNcupv6NChuT52p06drLYPP/ywy/SLDZEffPCBFSiqVavm1Zaynj17Wn1eeeWVLtOzh8h77rnHY5++CpGpqammYsWKVrtRo0a5fcxnn33W7TqbV7t27XLaGurpUqlSJTNixAizdu3aXPtdv36903zvvfeex1qy/kVhxIgR1v1Dhw617q9YsaI5fPiwx76efvppp/d49gCbPUSOHTs21/5WrFjh1H7mzJkea1i1apXTXpply5Z5nAf/xdHZQDbx8fHasGGDXnjhBV1yySW5tjXGaOnSpbrtttvUtGlTt6fROHPmjNPRp/fcc4/LkaE5ueWWW6zr8+fPdzpac9myZU5HbN57770e+3v44Ydznf7DDz9YRytGRETo1ltvzbV9SEiI7rvvvlzbzJo1yzpyMjw83GOf0oWhBa+88krr9uzZs3Nt74sjiHPSuHFjLV68WF999ZV69+6tkJDcR4rdvXu3Xn31VTVq1EgvvviiNQTdxWrVqpXq16/vdnr2af3798+1v5o1a1rX/THC0u23367Tp09r7dq1+uGHHzy+bpJUrVo16/qZM2c8tvfXMs9JaGiobrrpJuv2559/7rZt1vMxevNe90bNmjX1/fffa+7cubr22msVHh6ea/tDhw7pvffeU0xMjB566CG3R+J///331vXSpUs7fd64M378eE2YMEFz587VM888I+nC5+BPP/1ktYmPj1eFChU89vXggw8qNDRU0oUjqS92PZ8+fbp1vXbt2urXr5/HGmJiYtSiRQvrtqca4IwQCeSgUqVKeuaZZ7R+/XqtWbNGb775pgYOHKiKFSu6nWfjxo3q1q2bvv32W5dpf/zxh9OpJNq1a+dVHbGxsdb1c+fOOZ2uZt68edb1qKioXENGpri4OJUoUcLt9Kx9tm3b1hrGLzc9evTIdXrWYB0TE+NVn5Lzc/d0jrusXwK+FhQUpOuvv15z5szRwYMHNW3aNI0YMULR0dFu5zl79qyefvppDRkyxCdB0tPpWUqWLOl029P47VlDiL9OjB8WFqbGjRurZcuWubbbvn27Pv/8c/3555+2avLnMs9J1kC4YcMGLV++3KXN4sWLtWXLFklSZGSkBgwY4NMaevbsqenTp+vw4cOaOXOmHnroIbVs2dLtKcXS09P15ptvqnfv3jkG8z/++MO6Hhsbq4iICI819OvXT/fee6969uypGjVqSJLWrVtnnVZJknr16uXV86lUqZJatWpl3V68eLHbtg6HQ82bN8+1v6yfE+3atfPqh7pk77MGzjz/PASKuejoaEVHR+uBBx6QMUbJycn69ddf9fPPP2vevHlOH87nzp3T0KFD9c8//6hBgwbW/Zs2bXLqMyYmJk+17N6927q+c+dO67qn0JApLCxM9evX1/r163Ocnpc+M89B6G5rR9bnnpSU5PUHe1ZZn3dOso4V7E/ly5fXoEGDNGjQIEkXzoWXmJio+fPn64cfftCOHTuc2k+dOlUtWrTQ448/flGPW7VqVVvty5Ytm+v0vCyDi5GWlqbk5GStXbtWmzdv1pYtW7Rx40YlJyfryJEjLu2NMR77zK9lnqlVq1Zq1qyZ/vnnH0kXtji2adPGqc2nn35qXR8yZIjCwsL8UktERIT69+9vbXE+fvy4Fi1apPnz5+vHH390Wb8XLlyohx9+WO+9957T/ZmBV/J+fc/Jnj17nG43adLE63ljYmL0+++/59hPVmXLlrW2WrqT9bPm66+/9stnDZyxJRKwweFwKCYmRvfff79mzpypffv26X//939VqlQpq83p06f14osvOs139OhRnzx+1i/cffv2WdfLlSvndR+RkZFup+W1z/Lly7ud5ovnfvbs2Vx3cXoKTf5SuXJlXX/99Xr33Xe1fft2ff/99y5bS1566SWdOnXqoh7H0+7L7PI7JLqzfft2jRgxQpUrV1aLFi1000036emnn9ZHH32kRYsWOb2fvdnlnVUglvmwYcOs61On
TnXaYpqamqpp06ZZt321K9sb5cqVU79+/fT6669r3bp1WrRokeLi4pzafPDBB9q6davTfVn/ymBnfc/u8OHDTrdz+4zJLmvb7P1k5c3y9sVnTU4/auAeWyKBi1C2bFk99dRTuvLKKxUXF2eNtPLdd9/pww8/tL7Ms+7KLlGihLp165anx6tevbp1PesIDXZ2mea2lccffWZ97lFRUXne4pFbPXaGg/Onfv36qVu3burfv79++eUXSdLJkyc1f/58XX311Xnut6CEQju++eYbDR06NMfwHxwcrHr16qlFixZq3769evbsqfHjx+uTTz7xuv9ALPOhQ4fqscceU1pamvbv36958+apd+/eki78v/DYsWOSLoxyk3UXaX7r3Lmz5s+fr2HDhln/38zIyNDMmTP14IMPWu189b66mL9sZB0BJzg42G07b5Z31s+a6Oho1alTx3Y93v7dBhcQIlHsbdiwQSNHjtT+/ft14MABfffddx7/x5VdbGys7r33Xo0dO1bShV/Ehw4dsna5Zf21nZGRoVmzZtne8pJd1j4zv7y8ceLEiYD1GRcX53GoxUC777779M8//2j//v0aOHCgxowZY2v+iIgITZgwwWmX3oYNG3xdZoG2cuVK3XTTTTp37pyk/x6Y0qNHDzVv3lyXXHKJy65eb4fTC6QqVaroqquu0syZMyVJX3zxhRUip0yZYrXzxVbI0aNH65dfftH+/fvVsmXLXA/myUlwcLDeeustTZ8+3fqrSfb3YdZ182IOsMp+EM3Ro0ed9s7kJuvWw6xjzudFZGSkdbDhDTfc4DTUJfyDEIliLz093emIvN9++812iJSk9u3bO93O+us869Gw6enpWr9+vVdj2eamYcOG1nV3/3HMLi0tzel/UDn1uWzZMlt97tixI9ddzVmfe3Jysld9BtLy5cutAw5+/fXXPPURHR2tsmXLWuHaV0dpFxZPP/20FSDLly+vhQsX6rLLLst1nsKyGzE+Pt4KkbNmzVJ6errOnz9vHZ1cokQJ3XzzzRf9OKtXr7bGVD906FCe+qhQoYIuueQS63+c2d+HWdfNjRs3etXnnj179NNPP6l+/fqqV6+e6tSpo9q1azu1SU5OVq1atbzqb/Xq1db1qKgor+Zxp2bNmlaILAyfNUVBwdgHBATQJZdc4vQL2M4utayy/6LO+sf/jh07Ou06mjVrlld9btq0ScOHD9cLL7ygyZMnOx0B2bFjR+v6zp07tX37do/9/f777067rLPL2ueKFSt0+vRpj30uXLgw1+nZ+9y7d6/HPiXpww8/1AMPPKDx48c7nYrE31q3bm1dT0pKytOX0blz55xeO2+OnC8q0tLSNHfuXOv2iBEjPAZIY4z+/vtv63ZBDt39+vVTpUqVJF34D9+iRYs0b948a3n37dtXVapUuejHyfo+PHjwoNMpwuzI+rmU/X2Y9YdvUlKS24PjspozZ46GDx+url27qmXLljLGqHHjxk5nrsi6/HNz4MABp+We9UjtvMj6WZP9lGi5GTNmjEaOHKm33norzz8ciytCJIq94OBgXX/99dbtP/74w/auI8n53HG9e/d2+g9PxYoVnU7r8/bbb1v/n8zN2LFj9fHHH+u5557T8OHDnb5cu3Xr5vTB/f7773vs79133811+jXXXGPtZk9JSfHqdch+xGd2ffv2tQJ0RkaGtcs/N8ePH9fjjz+ut99+W//617+8em6+MnjwYOu6MUaPPPKI7dPgTJ061do9GxYWpu7du/u0xoLs4MGDTrums/6P152vv/7a6cjc/Ny1bfd/gdm3NM6YMcPaMin57oCaQYMGOX2GPPnkk159ZmS1ePFipzMuXHXVVU7Ts56L9eTJk5o6darHPr/88kvr+hVXXCGHwyGHw6E+ffpY93/yySdebVl+6623nP7HmL0+u7KeF/Lo0aP6z3/+43GerVu36tlnn9Xrr7+uBx98UN98881F1VDsBOw050ABsmnTJqdRVUqUKGEmT57s9fyjR492GsUmp1EPsg+ddsMNN+Q6ksecOXOchhG74YYbXNo8/vjj1vSwsDCzfPlyt/198803LiNb5DTsYdYxjytWrGi2bNnits/x48e79JnTqB9ZR9sICgoy3377rds+MzIynEa/kGR++OEHl3aeHvNiZB06TZIZPHiwOXbsmFfzrly50lSqVMnjyCqe6vdmtJVM2YcZzD4ee3ZZxz0fNmyYy/SLGbHmzJkzTu/bq666KtdaVq1a5TQajCTTvHlzl3bZR6zxhjevYdaRkX788Uev+v3rr7+seaKiokz16tWNJFOlSpWLGsc6u+zDWXbt2tXs2bPHq3m3b99uGjRo4HE5XHHFFVabqlWr5jiGdaYZM2Y41TNnzhxr2sqVK52meRr28JdffnEaHatr164ubbwZoz277ONhuxv+05gLQ6xmff5BQUFm1apVXj0OLiBEAv/vvffecwlEl19+uZk6dao5fvy4S/vTp0+bH3/80XTt2tVpnn/961859p+RkeEydFlcXJz5888/ndqdOnXKjBkzxoSHh1vtSpcuneN4vSdPnjS1atWy2pUrV84kJCQ4fZGdPn3ajBkzxukDO/OSkJDg0ufOnTtN6dKlrTY1a9Y0M2bMcBqm8dixY+aRRx5x6U+SWbBggUufGzduNOXKlbPaBAcHm6eeesocOXLEqd2aNWvM1Vdf7dSfu6Hj/Bkid+zYYapUqeL0GJUrVzavvPKK2bBhQ47zrF692owcOdJpuTVo0MDtWL9FNUQaY3JcJ7KvQ/v27TMvv/yyU4jL+rpl568QmXX9uf/++73q1xhjmjdv7lK3N0OP2nH8+HHTuHFjp8coXbq0efrpp81ff/2V4zybNm0yL7zwgtP6VqFCBbNt27Yc2//1119O79natWubmTNnmvT0dKvNuXPnzDvvvOM0vnnv3r1d+rr//vudau3cubNJSkpyanPq1CkzduxYp8eMiIgwycnJLv3lJUQuWbLE6bMuIiLCjB8/3qSkpDi1W7ZsmencubNTvXfffbdXj4H/IkQCWbz11lsmJCTE5cuhRIkSpkGDBqZTp04mLi7ONG3a1GnLZeblzjvvdPrwze7IkSMmNjbWZb66deuarl27mubNmzt9UEsyoaGhZsaMGW77XLp0qSlVqpTTPBUqVDCXX3656dChg9O0hg0bOrWbMmVKjn1+8803Jjg42KlttWrVTFxcnGnbtq3Tc8/e59KlS3Ps84cffnCpMyQkxLRs2dJ07drV1KtXz2kMW0mmadOm5tChQzn2588QacyFQBsVFZVjUK5evbpp1aqV6datm2nVqpWpXLmyS5tGjRrluhW3KIfIRYsWubx/IiIiTLNmzUxcXJxp1KiRy3pWp04d63qpUqVcxpb3V4js06ePU7/Nmzc33bp1M/fee2+ufb/xxhsuy9wfW7F2795tWrRokeP7sFKlSla9bdq0MTVr1nRpU7VqVZcgl91nn33msjyqVKliunTpYjp37uwUSCWZSy65xOzdu9eln3Pnzpm+ffu61BAVFWW6du1q2rZt6xQeM98XOe1
pMCZvIdIYYz788EOX5xMeHm7atGljunTpYmrXru1SY9euXc3Zs2e9fgxcQIgEsklKSnLaxeHNpX79+mbatGle9X/69Glz5513unzJ5nRp2LChWbhwocc+Fy1aZOrVq5drXx06dHAJG7ntVv72229zDEdZL/379zfLly93um/lypVu+1y5cqVp2bKlV6/p4MGDXbZUZuXvEGmMMSdOnDAjR450Cfa5XcLCwsxDDz1kTp48mWvfRTlEGmPMlClTvHrdSpYsaV5++WWzatUqp/uzb6H3V4hcsmSJCQ0NdanLU2g5ePCg0xav1q1be1VTXqSmpprRo0ebyMhIr9+HwcHBZtiwYWb//v1ePcbPP//s9kdT1kvv3r3Nvn373PaTnp5unnnmmRy3MGe/tG3b1vz9999u+8priDTGmHnz5rn8wM3pEhQUZO677z4CZB4RIgE31q5da1555RUzcOBA06RJE1O+fHlTokQJExwcbCIjI02rVq3MiBEjzOzZs01aWprt/jdu3GieffZZ07lzZ1O9enUTGhpqIiIiTP369c2gQYPMlClTTGpqqtf9paSkmHfeecd0797d1KhRw4SGhprKlSubnj17msmTJ5v09HRz5MgRpw/QxYsX59rn4cOHzbhx40ynTp1M1apVTWhoqKlevboZOHCgtXV0xYoVTn3u2rUr1z4zMjLMrFmzzB133GGio6NNZGSkCQkJMeXLlzexsbHmgQcecAkQOcmPEJnp2LFj5tNPPzW33367adu2ralWrZoJDw83QUFBJiIiwjRs2NBcc801ZuLEiebgwYNe9VnUQ6Qxxmzbts08/vjjJjY21pQvX94EBwebMmXKmPr165v+/fubsWPHOgWSrF/62f9L6q8QaYwxv//+uxk4cKCpWrWqCQkJMWXKlDExMTHmzJkzufbfrVs3q/+JEyd6VdPFOH36tJk+fbq59957TadOnUytWrVMRESECQoKMuHh4aZu3brmqquuMuPGjTM7d+603X9qaqr56KOPzDXXXGPq1q1rSpYsaUqWLGkaNGhgbrnlFjNv3jyv+9q9e7d58cUXTVxcnPV5VLJkSdO4cWMzfPhwM3fuXI99XEyINMaY8+fPmy+++MIMHTrUNGrUyJQtW9aEhISYihUrmg4dOpjHH3/crF+/3na/+C+HMV4MUgqgSFi3bp2io6Ot29u3b8/TqA5ZzZkzxzoys0SJEjp9+vRFn0gdKOhSU1NVrVo1HTt2TCVLltSePXtyHf4TKIr4pAcKqePHj+upp55SVFSU6tatqwEDBriMApJd5gmRpQsnIs4eIDdv3qy3335bUVFRioqK0sCBAz3WkbXPyy67jACJYmHmzJnWqE6DBw8mQKJY4tMeKKRKlSql999/3zrP2pQpU3TTTTe5bb93716nczQOGDDApU1ISIjefPNN6/bSpUvVoUMHt32uXbvW6RyOOfUJFEUTJkywrt97770BrAQIHE42DhRSISEh1ri90oWRQSZPnuwySoMxRvPmzVPnzp2tEzqHhobqsccec+mzbt26iomJsW4PGjRIM2bMcDnZdlpamqZPn64uXbpYI3VUqFCBL1MUeRkZGXrmmWeskZq6devmNLoMUJzwn0igEFuzZo3atGnjNMRe6dKlVa9ePVWpUkVnzpzR+vXrdfjwYWt6UFCQEhISdMstt+TY5y+//KJevXo5BcfIyEjVrVtXFStW1MmTJ7Vu3TprXGhJCg8P1/fff68ePXr44VkCgfXwww9r6dKlioiI0Nq1a63xmcPCwrRixQo1adIkwBUCgUGIBAq55cuX65ZbbtH69es9tq1Vq5Y++OADp+HOcjJnzhzdfvvt2r17t8c+mzRpooSEBLVp08brmoHCZOzYsS5b7kNCQvTRRx+5/TEGFAeESKAISEtL08yZM/Xtt99qxYoV2rVrl1JSUhQaGqqaNWuqRYsWGjBggG644QaPB99kOnPmjL766ivNmjVLf/31l/bu3avTp08rIiJCtWrVUps2bXTNNddowIABCg4O9vMzBAJn8eLFuvPOO7VlyxaVLVtWbdu21RNPPKFOnToFujQgoAiRAAAAsI0DawAAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAthEiAQAAYBshEgAAALYRIgEAAGAbIRIAAAC2ESIBAABgGyESAAAAtv0fBe7DHrVOS4oAAAAASUVORK5CYII=", - "text/plain": [ - "
" + "cell_type": "markdown", + "id": "4e5df7f3-7f36-45b4-b7da-b301513efbce", + "metadata": { + "execution": { + "iopub.execute_input": "2024-09-11T22:56:41.967336Z", + "iopub.status.busy": "2024-09-11T22:56:41.966988Z" + }, + "id": "4e5df7f3-7f36-45b4-b7da-b301513efbce" + }, + "source": [ + "To create the dataset, you need to specify the path to the **transcripts** file and the **nuclei boundaries** file. These are typically downloaded from a spatial transcriptomics dataset like the [Xenium Human Pancreatic Dataset](https://www.10xgenomics.com/products/xenium-human-pancreatic-dataset-explorer).\n", + "\n", + "- **`--transcripts_path`**: Path to the transcripts file, which contains single-cell transcriptomic data.\n", + "- **`--boundaries_path`**: Path to the boundaries file, most often representing the nuclei boundaries in the imaging dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "edd35db3-56e4-4a3e-9309-f83133274d47", + "metadata": { + "execution": { + "iopub.execute_input": "2024-09-12T00:49:03.132088Z", + "iopub.status.busy": "2024-09-12T00:49:03.131930Z", + "iopub.status.idle": "2024-09-12T00:49:05.472827Z", + "shell.execute_reply": "2024-09-12T00:49:05.472394Z", + "shell.execute_reply.started": "2024-09-12T00:49:03.132072Z" + }, + "id": "edd35db3-56e4-4a3e-9309-f83133274d47" + }, + "outputs": [], + "source": [ + "# Paths to Xenium sample data and where to store Segger data\n", + "xenium_data_dir = Path('data_xenium')\n", + "segger_data_dir = Path('data_segger')\n", + "\n", + "# Setup Xenium sample to create dataset\n", + "xs = XeniumSample(verbose=False)\n", + "xs.set_file_paths(\n", + " transcripts_path=xenium_data_dir / 'transcripts.parquet',\n", + " boundaries_path=xenium_data_dir / 'nucleus_boundaries.parquet',\n", + ")\n", + "xs.set_metadata()" + ] + }, + { + "cell_type": "markdown", + "id": "33bd04f6-c4e3-42f8-81b2-c1e483d9faaf", + "metadata": { + "id": "33bd04f6-c4e3-42f8-81b2-c1e483d9faaf" + }, + "source": [ + "The following parameters are used to build a tiled Segger dataset:\n", + "\n", + "- **`--processed_dir`**: Directory where the processed dataset will be saved.\n", + "- **`--x_size`, `--y_size`**: These parameters specify the size of the tiles used to divide the image. The size of the tiles determines how the spatial region is partitioned for processing.\n", + "- **`--d_x`, `--d_y`**: These define the step size of the spatial grid used to bin transcripts and nuclei into tiles.\n", + "- **`--r_tx`**: Specifies the radius used for graph construction. A smaller radius will connect transcripts to nearby nuclei, while a larger radius might connect them to more distant neighbors.\n", + "- **`--scale_boundaries`**: The factor by which to scale the boundary polygons. Suggested to keep `=1` when boundaries refer to nuclei.\n", + "- **`--k_tx`**: Defines the number of nearest neighbors considered when building graphs for transcripts (`k_tx`).\n", + "- **`--val_prob` and `--test_prob`**: These control the proportion of the dataset that will be set aside for validation and testing. For instance, `--val_prob 0.1` means 10% of the data will be used for validation.\n", + "- **`--compute_labels`**: When set to `True`, this flag triggers the computation of labels (cell assignments) for each transcript. 
Use False if you just plan to perform prediction using a pre-existing model.\n", + "\n", + "Once the dataset is processed, the output will be ready for training the Segger model.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8cf7102-ad9c-4bd0-bbd7-61a1d73abccd", + "metadata": { + "execution": { + "iopub.execute_input": "2024-09-12T00:49:06.357990Z", + "iopub.status.busy": "2024-09-12T00:49:06.357793Z", + "iopub.status.idle": "2024-09-12T00:49:07.235307Z", + "shell.execute_reply": "2024-09-12T00:49:07.234925Z", + "shell.execute_reply.started": "2024-09-12T00:49:06.357975Z" + }, + "scrolled": true, + "id": "c8cf7102-ad9c-4bd0-bbd7-61a1d73abccd" + }, + "outputs": [], + "source": [ + "try:\n", + " xs.save_dataset_for_segger(\n", + " processed_dir=segger_data_dir,\n", + " r_tx=5,\n", + " k_tx=15,\n", + " x_size=120,\n", + " y_size=120,\n", + " d_x=100,\n", + " d_y=100,\n", + " margin_x=10,\n", + " margin_y=10,\n", + " scale_boundaries=1,\n", + " num_workers=4, # change to your number of CPUs\n", + " )\n", + "except AssertionError as err:\n", + " print(f'Dataset already exists at {segger_data_dir}')" + ] + }, + { + "cell_type": "markdown", + "id": "9d2b090b", + "metadata": { + "id": "9d2b090b" + }, + "source": [ + "### **1.2 Faster Dataset Creation with Segger**\n", + "\n", + "Segger introduces a faster, more efficient pipeline for processing spatial transcriptomics data. This method accelerates dataset creation, particularly for large datasets, by using **ND-tree-based spatial partitioning** and **parallel processing**. This results in a much faster preparation of the dataset, which is saved in PyTorch Geometric (PyG) format, similar to the previous method.\n", + "\n", + "**Note**: The previous dataset creation method will soon be deprecated in favor of this optimized pipeline.\n", + "\n", + "#### **Requirements for the Faster Pipeline**\n", + "The pipeline requires the following inputs:\n", + "\n", + "- **base_dir**: The directory containing the raw dataset.\n", + "- **data_dir**: The directory where the processed dataset (tiles in PyG format) will be saved.\n", + "\n", + "The core improvements in this method come from the use of **ND-tree partitioning**, which splits the data efficiently into spatial regions, and **parallel processing**, which speeds up the handling of these regions across multiple CPU cores. 
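As a rough, self-contained illustration of that pattern (and only an illustration; Segger's actual partitioning and parallelism live in `STSampleParquet`, shown below), the sketch splits a sample's bounding box into fixed-size tiles and dispatches them to a pool of CPU workers. The tile size, the placeholder `process_tile` body, and the worker count are assumptions for demonstration only.

```python
# Rough illustration (not Segger's code) of spatial tiling plus multiprocessing:
# split the sample's bounding box into tiles, then process tiles on CPU workers.
# Tile size, the placeholder process_tile body, and max_workers are assumptions.
from concurrent.futures import ProcessPoolExecutor
from itertools import product

def make_tiles(x_min, y_min, x_max, y_max, tile=120):
    """Return (x0, y0, x1, y1) bounds covering the box with square tiles."""
    xs = range(int(x_min), int(x_max), tile)
    ys = range(int(y_min), int(y_max), tile)
    return [(x, y, x + tile, y + tile) for x, y in product(xs, ys)]

def process_tile(bounds):
    # Placeholder for per-tile work: select the transcripts and boundaries that
    # fall inside `bounds`, build the tile's graph, and save it in PyG format.
    return bounds

if __name__ == '__main__':
    tiles = make_tiles(0, 0, 1200, 1200, tile=120)
    with ProcessPoolExecutor(max_workers=4) as pool:
        done = list(pool.map(process_tile, tiles))
    print(f'{len(done)} tiles processed')
```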
For example, using this pipeline, the Xenium Human Pancreatic Dataset can be processed in just a few minutes when running with 16 workers.\n", + "\n", + "#### **Running the Faster Dataset Creation Pipeline**\n", + "Below is an example of how to create a dataset using the faster Segger pipeline:" + ] + }, + { + "cell_type": "code", + "source": [ + "from segger.data.parquet.sample import STSampleParquet" + ], + "metadata": { + "id": "vlDtoWZb24FJ" + }, + "id": "vlDtoWZb24FJ", + "execution_count": 10, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e933ebf3", + "metadata": { + "id": "e933ebf3" + }, + "outputs": [], + "source": [ + "xenium_data_dir = Path('data_xenium')\n", + "segger_data_dir = Path('data_segger')\n", + "\n", + "sample = STSampleParquet(\n", + " base_dir=xenium_data_dir,\n", + " n_workers=4,\n", + " sample_type='xenium'\n", + ")\n", + "\n", + "sample.save(\n", + " data_dir=segger_data_dir,\n", + " k_bd=3,\n", + " dist_bd=15.0,\n", + " k_tx=3,\n", + " dist_tx=5.0,\n", + " tile_width=120,\n", + " tile_height=120,\n", + " neg_sampling_ratio=5.0,\n", + " frac=1.0,\n", + " val_prob=0.1,\n", + " test_prob=0.2,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6ab27f9a", + "metadata": { + "id": "6ab27f9a" + }, + "source": [ + "#### **Parameters**\n", + "Here is a complete list of parameters you can use to control the dataset creation process:\n", + "\n", + "- **--base_dir**: Directory containing the raw spatial transcriptomics dataset.\n", + "- **--data_dir**: Directory where the processed Segger dataset (in PyG format) will be saved.\n", + "- **--sample_type**: (Optional) Specifies the type of dataset (e.g., \"xenium\" or \"merscope\"). Defaults to None.\n", + "- **--k_bd**: Number of nearest neighbors for boundary nodes (default: 3).\n", + "- **--dist_bd**: Maximum distance for boundary neighbors (default: 15.0).\n", + "- **--k_tx**: Number of nearest neighbors for transcript nodes (default: 3).\n", + "- **--dist_tx**: Maximum distance for transcript neighbors (default: 5.0).\n", + "- **--tile_size**: Specifies the size of the tile. If provided, it overrides both tile_width and tile_height.\n", + "- **--tile_width**: Width of the tiles in pixels (ignored if tile_size is provided).\n", + "- **--tile_height**: Height of the tiles in pixels (ignored if tile_size is provided).\n", + "- **--neg_sampling_ratio**: Ratio of negative samples (default: 5.0).\n", + "- **--frac**: Fraction of the dataset to process (default: 1.0).\n", + "- **--val_prob**: Proportion of data used for validation split (default: 0.1).\n", + "- **--test_prob**: Proportion of data used for testing split (default: 0.2).\n", + "- **--n_workers**: Number of workers for parallel processing (default: 1)." + ] + }, + { + "cell_type": "markdown", + "id": "9962e4b8-4028-4683-9b75-d674fa6fb01d", + "metadata": { + "id": "9962e4b8-4028-4683-9b75-d674fa6fb01d" + }, + "source": [ + "# **2. Train your Segger Model**\n", + "\n", + "The Segger model training process begins after the dataset has been created. This model is a **heterogeneous graph neural network (GNN)** designed to segment single cells by leveraging both nuclei and transcript data.\n", + "\n", + "Segger uses graph attention layers to propagate information across nodes (nuclei and transcripts) and refine cell boundaries. 
The model architecture includes initial embedding layers, attention-based graph convolutions, and residual connections for stable learning.\n", + "\n", + "Segger leverages the **PyTorch Lightning** framework to streamline the training and evaluation of its graph neural network (GNN). PyTorch Lightning simplifies the training process by abstracting away much of the boilerplate code, allowing users to focus on model development and experimentation. It also supports multi-GPU training, mixed-precision, and efficient scaling, making it an ideal framework for training complex models like Segger.\n" + ] + }, + { + "cell_type": "markdown", + "id": "8cbf5be9-27f3-45c2-ab28-8d8ceb078745", + "metadata": { + "id": "8cbf5be9-27f3-45c2-ab28-8d8ceb078745" + }, + "source": [ + "Key parameters for training:\n", + "- **`--data_dir`**: Directory containing the training data.\n", + "- **`--model_dir`**: Directory in which to store models.\n", + "- **`--epochs`**: Specifies the number of training epochs.\n", + "- **`--batch_size`**: Batch sizes for training and validation data.\n", + "- **`--learning_rate`**: The initial learning rate for the optimizer.\n", + "- **`--hidden_channels`**: Number of hidden channels in the GNN layers.\n", + "- **`--heads`**: Number of attention heads used in each graph convolutional layer.\n", + "- **`--init_emb`**: Sets the dimensionality of the initial embeddings applied to the input node features (e.g., transcripts). A higher embedding dimension may capture more feature complexity but also requires more computation.\n", + "- **`--out_channels`**: Specifies the number of output channels after the final graph attention layer, e.g. the final learned representations of the graph nodes.\n", + "\n", + "Additional Options for Training the Segger Model:\n", + "\n", + "- **`--aggr`**: This option controls the aggregation method used in the graph convolution layers.\n", + "- **`--accelerator`**: Controls the hardware used for training, such as `cuda` for GPU training. This enables Segger to leverage GPU resources for faster training, especially useful for large datasets.\n", + "- **`--strategy`**: Defines the distributed training strategy, with `auto` allowing PyTorch Lightning to automatically configure the best strategy based on the hardware setup.\n", + "- **`--precision`**: Enables mixed precision training (e.g., `16-mixed`), which can speed up training and reduce memory usage while maintaining accuracy." 
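To make the roles of `init_emb`, `hidden_channels`, `heads`, `out_channels`, and `aggr` more concrete, here is a minimal, hypothetical sketch of a heterogeneous graph-attention stack in PyTorch Geometric. It is not Segger's implementation (that is what `LitSegger` wraps in the next cell); the boundary feature size, the single message-passing layer, and the class name are illustrative assumptions.

```python
# Hypothetical sketch (not Segger's model) of how the hyperparameters above map
# onto a heterogeneous attention stack in PyTorch Geometric.
from torch import nn
from torch_geometric.nn import HeteroConv, GATv2Conv

class ToyHeteroGAT(nn.Module):
    def __init__(self, num_tx_tokens=500, init_emb=8, hidden_channels=32,
                 out_channels=8, heads=2, aggr='sum'):
        super().__init__()
        # Initial embeddings: a gene-token embedding for transcripts ("tx") and a
        # linear projection for boundary ("bd") features (4 is a toy feature size).
        self.tx_emb = nn.Embedding(num_tx_tokens, init_emb)
        self.bd_lin = nn.Linear(4, init_emb)
        # One attention-based convolution per edge type; the real model stacks
        # several such layers (num_mid_layers) with residual connections.
        self.conv = HeteroConv({
            ('tx', 'neighbors', 'tx'): GATv2Conv(init_emb, hidden_channels, heads=heads),
            ('tx', 'belongs', 'bd'): GATv2Conv((init_emb, init_emb), hidden_channels,
                                               heads=heads, add_self_loops=False),
        }, aggr=aggr)
        self.out = nn.Linear(hidden_channels * heads, out_channels)

    def forward(self, x_dict, edge_index_dict):
        h = {'tx': self.tx_emb(x_dict['tx']), 'bd': self.bd_lin(x_dict['bd'])}
        h = self.conv(h, edge_index_dict)               # attention message passing
        return {k: self.out(v) for k, v in h.items()}   # out_channels per node type
```

In the tutorial itself these choices are made through the `LitSegger` arguments in the next cell, so the sketch is only a reading aid.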
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4db89cb4-d0eb-426a-a71f-d127926fa412", + "metadata": { + "execution": { + "iopub.execute_input": "2024-09-12T00:49:07.236043Z", + "iopub.status.busy": "2024-09-12T00:49:07.235854Z", + "iopub.status.idle": "2024-09-12T00:49:08.351946Z", + "shell.execute_reply": "2024-09-12T00:49:08.351565Z", + "shell.execute_reply.started": "2024-09-12T00:49:07.236028Z" + }, + "id": "4db89cb4-d0eb-426a-a71f-d127926fa412" + }, + "outputs": [], + "source": [ + "# Base directory to store Pytorch Lightning models\n", + "models_dir = Path('models')\n", + "\n", + "# Initialize the Lightning model\n", + "metadata = ([\"tx\", \"bd\"], [(\"tx\", \"belongs\", \"bd\"), (\"tx\", \"neighbors\", \"tx\")])\n", + "ls = LitSegger(\n", + " num_tx_tokens=500,\n", + " init_emb=8,\n", + " hidden_channels=32,\n", + " out_channels=8,\n", + " heads=2,\n", + " num_mid_layers=2,\n", + " aggr='sum',\n", + " metadata=metadata,\n", + ")\n", + "\n", + "# Initialize the Lightning data module\n", + "dm = SeggerDataModule(\n", + " data_dir=segger_data_dir,\n", + " batch_size=2,\n", + " num_workers=2,\n", + ")\n", + "\n", + "dm.setup()\n", + "\n", + "\n", + "# if you wish to use more than 1 device for training you should run this:\n", + "batch = dm.train[0]\n", + "ls.forward(batch)\n", + "\n", + "# Initialize the Lightning trainer\n", + "trainer = Trainer(\n", + " accelerator='cuda',\n", + " strategy='auto',\n", + " precision='16-mixed',\n", + " devices=1,\n", + " max_epochs=100,\n", + " default_root_dir=models_dir,\n", + " logger=CSVLogger(models_dir),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "207864b8-7e52-4add-a4a2-e95a4debdc06", + "metadata": { + "scrolled": true, + "id": "207864b8-7e52-4add-a4a2-e95a4debdc06" + }, + "outputs": [], + "source": [ + "# Fit model\n", + "trainer.fit(\n", + " model=ls,\n", + " datamodule=dm\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9a7d20c6-ca16-4beb-b627-afb41e3fb491", + "metadata": { + "id": "9a7d20c6-ca16-4beb-b627-afb41e3fb491" + }, + "source": [ + "### *Troubleshooting #1*\n", + "\n", + "In the cell below, we are visualizing key metrics from the model training and validation process. The plot displays **training loss**, **validation loss**, **F1 validation score**, and **AUROC validation score** over training steps. 
We expect to see the loss curves decreasing over time, signaling the model's improvement, and the F1 and AUROC scores increasing, reflecting improved segmentation performance as the model learns.\n", + "\n", + "If training is not working effectively, you might observe the following in the plot displaying **training loss**, **validation loss**, **F1 score**, and **AUROC**:\n", + "\n", + "- **Training loss not decreasing**: If the training loss remains high or fluctuates without a consistent downward trend, this indicates that the model is not learning effectively from the training data.\n", + "- **Validation loss decreases, then increases**: If validation loss decreases initially but starts to increase while training loss continues to drop, this could be a sign of **overfitting**, where the model is performing well on the training data but not generalizing to the validation data.\n", + "- **F1 score and AUROC not improving**: If these metrics remain flat or show inconsistent improvement, the model may be struggling to correctly segment cells or classify transcripts, indicating an issue with learning performance.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43a9c1a4-3898-407d-ac0f-f98b13694593", + "metadata": { + "execution": { + "iopub.execute_input": "2024-09-11T22:06:58.182616Z", + "iopub.status.busy": "2024-09-11T22:06:58.182357Z", + "iopub.status.idle": "2024-09-11T22:07:01.063645Z", + "shell.execute_reply": "2024-09-11T22:07:01.063184Z", + "shell.execute_reply.started": "2024-09-11T22:06:58.182599Z" + }, + "id": "43a9c1a4-3898-407d-ac0f-f98b13694593", + "outputId": "70ba8e1b-7814-497a-c8b6-8aa7295cd4d9" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 0, 'Step')" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
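The rendered PNG of this cell (the four metric curves) is omitted below. As a reading aid, here is a minimal, hypothetical sketch of how the same curves can be re-plotted from the `metrics.csv` that `CSVLogger` writes; the `lightning_logs/version_0` path follows Lightning's default layout, and the metric column names in the sketch are assumptions to check against the actual file.

```python
# Sketch for re-plotting the training/validation curves from the CSV logs that
# CSVLogger(models_dir) writes. Column names below are assumptions -- inspect
# metrics.csv for the names LitSegger actually logs.
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

models_dir = Path('models')  # same directory passed to CSVLogger above
metrics = pd.read_csv(models_dir / 'lightning_logs' / 'version_0' / 'metrics.csv')

fig, (ax_loss, ax_score) = plt.subplots(1, 2, figsize=(10, 4))
for col in ['train_loss', 'val_loss']:                 # assumed column names
    if col in metrics.columns:
        ax_loss.plot('step', col, data=metrics.dropna(subset=[col]), label=col)
for col in ['validation_f1', 'validation_auroc']:      # assumed column names
    if col in metrics.columns:
        ax_score.plot('step', col, data=metrics.dropna(subset=[col]), label=col)
ax_loss.set_ylabel('Loss')
ax_score.set_ylabel('Score')
for ax in (ax_loss, ax_score):
    ax.set_xlabel('Step')
    ax.legend()
plt.tight_layout()
plt.show()
```

Reading the curves straight from `metrics.csv` also makes it easy to compare runs by pointing the same code at different `version_*` folders.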
"iVBORw0KGgoAAAANSUhEUgAAA2oAAAKACAYAAADtg4tbAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/TGe4hAAAACXBIWXMAAC4jAAAuIwF4pT92AAEAAElEQVR4nOzdeXhTZdoG8PtkbZrupaVAadn3TUBQ2TcBBRQRZRAH1HHDEcdlnM8ZRcXRcWQUV0YBBVFHUQRFUAREQDYRAdnBAqVAge57mvV8f4ScnNMkbdqmTdrev+tiJsl5z8mbFEuePM/7vIIoiiKIiIiIiIgoZKiCPQEiIiIiIiJSYqBGREREREQUYhioERERERERhRgGakRERERERCGGgRoREREREVGIYaBGREREREQUYhioERERERERhRgGakRERERERCGGgRoREREREVGIYaBGREREREQUYhioERERERERhRgGakRERERERCGGgRoREREREVGIYaBGREREREQUYhioERERERERhRgGakRERERERCGGgRoREREREVGIYaBGREREREQUYhioERERERERhRgGakRERERERCGGgRoREREREVGIYaBGREREREQUYhioERERERERhRgGakRERERERCGGgRoREREREVGIYaBGREREREQUYhioERERERERhRgGakRERERERCGGgRoREREREVGICdlA7cSJEwgLC0NycnJArvfZZ59h1KhRiIuLQ1hYGNq3b48HH3wQJ0+eDMj1iYiIiIiIAkUQRVEM9iQqMpvNGDlyJHbu3IlWrVrh/PnzNb6W1WrF9OnTsXLlSq/Hw8PD8eGHH+LWW2+t8XMQEREREREFUshl1Gw2G26//Xbs3LkzINf761//KgVpY8aMwddff42dO3fiP//5D2JiYlBWVoYZM2bg119/DcjzERERERER1VZIZdRycnIwbdo0/PDDD9JjtcmoHT58GL1794bD4cCUKVPwxRdfQBAE6fjx48dx7bXXoqCgAEOHDsXWrVtr/RqIiIiIiIhqK2QyauvWrUO/fv2kIE2lqv3U3n77bTgcDmi1WixYsEARpAFAly5d8PTTTwMAtm3bxqwaERERERGFhJAI1G6//XZMmDABGRkZEAQBTz/9NIYMGVLr63799dcAgKFDh6J169Zex/zxj3+Ubq9atarWz0lERERERFRbIRGo/fzzzwCANm3aYOPGjXjhhRdqfc3Tp0/j0qVLAJyBmi8JCQno3LkzALD0kYiIiIiIQkJIBGotW7bEK6+8guPHj2PUqFEBueaxY8ek2x06dKh0bNu2bT3OISIiIiIiChZNsCcAANu3bw/ImjS5zMxM6XZKSkqlY1u1agUAyMvLg9lshl6vr9VzZ2VlITs7u1rnmM1mnDt3DrGxsYiPj0f79u0RFhZWq3kQEREREVHDFBKBWqCDNADIz8+XbkdERFQ61mg0SrcLCwuRmJhYq+deuHAhnn/++Vpd4/Dhw+jevXutrkFERERERA1TSJQ+1gWz2SzdriozZTAYvJ5HREREREQUDI02UJNn6Sq25a9IvpVcXWT3iIiIiIiIqiMkSh/rgrzcsby8vNKx8uO1XZ8GALNnz8bUqVOrdU5aWhpuvvnmWj83ERERERE1fI02UIuMjJRul5aWVjrWdVwQBMTExNT6uRMTE2u9zo2IiIiIiJquRlvnl5qaKt2+cOFCpWNdxxMSEqDRNNrYlYiIiIiIGohGG6h17dpVun3q1KlKx54+fdrjHCIiIiIiomBptIFay5YtkZycDADYsWOHz3HZ2dk4efIkAOC6666rl7kRERERERFVptEGagAwefJkAMDGjRsVG2DLffjhh9JtNvMgIiIiIqJQ0KgDtfvuuw9qtRpmsxkPPPAA7Ha74viJEyfw4osvAgAGDhyIAQMGBGOaREREDZpos8N+uQDWQ2dh/uE3WH4+CdFq8zrWdvoSSt5eh5I319bzLImIGpYG3TkjPT0dbdu2BQAMGzYMW7ZsURzv0aMHZs+ejbfeegvffPMNhg4discffxwtWrTA7t278cILL6CgoABarRZvv/12EF4BERFR9TiKTbCdvAD76csQoo3QdEiCOiUBgkbtObaoDLaj52A9cg62I2fhyC6C4Y5hCBvXt9LnsKVdRPnq3Shfswf2ywXQ9kiFblAX6K7rCk2vVNh/vwjL3jRYf02D9bd0OC7lAw5RcQ0hOhz6Ub2hH9cXmjYJKN9wAOZ1e2E7dt55PCocxgfGQdA16I8iRER1ptH/dnz11VeRmZmJL7/8Ejt37sTOnTsVx/V6PT788EP0798/SDMkIqJAEe0OiAWlcOQUwZFbBEeJGaqIMAjR4VBFGyHEGCEY9RAEwfNchwP2jBzA7oC6bSIElbLoxFFUhvK1e2HZcRSCXgd1crz0R4gyAioBUAkQVAIcOcWwHjsP27FzsB07D/ulfKjiIqBOioWqRSzUzWMAQYBYboFYboVotgI2OyACV/7H+X8OB0SHCIiAWFoO28lMOC7ker5wrRrqNolQRYVDLCmHo8QEsaQcYmGZx1DLrhNQf/13aLuneLx3pk+3wfTZT7AdzlAcs+47Beu+Uyh9a53/P4vCMpSv2oXyVbu8Hy8qg2XHMehH9PT7mkRETUmjD9S0Wi1WrlyJFStWYMmSJdi/fz+KiorQvHlzjB49Gn/961/RrVu3YE+TiKjJEi02OArLIKgECLFGzwCpsAy23zNhS7sIx+UCOLIK4cguhCO7CI4SE1BudQY75RaIpeUemZ2KhEgD1KkJzj+tEyAWlsJ69DxsJy8AJotzTHwkdNd1gX5wV6gSY1C+Zg/Kv9sHlFtq/DrtucWw/36xxudXymqH/feLsFc9ErDZUfT4UsSt+YeUzRJFEcXPfALT/7bVahpCpAGqhCjYz+cCFu+ljwCg6ZIM/YT+0HRuVavnIyJqzARRFCv/F43qxZEjR9CjRw/p/uHDh9G9e/cgzoiIGiNRFL1mk1zH7GkXYb+QB1VSDNStm0FlDFOOMVshFpsgxEf6vI7rWo7MPNjSLsJ+6hJsZ7MgFpbBUWyCWGyCWGSCo6gMYmEpxFKz+0SdBurmMVC1iIWg1zqDs4v5AXntjYkQEQZ1+xYQC0pgP5dTaXAqRBqg6Z4CdWoCyldslx43zpmAiMduAgCULf0Bxc9/pjhP0yMFYTcPhLZPO1j3psGy4xgsv6Q5g1WdBtqeqdD27wBtvw7QdGwBdfMYCOF6AICjpByWLYdQvn4/LD8ehFhqhqZLK+hv7I+wG/pD0z6pDt4VIqLGhYFaiGCgRkSVcQU+oskCiCJEuwOwO6QqOWlcsQn2jCzY0rNhz8iC/XyeMzBylcKVmaFqEQttn7bQ9mkHbZ+2cBSWwbLlEMxbDsORmae4nhAbAXXzGIhlZjjyiiGWlAMAVEkxiHjsJoRNuQ6C2p0Bsx7OQOnb62D56YgyAKtvOg1UzaIgRIRdKQEs9W8+ruDTxz+NQmwEwiZdDcEYBseFXNjP58J+IRdimfPn4ipVFAw6aDq1grZba2i6JkPdJhGOvB
[... base64-encoded PNG data for the training metrics plot (training/validation loss, F1, and AUROC versus step) omitted ...]
zn/9IJXaff/45xo8frzhn8uTJGDlyJG699VYAwKpVq3DDDTfUaq7ffvst1q1bB8CZDfz444+l92XgwIG47bbbMHbsWGzfvt3r+QcPHpSCuNtuuw0rVqxQHB8zZgzuvvtudOvWDRcuXMA333wDq9UKrVaLuLg4xMXFISkpSRrfp08fv+f+ww8/4IMPPgAADBo0COvXr0dERIR0fPTo0Zg6dSpGjx4Nk8mEO++8E2lpaVKwK3fp0iU8+eST+Pe//y09NnDgQNx0003o3r07zp8/j6+++grFxcWIjPTdEdlfOp1Oeq06nfPfrbi4OK+vn41DQkvofOKhOqMofWQzEQoQi6Lro6uZiLzro//NRADAVFiOS8ey0LJHkscxuyX09lEDGKgRNQSWEgvenRCYsr2aemDtXdBH6qseWANTpkzB7NmzUVZWhhUrVngEajabDatWrQLgDL7Cw8MBAJmZmWjevDlat27tEaS5TJ48GXq9HmazGRkZGbWe6+LFiwE4A8aFCxd6BK/h4eFYtmwZOnXqJDW9kDt69Cg6dOiAjIwMPPPMM16fIyoqCuPHj8eSJUtQXl6O7OxstGzZstZzf+211wA4m3B88skniiDN5brrrsMzzzyDv//978jIyMAXX3yB6dOne4yLj4/3msWLiorCbbfdhtdeew0WiwUnT55Ev379aj13argYNjcBoqxNuhDJQI0CwyZvJmLw0kyk1AJRFD3OE0VRkVGLTHL/Y3d6R7r35zKH5ho1r6WPIVSaSUSNX0REhNTsYdWqVR77hG3cuBE5OTkAgDvuuEN6/NNPP8WlS5fw888/+7y2SqVCQkICAKC8vNznOH/YbDb88MMPAIBx48YhOjra67j27dv7bOQxbdo0/P777zCZTOjRo4fP55JnzWo7b8DZrn7r1q0AnFm7yta93XvvvVIAWnEdnsvQoUOh13sP3OVt80tKSmo6ZWokGKg1AY4Sd0ZNxdJHChDlGjXP0kfRLio6Q0rnmWyALH7rPKqjdNtXm/5QXaMm3/TaRctmIkRUz1zrznJzc7Fp0ybFMVfZY1JSEkaPHu1xrqvUrby8HMeOHcM333yD+fPnY/r06UhOTsb58+cBwGuGqzouXLgglVr27t270rH9+/ev9LhrzqIo4vz589iyZQsWL16MOXPm4Oqrr8ZLL70kja3tvAHg7Nmz0v5jV199daVjmzVrhnbtnI2xjhw54nVMZYGe0WiUble1OTc1fqHziYfqDPdRo7og79yo8VL6CDg3vdaFayucpyyJ7Dy6I/Z+sh8AkJeej8LMIkS3jFKeE6Jr1Lw3E2GgRhRKdBE6PLD2rqDPoS6NHTsWCQkJyM7OxooVK6SOg2azGV999RUAZzZKrVaWjhcVFeGNN97AihUrcOzYMa9BjSAIXqsjqku+JUBcXFylYyvr+ggAX3/9NRYuXIgdO3ZIAZRcoNdZyVvuuzpmViYxMRGnTp1CXl6e1+PyYKwieTloIN53athC5xMP1RlFoMbSRwoQefCkuxKcaHTOvcXsV9rpm0ssiEhQnifv+CioBcS3jUVcaizyzuYDAE7vTMdVt/ZSnNOg1qiFUMaPiJwffOtqfVio0Gg0uP322/H222/jq6++wnvvvQedTodvv/1W2k/MlXVzSUtLw+jRo6V9tQDAYDCgS5cu6NWrFwYOHIixY8di5MiRijH1wVsDDsCZHZs5cyY+/vhj6TFBENC2bVt0794d/fr1w/Dhw7Fp0yb885//DNh85AGTP01hXN03uacm1RY/UTRyoihCLGEzEQo8eemjvBxRH6FDWZ7z75y3Fv3y9Wm6cGcntLbXpUqB2pmdZz0CNcUatRAKhLwGauHMqBFR/ZsxYwbefvttFBQUYMOGDZgwYYLUFbFz584eTSmmTZsmBWCPPvooZs2ahe7du3tk3QK1Tkq+bszbvm1y+fn5Xh9/5513pCBt4MCBePbZZzFkyBCPxh412aS7MvIMoDwz6ItrTFWZQ6KqcI1aY2eyAA7ZN0Fco0YBIl9/Js8iKTo/emmJLe/46CqLTLk6WXos94znP9CKoDCESh914cyoEVFoGDhwIDp2dK75XbNmDUwmE9auXQvAM5v2yy+/4NdffwUA3HPPPXjttdfQq1cvr0FaQUFBQObXsmVLxMTEAAD27t1b6djffvvN6+PvvvsuACA2NhYbN27E+PHjvXZfPHfuXO0mW0G7du2kcsVffvml0rFZWVlIT08H4AyQiWqDgVojJ9/sGgBULH2kAJGvUZNnkaraS61iRg0ADNHuv5fy67rYLCHa9dHLuhM2EyGiYHF1dVy7di3Wr18vrd+Sd3sEgFOnTkm3r7rqKp/X++KLL6Qyvto2tlCr1ZgwYQIAZzfEzMxMr+NycnJ8dkt0zbt9+/Y+9xfLy8vDxo0bpfsV512T9WsajQbDhg0D4OyiWdlWBUuWLJFuuzYjJ6opBmqNnLzsEWoVoOeHSAoMq2LDa1mgZlS26K9IvkbNlVGTZ6as5VaIDuUCaru5Aa1RYzMRIgoSV+bs4sWL0hqtQYMGKVq+A859vFzWr1/v9Vo///wzHn/8cem+xVL7TcMfeughCIIAi8WCmTNnwmxWfplns9lwzz33+Gyp75r3kSNHpG6UckVFRZg+fbqidLLivOVt8atT1jlnzhwAzlb9d955p9cmJjt37pTe95YtW+K2227z+/pE3jBQa+QUe6hFhHFhKwWMTd6e31fpY1Vr1K6MVQQ3orIdP9DA1qiF0PyIqGlp3749rrnmGgDAvn37AHhm0wBg8ODBUvfCtWvXYsqUKVizZg1+/vlnrF69GnfddReGDBmiCHhcTUlq45prrsHs2bMBAJs2bcKAAQPw8ccfY8+ePVixYgUGDx6MNWvWSJtyVzRlyhQAgMlkwvDhw7F48WLs3LkTGzduxIsvvogePXrg+++/V5xTcd7yjpJz587F3r17cfTo0SrnPnbsWMyaNQsAsG3bNvTp0wfvvvsudu/ejc2bN+Oxxx7DyJEjYTKZoFKpsHz58kq7OxL5g58oGjmxWNZIJIJljxQ4itJHeUZNvul1FWvUXAGaax82aYzJqgjerObQXKPmtT0/m4kQURDNmDEDu3fvBuDsnugtq2MwGLB06VJMnjwZFosFq1atwqpVqzzG3XzzzTAajfjkk0+QkZGBsrIyn0GUv9544w1YLBYsXrwYBw8exJ133qk43qdPH0yaNAnz5s3zOHfevHn48ccfcfjwYZw6dQr33Xefx5jk5GQ8/vjjePTRRwEAx44dU2ygPWrUKBgMBphMJixYsAALFizAoEGDsH379irnvmjRIqjVarz//vtIS0vDgw8+6DEmPj4eH3/8MUaNGlXl9YiqwoxaIydfo8bNrilQHDYH7Fb3fjvVWaNm9ZJR0+g1gCzZW3GdmiKjFkJr1Lxn1BioEVHwTJs2TWpvP378eEWZo9wNN9yAvXv3YsaMGUhOToZWq4XBYEDbtm0xZcoUrFu3DqtXr5ayWFarFatXr671/NRqNRYtWoSNGzdi4sSJSE5ORlhYGDp16oS5c+di586dPt
efxcTEYPfu3Xj++efRu3dvGAwGaLVaJCQkYMiQIZg/fz4OHz6Mhx56SHrdn3/+ueIarVu3xnfffYdBgwYhIiICRqPRowTTF61WiyVLlmDbtm2YMWMGUlNTodfr0axZMwwcOBDz58/HiRMnpH3siGordD7xUJ0QS2V7qDGj1uQ4bA5k/HoecamxiEry/g9fTci7MALKcr/qZNRca9QEQYDOoJWOVQzUGtIatVAqzSSipic+Pt7v9WQ9e/bERx99VOmYyZMne914edasWVIpYEWuroeVGT16NEaPHu312BNPPIEnnnjC6zGj0Yi5c+di7ty5lV4/JyfH57Fhw4b5zKD5M/chQ4ZgyJAhVY6Ta9OmjV8bWFf2vgaCP6/PG26+HRz8RNHIcQ+1pm3Hop+xb8Vv0IRpMOvT6TDG1a5kxaViIKWRZZGqtUZN1kREW0mgFrJr1AxaZyZQrPAYERERUS2x9LGRE4tlGTW25m9y0n92thC2ldtw7lfPDlk1Je/4qFKroNa6f5VU2fXRS0YNUDYUke/RZrfZ4bC7yyxDaY2aoBI89lLTGEJnfkRERNRw8RNFI6csfeQataZGntEqzS0L2HXlgZTGoFF0E9VHuteoWbzto1bquUYNUAZqFllGTZ5NA0JrjRrgDDblr4lr1IiIKFAsFotfXSkro9Pp0K1btwDNiOpTaH3ioYBzyEofVSx9bHLkAU9gAzXvrfkBZfDlbY2a4lyDj4yarDyyYqv+UFqjBjjX5JVku/fT4T5qREQUKJmZmZVuSu6P1NTUGq9No+BioNbI6QZ2BkTnWjVN7zbBng7VI1EUYS2ro0Ct3HuwBShLH72uUZNveK3IqLl/HSlKHysGaiFU+gh4NhThPmpEREQUCPxE0ciF3dAPYTf0C/Y0KAhsZhtEh7vLRVlAAzV38FSx1E/e9dFSZoXD7oBKrZI9Jm8mImtCIlvrJc8EWmWBmlqnhqAKrU3b5fNW69SK10pERFQb/naLpMaJnyiIGil50w4AKMkt9TGy+iotfZTto+ZtHspmIt7XqFl9rFELtfVpgDIwZdkjERERBQoDNaJGSp65AgKcUfOxzgy4kiWTJb3Ki9wNbURR9J1RU5Q+ygO10NxDzUURbLLskYiIiAKEgRpRI2X1ksmqmN2qKZu89LFCoKZSqxAm6/xYXugO1Kwmm3LPMR9dH30HaqEXCOmYUSMiIqI6wECNqJHyFpSV5gWm/NFSSekjABhi3FtBmBSBmjLL53sftQYUqDGjRkRERHWAgRpRI+UtUAtU+WNlGTUAMES7t4IwFbi3iJB3fFSpVVDr3KWM/uyjFoqBmrzLJfdQIyIiokAJmUDNZDLh5ZdfRt++fREREYHIyEj06tUL8+bNQ0FBQcCfb/fu3VCr1RAEAZs2bQr49YmCzVtr/EC16K+smQhQMVBzZ9Tk69O04VrFRtn+ZNS0IRioJfdt5fU2ERERUW2ExKeezMxMjBkzxmPn9UOHDuHQoUNYunQp1q1bF7Bd1a1WK+699144HI6AXI8oFMmDHZe6CNQ03jJqPkofFR0fjRWakCgCNXdwJg/U1CHYTKRZuzjc/t/JKLpUjPZD2gZ7OkRERNRIBD2jZrPZcNNNN+Ho0aMQBAEPPvggNm/ejC1btmDOnDlQqVRIT0/HpEmTUFRUFJDn/Ne//oXDhw8H5FpEoapi10cAKM0JzBo1+T5qOq+Bmo/SR3nHR0OFjaLlpY+ycaG+Rg0Akro1R6eRHaDWhl4gSURERA1T0AO1xYsXY+/evQCAN954AwsXLsSIESMwbNgwvPHGG/jkk08AAKdOncKrr75a6+c7fvw4XnrppVpfhyjUeW0mEqiMWrkso1ZVMxF56WOp74xaQ91HjYiIiKguBD1Qe/PNNwEAXbt2xUMPPeRxfNq0aZgwYQIA4K233oLNZvMY4y9RFHHvvffCbDajWbNmNb4OUUNQsT0/AJTmBSpQk60b89JAQ7FGrdD7GjV5t0SgYqDmvfQxFNeoEREREdWFoAZqx48fx/HjxwEA06dPh0rlfTozZ84EAOTn5+PHH3+s8fP997//xfbt25GQkID/+7//q/F1iBoC76WPddBMxFBVe3556aMsoxbuZaNs2fVF0bnhWqhveE1ERERUF4IaqO3cuVO6PXToUJ/jBg8eLN3eunVrjZ7rwoULeOqppwAAr732GuLj42t0HaKGwnvpY4DWqCkCtarWqMn2UVN0ffSdURMdIuwWZ8mjspkIM2pERETUNAQ1UDt27Jh0u0OHDj7HJSUlwWAweJxTHbNnz0ZRURFGjx6NGTNm1OgaRA2Jt4yaucSiCHxqSrGPmpc1amGy0kdLqQV2q/3Kbf/WqAHuvdS4Ro2IiIiaoqAGapmZmQAAjUaDFi1aVDq2ZcuWinOq4/PPP8eaNWtgMBjw7rvvVn+i1ZSVlYUjR45U609aWlqdz4uaFm9r1IDANBRR7qPmmVELl5U+Au51ahZTJV0fKwR8VilQ4xo1IqJQs2zZMgiC4HU/2jZt2kAQBEVFVHXMmjULgiCgTZs2AZipd0eOHPH6eG3nHsqee+456WfGz50NQ1A/9eTn5wMAjEajYuNbb4xGIwCgsLCw2s8xZ84cAMDcuXPRvn37Gsy0ehYuXIjnn3++zp+HqDLeSh8BZ/ljdMuoGl9XFEVlMxEvpY8avQZag0ZqCmIqMCGimVGZUauwRk1QCYpzXIFmQ2jPT0REDYPZbMaLL76Il19+GRaLZ+UJUSgJakbNbDYDAMLCwqoYCan00XWOvx5//HFcvnwZPXv2xOOPP179SRI1UN5KH4HaZ9TsFjtEhyjd99ZMBFCWP7rWqSm6Php1Hud4a9Ef6hteExFRwzF//ny88MILsFq9f5lJFEqC+vW0q8tjVdk0AFIHOF+dIb358ccfsXTpUqhUKixatAharec3/0SNlbz0MSxKj/Ii55cctQ3U5GWPAKDxUvoIAIZoA4ovlQBwlz5W1YTE+ZizS6TXNWo6ZtSIiEJdenp6sKfgU1XbPIXy3KnpCeqnnoiICABAeXl5FSPdY/R6vV/XNplMuO+++wAADz74IK655poazrL6Zs+ejalTp1brnLS0NNx88811MyFqkiyyoCi2dQwuHrkMIACBWrnyHzlvzUSAip0frwRfimYifmbULJU3LiEiIiJqjIL6qScyMhIAUFZW9QfH0lJnW/HY2Fi/rv3cc88hLS0NLVu2xEsvvVTzSdZAYmIiEhMT6/U5ieTsVrvU3h4AYlpHywK12rXot5a7gy21Tg2V2nuWW95QxGvpY7ivjNqV53GtUSvnGjUiIiJqeoK6Ri01NRUAYLFYkJ2dXelYV7dHV/fHyhw6dAivvfYaAOC+++7D6dOnceDAAcWfjIwMafypU6ekx7mwlBqDio1EYpJjpNuBLH30Vr7oolijdmXTa+WG154ZNZ3sehYva9S44TURkdurr74qdfHbtm1bpWMfeeQRCIIArVaLrKws6fGSkhK8/vrrGDduHJKTk2EwGGA0GpGamoqpU6fim2++qfa8quqc6HA48MUXX2DUqFFo0
aIFjEYjrrrqKrz55puw2+1ez6lo/fr1uOuuu9C5c2fExMRAp9MhMTERQ4YMwcsvv4yioiLFeFeXSnmzN9d7N2vWLL/nDgC7du3CzJkz0a5dO4SFhSEmJgb9+vXD3LlzkZub6/Wc9PR06fm2bNkCi8WCBQsWYMCAAYiJiUFERAR69+6NefPmobi42K/3oC5kZ2fjueeeQ///Z+++w6OqtgYO/8709IQkEJIAofcq0otSFBFEBBuo2BW91nuvvXeveu3YP5BrAaWIFJUO0nvvoRMggfQ67Xx/DDkzk5lUQkLIep/HxzPn7H1mT4bArFl7r921K2FhYQQEBNC4cWPGjRvH6tWrS+ybnZ3N+++/T+/evbX3pH79+lx99dV88cUXJdaZ2L59Ow8++CAtW7YkICCA4OBgmjVrxl133VXqn+1LWbV+Pd26dWvtODExkejoaL/tTp48SV5enk+f4mzcuFGbg/zKK6/wyiuvlNj+wQcf1I4PHTp0QcvBClEVbEUKiUTEh2nHOWfON1DzrPhY/F8hAUUyaqqqli+jpk19lH3UhBDCnzFjxvDUU0/hdDqZOnUq/fr189uuMDACGDx4sDbrZ+XKlYwcOdLvl+VHjx7l6NGjTJs2jTvvvJOJEydWyphzcnK44YYbmD9/vtf5LVu28NhjjzFjxgzq1atXbP+srCxuvPFG/vrrL59rKSkppKSksGLFCr766iuWLl2qJQUqg91u5+GHH+brr7/2Ol9QUMCmTZvYtGkTn376KT///DNDhgwp9j5paWn07NmTTZs2eZ3ftm0b27ZtY9KkSfz999/ExcVV2tjLYtasWdx5552kp6d7nT98+DCHDx9m8uTJPPzww3z88cfo9d5fnB46dIiBAwdy6NAhr/OnTp3i1KlTzJ8/nw8//JCFCxfSsGFDrzbfffcdDzzwgE+QnpiYSGJiIpMmTeKBBx7giy++KFNdi0tJtWbUunXrph2vXLmy2HYrVqzQjnv16nVBxyTEpcAzc2UMMBAcHaQ9zkmtvDVq/vZQK+S1Ri0jzxXguYtFlmGNmut5ZOqjEEL4V79+fQYMGADAtGnTis1GLVu2jJMnTwJw2223Aa4vwa+55hpSUlIIDg7m6aefZu7cuaxZs4YZM2bw0EMPaUXYJk2axLx58yplzDfffLMWpPXv359p06axZs0aJk6cSJs2bVi2bBnTpk0rtv/dd9+tBWlDhw5lypQprFy5UgsECoOAw4cP8+9//1vrd91117F582YeeOAB7dzmzZvZvHkzr732WpnG/sADD2hBWtOmTZkwYQKrVq1iwYIFPP7445hMJtLT0xk+fHiJWaCHHnqITZs2MWTIEO31T548mTZt2gCuoOfJJ58s05gqy6JFi7jxxhtJT0/HZDLx+OOPs2DBAlatWsWECRO07a0+//xzxo8f79N/3LhxHDp0CKPRyDPPPMPChQtZt24dM2bMYPjw4QDs37+fe++916vf3r17GT9+PA6Hg2bNmvHNN9+watUq/v77byZMmKAlT7766it+/vnnC/tDuBip1ax9+/YqoHbo0EF1Op1+21x77bUqoEZERKj5+fmV8rwTJ05UcX1sVBcsWFAp9zwfO3bs0MYDqDt27KjuIYka7MT2k+pH/b5QP+r3hfr19d+rGUkZ2uOP+n2h2q32Ct97z8L92n1+vn9ase0O/H1Qa/e/cVPU7DPZXmOw5dt8+iz58G/t+rJPV6pOp9OrT/qJjAqPWwhx6bLb7equXbu0/+z2iv8dV9NMmjRJ++ywcOFCv23uv/9+FVCDg4PVnJwcVVVV9cknn9T6zZs3z2+/adOmaW3uuecer2slfY5q1KiRCqi9e/f2Oj937lytz5gxY3w+9+Xk5Kh9+vTR2jRq1Mjr+tatW7VrN910k98xZ2RkqHFxcSqgWiwW1Wq1el1/+eWXtXv4U9zYFy5cqPXr3bu3mpWV5dN35cqVakBAgAqoDRs29HruQ4cOeX3Oe+qpp/yOPT4+XgVUk8mkZmZm+h1jRXm+9v3792vnbTab2rhxYxVQAwIC1JUrV/r0zcrKUnv37q31/+uvv/y+trffftvvc994441am+PHj/uMSa/Xq8eOHfPpd/z4cTUiIkIF1EGDBp3Py6+Q6v67pVozaoAWlW/bto333nvP5/rUqVOZO3cu4FpvVtaqj0LUZrZc742lAyODvK6fT1bNs5hISWvUPKc+5qbne1V81Ol16E2+682Mgd5THz0LooCsURNClJ+qquRmF1Trf6qqlj7QCho1ahSBgYGA6zNTUXa7nRkzZgAwcuRIrW1SUhL16tWja9euXHPNNX7vPXLkSO1zl+fa/or65ptvAFfV7wkTJvhMYwsMDGTSpEnFbsW0a9cumjVrhslk4sUXX/TbJjQ0VHs9+fn5pdZAKKvC2gcGg4Eff/xRq1zuqVevXtq4jh49qk03LSoyMtJvFi80NJSbbroJcNVv2LdvX6WMvTQzZ87Upiy+8MILfmevBQcH89NPP2lZ1vfff1+7dvr0ae24SZMmfp/jmWee4eGHH+a///2v17TJwr7BwcF+p7zGxcXx6quv8swzz3itJawtqn0e0f33388333zD5s2befrpp9m5cye33347JpOJmTNn8umnnwKuxZ3PPvusV9/Dhw/TuHFjwJU+X7p0aVUPX4iLkjXHey2YwaT33kvtTC6h9UIqdG97XvmnPuZn5lOQ415EbAw0+p1n7rnmzZpn81qfBjL1UQhRfnk5Vh6/6adqHcNHv4whMPjCfNEcHBzMddddx5QpU5gxYwYTJkzAYHD/XblgwQLOnDkDwNixY7XzhdPInE5nsffW6XRER0dz/PjxMm2lVBK73c6iRYsAGDJkCGFhYX7bNW3alD59+vidOnjLLbdwyy234HQ6S9xXNyYmRjs+33ED2Gw2li1bBrjW+JW07u2+++7j+eefR1VV5s+fz5gxY3za9OvXr9jEQ+HnWnAV56gKCxcu1I7vueeeYts1bNiQIUOGMHv2bJYvX05+fj4Wi4WmTZtiMBiw2+3885//xGKxcM0113jtX9ylSxe6dOnic8+WLVsCkJGRwc0338xbb71Fq1atvNo88sgj5/sSa6xqz6jp9XrmzJmjvSmTJ09m8ODB9O/fn48++giHw0FcXBxz5swp9pdaCOHNa43auSxVUGSgdu58Kj9a87zXvxUnMMydUVMdKlmn3f/gmIL8B3hFi4nY84tsri2BmhBC+Chcd3b27FmvD90AU6ZMAVzBy6BBg3z6FgY8+fn57N69m9mzZ/Pee+8xZswY4uPjOX78OFByQFcWJ06c0KoZduzYscS2Xbt2LfF64ZhVVeX48eMsXbqUb775hkcffZTLL7/ca1um8x03wJEjR7Rtoi6//PIS20ZFRWlZpZ07d/ptU1KgFxTkngFT2ubclaVwnA0bNiyxkAu460sUFBSQmJgIuF5zYbbr+PHjjBgxgujoaEaPHs0333yj/Rny5/bbb9cC65kzZ9K6dWtatmzJY489xrx58yol0K7JLopPPbGxsWzevJlPPvmEqVOnsn//fqxW
K02aNOH666/nX//6F3Xq1KnuYQpRY1jzPDJqAa6iHUGRQZw9lAac315qnpUbDSVk1EzBJnR6HU6H6x/JjCR3qeTCMRVVdB81e4E7o6boFHSGav9uSQghLjpXX3010dHRpKSkMHXqVK3iYEFBAb/99hvgykYVrdSXmZnJxx9/zNSpU9m9e7ffoEZRlEqZuum5JUBpn+lKCxZmzZrFhAkTWLlypRZAeSop21YRniX3y7JPbt26dUlMTCQ1NdXvdc9grCjP2SYXcsqsp8LXV9bXVsjz9X322WfodDq++eYbVFUlIyOD6dOnM336dMAV4N5zzz3cc889XhnfyMhI5s+fz2233ca2bdsA2LdvH/v27eOTTz4hMDCQ4cOH88QTT9C9e/dKeb01yUURqAFYLBaeeuopnnrqqTL3SUhIqPAf4jvvvLNWznUVtYPXGrWgys2opR9N145D6vnO0S+kKAqWMAu559bDZZx07wtTXEat6D5qRfdQq21leYUQ5y8gyMRHv/hOP6vqMVxIBoOBm2++mc8++4zffvuNr776CpPJxLx587T9xAqzboUOHDjAoEGDOHLkiHucAQG0atWKDh060L17d66++moGDBjg1aYqeE6Z8+R0Ohk3bhw//PCDdk5RFBo3bkzbtm257LLLuOKKK1i4cCFvvPFGpY3H87NmWf4dKqy+WVP+zSp8feV5bUXbm81mvvrqK5555hmmTp3K7NmzWbt2rdZ+/fr1rF+/nkmTJjF//nxCQtzLL9q3b8+WLVtYvHgx06dPZ968edqfudzcXKZOncovv/zCO++8U6444VJw0QRqQojK47lGzXhuY+nASgrUzhxyf4MW1SSyxLYB4R6B2okM7by/za7Bz9RHq5TmF0KcH0VRLtj6sIvJbbfdxmeffUZ6ejrz589n2LBhWnGRli1bctlll3m1v+WWW7QPw0888QR33nknbdu29cm6VdY6Kc91Y6UV+EhLS/N7/vPPP9eCtO7du/Pyyy/Tt29fn8IeFdmkuySeGUDPzGBxCtvUlNlgheMsz2vz7OepcePGPPPMMzzzzDNkZmayZMkS5s6dyy+//EJGRgZr1qzhnXfe4c033/TqpygKAwcOZODAgYDri4T58+czc+ZMFi5ciKqqPPvss4wYMUJb11YbyDyiWkAtyEctZm8VcWmyFqn6CEUzahWb+mjNtXlNYYxqUvI/QgFh7oIimZ4ZNT+bXYO/NWoSqAkhRFl0796d5s2bA/D777+Tl5fHnDlzAN9s2vr169m4cSPgKh7x3//+lw4dOvgN0opuflxRsbGxhIeHA7Bhw4YS227dutXv+S+//BKAiIgIFixYwDXXXOO3+uKxY8fOb7BFNGnSRJuuuH79+hLbJicnc/jwYYAaE1C0b98ecK3FKy1YW7duHeDKehYWPlFVlSNHjrBkyRKvtqGhoYwYMYKvv/6aTZs2aT/DP//8U2uTn5/Ptm3bfNbzNWvWjIceeogFCxbwzjvvAK6MatGN0i91EqhdolRVJfedl8m6+2YyrxuA8+jh6h6SqEKeBT8KpxMGe5Toz61gRi31cKq2abXBYiAsNrTE9p4l+j2LiRjLmlHzWKMmgZoQQpSssKrjnDlz+PPPP7X1W57VHgGtCARA586di73fr7/+qk1dO9/CFnq9nmHDhgEwf/58kpKS/LY7c+ZMsR/GC8fdtGlTr6lznlJTU1mwYIH2uOi4K7J+zWAw0L9/f8BVRbOkrQq+/fZb7bhwM/KLnWeRmf/7v/8rtt2RI0e096Znz54EBLj+jX/mmWdISEhgwIABWpBaVJMmTbQvEvLy8rTzjRo1omPHjjz88MPFPm/hmsuifWsDCdQuUYqi4Dh4AOexI+B04jh0oLqHJKqQzU/VR8+pj9kVDNTOHHRPe4xMiEDRlTyf3bNEf2FREShhjZrXPmp276mPfvZdE0II4VaYOTt58qS2Rqt3795eJd/BVcChkGd2w9PatWv55z//qT22Wq1+25XHww8/jKIoWK1Wxo0bR0FBgdd1u93OPffcU2ylv8Jx79y5028lwczMTMaMGeM1dbLouD3L4pdnWuejjz4KuEr133777X6LmKxatUr7ucfGxmp7ol3sRo4cScOGDQF4/fXXWbt2rU+b7Oxsxo4dqwW+jz/+uHZt6NCh2nFxa8h27NihZc08p+EW9l2+fLmWAS6qsHJp0b61gQRqlzB9k2basfOgBGq1iWdlRlNQYdVHd6CWl56H017+ksVnD7orX0WWsj4NvKc+eipL1Uenw0lBlmeFScmoCSFESZo2bUqPHj0A2LRpE+CbTQPo06ePVr1vzpw5jBo1it9//521a9cyc+ZM7rrrLvr27esV8BQWJTkfPXr04KGHHgJce3d169aNH374gXXr1jF16lT69OnD77//rm3KXdSoUaMAV1bliiuu4JtvvmHVqlUsWLCAN998k3bt2vHXX3959Sk6bs+Kki+99BIbNmxg165dpY796quv1orQLV++nE6dOvHll1+yZs0aFi9ezJNPPsmAAQPIy8tDp9MxefLkEqs7XkwMBgMTJ05Ep9ORm5tL//79efLJJ1m0aBFr1qzhyy+/pHPnzqxcuRKAcePGMXLkSK1///79ueKKKwBXFrZPnz7873//096bt956iyuvvBKbzYbJZOJf//qX1vfpp5/GbDajqiqjRo3iH//4h/Zn8bfffmPs2LG8++67gGt6b+EattpCPvlcwvSNm2HD9ReWQwK1WsVrjVqA7xo1VMhNyyU42nduf0FWASe2nSS2fQyWUO9AyzOjVtr6NPCe+uipLPuoAeSmu6c4yNRHIYQo3W233caaNWsA1zoif1mdgIAAJk6cyMiRI7FarcyYMYMZM2b4tLv++usJCgrixx9/5OjRo+Tm5hYbRJXVxx9/jNVq5ZtvvmHbtm3cfvvtXtc7derEddddx2uvvebT97XXXmPJkiXs2LGDxMRE7r//fp828fHx/POf/+SJJ54AYPfu3fTp00e7PnDgQAICAsjLy+PDDz/kww8/pHfv3qxYsaLUsX/99dfo9Xq+++47Dhw4wPjx433aREZG8sMPP9S4gGLAgAHMmDGD22+/naysLO1nU9Rjjz3Ge++953P+559/ZvDgwezYsYOVK1dqQZ2n4OBgJk6c6LWPXqtWrfjhhx+4/fbbyc/P5/PPP+fzzz/36duxY0dmzZp1nq+y5pGM2iVM55FRk0CtdvHKqJ1bD2a0GDEFuzNZ2Wd8pz+qTpUZ/5zD7Of+ZNqjs7yybqqqctZz6mNZArXiMmrFrVErkjXL8wzUTBKoCSFEaW655RatvP0111zjNc3R09ChQ9mwYQO33XYb8fHxGI1GAgICaNy4MaNGjWLu3LnMnDlTy2LZbDZmzpx53uPT6/V8/fXXLFiwgOHDhxMfH4/FYqFFixa89NJLrFq1qtj1Z+Hh4axZs4ZXX32Vjh07EhAQgNFoJDo6mr59+/Lee++xY8cOHn74Ye11//LLL173aNCgAX/88Qe9e/cmODiYoKAgnymYxTEajXz77bcsX76c2267jUaNGmE
2m4mKiqJ79+6899577N2712tNVU0yYsQIDhw4wPPPP0/nzp0JDQ0lMDCQVq1acd9997Fx40Y++ugjv9snxMTEsGHDBj777DMGDBhAdHQ0BoOBsLAwOnbsyLPPPsvu3bsZPXq0T9/Ro0ezY8cOHn/8cdq3b09ISAhGo5GYmBiuvvpqvv32WzZs2FDq/nqXIkWtqt30RIl27txJu3bttMc7duygbdu253VP55kUNo95gENKDKeVOvR99jY69G91vkMVNcC3N0zWSvCP/nQEcR3qAzD59imkndsHbfhbQ2jSO8GrX9L2k/z6D/c3Vjd+fj2x7VwllXPO5vLtDZO1a/f9No7ACP8Zs0LHNp1gxhO+ZZKHvDiQloOa++3z+VXfavuntbqqBXvm7wOg+ZVNGfrK4BKfTwhROzkcDvbt26c9btGihU8FQyGEKK/q/rtFMmqXMCUyirXmDszXX85WXVP2r00svZO4JPgrzw8QFOWeL+9vL7X9yw56PT663l3i+IzH+rTAOgGlBmlQfEatuKqPAMYAd+bMK6Nmlg9dQgghhKg9JFC7hCmKQv1I94fepENnS2gtLhWqU8XmWZ4/yB0Uea5Tyz6T49PvwNIigdoGd1Utr2mPjcu2iWdxa9TMxaxRA+91arlpMvVRCCGEELWTfPK5xNVvEA7n9i48mVy2OdiiZvPcQw3cxUQAwuq75/0fWnmYHnd1RVFcJfZP70kmO8U7eDu1O5mC7ALMwWavjFpUGSo+AljCzH7PG4up+ui65h5vnhQTEUIIUYtZrdYyVaUsiclkok2bNpU0IlGV5JPPJS6uTTxsdJXXPZunx1pgxyQfeC9ptlzvPWM8pxm2GNCMdZNdJZtTDpwladsp4jq61q8VnfYIoDpUjm9OomnfxuUuJAKgN+gxB5soyPYeU3FVH8E7sMzLcO+lI+X5hRBC1DZJSUklbkpeFo0aNSp2I2pxcZOpj5e4ep1aYFBdhRlUFE4eTi2lh6jpPNen6Qw6r42iIxvXocFlcdrjLdO3A66Kjp7THj2zV0c3HMfpcHL2sHs/nbKU5i/kb/pjcVUfwTuj5rA6PMYka9SEEEIIUXtIoHaJMzZuQl3StccntvlmTcSlpbhCIoU6jWqvHSf+fYjM01kk7ztD5qks10kFuo51f3t3dMNxMk5kuoMmBeokRJR5PAHhvgVF/I2rkLGYa7JGTQghRG2TkJCAqqrn9Z9k02ouCdQucYrFQkyAe9rZiV0nqnE0oir420PNU0KPhoTFhgKuAiLbftvJAY9pj7HtYmg9pIX2OP14BodWH9Eeh8eFYbQUH2gVFRDmnVHT6XXoTcVnx0wBxQRqMmVXCCGEELWIBGq1QP267oIOSef20BKXLptHRs1fdkqn19HxBo89+2bvZt9i94boza5oQmi9EMIbhGnnNk/brh2XdX1aoaIZNVOQUStg4o+xuEBN1qgJIYqh03l/nLHb7dU0EiHEpcThcHg9Lvp3zYUmgVotEOtRSv3kWfnH61JX2tRHgDbXtNT2KyvIKiDzZJZ2rVm/JgA06tpAO5ednK0dRzUtW8XHQkXXqBU3tVG7XlygVkIWTghRuymKgsnknkGQk5NTQmshhCibvDx39WmjseQvmi8ECdRqgbi2jbTjVKuJ/CLl28WlpbSpjwDmYDOth7T0OR/Tph4hdYMBaHh5vN++5SkkAr6bXpdUSARKCNRk6qMQogQhIe7tR9LS0iSrJoQ4L6qqkpGRoT0OCgqq8jHIJ59aoG6XVhjVRGyK6wPwyX0nadyxYTWPSlwonptdFxf0AHS6oT3bZu70Otf8iibacVynWBS9gupQvdqUe+qjT6BWWkbN/19LEqgJIUoSGhrK2bOu/R6tVitHjhwhIiKCoKAgDAb5+0MIUTaqqmK1WklNTfXKzkugJi4Iff1Y6ukyOa66pqwd37xfArVLmDXHI6MWVHz2KqJhOI26NeDIumPauWb9GmvH5iAT9dvUI2n7Ke2cwWwgrH5oucZTdOpjaRm1YouJyBo1IUQJzGYzkZGRXsHa6dOnq3lUQohLQXBwsFfWvqrI1MdaQFEUYkLciyGTdh2vxtGIC60sa9QKdb6pg3Zcv109QosEYQ27ek9/rJMQgU5fvr82fIqJVHiNmgRqQojiKYpCdHQ0kZHlW0crhBAlMZvNxMbGVvn6NJCMWq0R26gOnCvcd+KQbHp9KStPoNbo8gYM+Gc/Tu1OpuvYTj7XG3aNZ83EDdrj8q5PA9+pj8YKr1GTYiJCiJIVBmuhoaFkZmaSlZWF1WotvaMQQhQRGBhIeHg4ISEhVV7tsZAEarVEfPc2sH0fAKeyDTgz0tGFhVfvoMQFUdY1aoXaX9eG9te18XutXqu6mIJNWLNdH3TKuz4N/Ex9DKpgRk2mPgohykBRFCwWCxaLhbp166KqKk6ns7qHJYSoQXQ6XbVk0IqSTz61RHyv9vCtK1BLV4LJ+nsFYcOGVfOoxIVQlqqPZaUz6Gg1qDnbftuJoldo3KNR6Z2KMAYYMZgN2AtcFdhMAVL1UQhRdRRFQa+XjLwQouaRTz61RGS9EMx6JwUOV+r2+LKNEqhdoqw5ZZ/6WBZ9xvegfrt6RDQIJ6JheIXuERBuIeu0ay+20jJqxY1Z1qgJIYQQojaRYiK1hKIoxMQEao+Tdh1HtRZU44jEhWLLK1vVx7IyWoy0GtyCeq3qVvgenuvUKr6PmnwjLoQQQojaQwK1WiSuVax2fMoejH3LxmocjbhQPIuJlGWNWlWI7VBfO45pU3LA5688v96oK3e1SSGEEEKImkzmEtUicY0jgYMAnFIisK/+G2O3XtU7KFHpylP1sar0uPtywmJDCY8PI7pZVIltDX42vNbL+jQhhBBC1DLyFXUtEtsoQjs+rdTBtnoFqlTCuqSoqlqpxUQqiznIRKdR7UnoXvpG63qDHr3Je5qjrE8TQgghRG0jn35qkdhG4dpxphJE7tkMAvfvwdDSf2l2UbVUVSX1cBoF2QXYrQ4cVgeqUyW2fQyWUEvpNwBXH4eqPTZeJBm18jIGGHFY3Zu0y/o0IYQQQtQ2EqjVIuGRgVgCjeSfmxp3SokgZMpkDC+/U80jEwBznv+LgysP+5y3hFkY98MtZQrWPKc9wsWTUSsvY4CB/AyPx7KHmhBCCCFqGZn6WIsoikKcx/THU0od7CuWYt+8oRpHJQCyz+T4DdIA8jPyObz2WJnu4zntEcUV8NRERQuK6GXqoxBCCCFqGQnUahnP6Y+JSiwniOT4J1+Sm5lbfYMS5Gfkux8oEBYbijnYnQ07c/Bsme5j8ywkEmBEUZRKG2NVKlqtUja7FkIIIURtI59+aplYjw2Lt+qasVXXDE4Dt0yl3eXxjH2oJ5H1gqttfLVVQbZ7T7vgqCDu/HkM63/czKqv1wJw9mBqme7jmVGrqevTwF+gJmvUhBBCCFG7SKBWyzQuYdPiHeuP88p9vzCyh5nel0egCw1H37gpurr1qnCEtVNBtjvAMgebAYhqWkc7d6bMgZpnaf6auT4NfKc+GiWjJoQQQohaRj791DJNWkVz/b
gubPj7MLkZeeSezSAf9wf6ArvClBVWNi7fwI2O5UQoOQS++SHGy3tU46gvfQVZ7oxa4ZTHqCaR2rns5GzyswqwhJhLvM/FuIdaRRTNqMk+akIIIYSobeTTTy009OaODL25IwAFv/1C7ucfsllpxix9L/IUV2XB/boG/FcZzVjHItpO+gpD1+41dr1TTeAvoxYcHYQp2IT13LWzB1OJ61i/xPvYvKY+1tyMmqxRE0IIIURtJ8VEajnTiBsJ+fBLej8ykueGK7SNzNOu5Stm/k8/hMUHjNh3ba/GUV76PNeoFWbUFEUhqrHn9MfSC4pcMhm1QJn6KIQQQojaTQK1Wk5RFAztOmIefgP1HnqQRyeP565/9sV8bt8qVdExR9+TSf9Zis1qr+bRXrr8ZdQAopq6pz+WZZ2aZzGRouu8ahIpJiKEEEKI2k4CNeFFURR6DmzG0x9cS50w94fj9SkhvP/P2eR4rKUSlSffc41aiHvKYmQTd0btbDkzajW56qPPPmqSURNCCCFELSOBmvArvnEdnvv8BpoYzmjnDiWm89v3G6txVJcuq9fUR4+MmlegloqqqiXex2sftSBZoyaEEEIIUVNJoCaKFVonmEduqUc3527t3JrFieR5TK8rpNrt5L75Itn/uBvH/j1VOcxLQnFTHyM91qhZc21kncoq8T6XzBq1AO/ATNaoCSGEEKK2kUBNlChg2Aiu128gQM0HoCDfzppFiT7tbIv/wrZ0AY69u8if/E1VD7PG8yom4jH10RxsJiTGvQF5aevUvNeoXUoZNVmjJoQQQojaRQI1USJdaBhBg6/icude7dyy2Tt9puDZd27Tjh0HfQO50lgXzCPntee87lObFJdRA4hqXPaCIpfKGjWZ+iiEEEKI2k4CNVEq8y130MN4UHucdDyL/TtOe7Vx7HVPj1RTTqMW5Jf5/s4zyeR98Cb2vxeT9/4b5z/gGshfef5CUU3LXlDEcx+1mjz1UYqJCCGEEKK2k0BNlEoXE0uDB+6kufO4dm7J/y3RjtWCfJyHPLJoqooz6USZ7+84cggcDgCcx4+i5uSc/6BrENWpUpBTfEYtskk5Mmp5l0gxEdlHTQghhBC1nARqokxMw0bSu5E767Nlbw7pB13BmOPAPnA6vNo7jx8t873VlGTvviePF9Py0mTNsYLHTFKfjJpH5ce0Y+nYrd4/a697eRYTkX3UhBBCCCFqLAnURJkoisLlL48nlFwAHOhZ/vEM17HHtMdCjhPHynxvZ9FA7UTtCtQ8pz2CbyYsvEEYeqPrV1V1qKQdSSv2XlaPzJwxsOZm1IoGmbJGTQghhBC1jQRqosyMsXH07hKmPV55QMFhteHYt8unbXkyas4z3oGaI6m2BWoe68qCTOj03r+WeoOeiEYR2uPipj867U4cHtm2mrxGrWhGTdaoCSGEEKK2kUBNlEv/+69CpzoBSFOD2DZzpd+MmrMcGTWfqY+1LlArvpBIoSiP/dTOFFNQJC/Tu4BLTQ7U9EY9AWEWABSdQmB4QDWPSAghhBCiakmgJsqlTsO6tAvP1B7P/2039uO+QVm5Mmq1fupj8YVECkU28az86D+jdnq3++cYEG7BHOL/XjVF34d7EdEwnO53diUoMrC6hyOEEEIIUaVkPpEot/5XxLNtVjYAiRkWFuq6cJVzIxiNYHMVs1DT01Czs1CCQ0q9X9Gpj5JR8xXV1LPyo/+MWtL2U9pxbPv6KIpSSSOsHq2vbkHrq1tU9zCEEEIIIaqFZNREubW5aSDt1EPa44W6y9ijNMDQvhNY3FPUylJQRM3Lhews73NnU1Dz8iptvBe7smTUPCs/5pzJJS/Dd5+6pB2egVpMJY5QCCGEEEJUNQnURLnpI+pwa6s0otR0AFRF4Wf9ADLi26CLb4AKHFLqMfOnHaxasL/EeznPpPg/f6rs+7DVdPlZpWfUgqKCvKYyFt342l5gJ3mPOzMZ20ECNSGEEEKImkwCNVEhof37cYd9AUbVNdUxV7Hwf1vC2GBsy8eGG5hguJ4F63OY9OEK1i07WOx9ihYSKVSV69QcNgfJ+86gOtXSG18A1jJMfVQUhaimHgVFEr3XqZ3em4LD5iryYjAbiG4edQFGKoQQQgghqooEaqJCjH36U59URjn+1s4dPVnAzwfqckKJ9mo79+etOIsJgpwpp/2fr6J1aqpT5ef7pvPzfdNY8M6SKnnOorymPpZQACS6mTv4OrrB++dz0mPaY0zruugNskG0EEIIIURNJoGaqBBdVF30rdtxmbqfHo6dJbY9eTSdLauP+L1WtJCIdj7pOHa7k2Vz97BtXdlL/ZfXmUOpnD3kyk7tXXQAVa36rFpZ1qgBNO7VSDs+sv4YBR5TJpO2eaxPk2mPQgghhBA1ngRqosKM/QcCMMK5iqYhruIfRqOO7s7dPGn7lQ6KOzibN3Wb3yBITfFYo+ZViOQ43/1nGT9+vprPXlnItrUXJljzDHacdie2PPsFeZ4Sx1CGqY8A8R1jCQh37S3mtDtJXHkYcGUFvQuJ1L8wAxVCCCGEEFVGAjVRYabhozAOHIK5UxeeePcanvtoGO98MZTRjuXUJ5UB1vVa26MHzrJjve/eap4ZNUO7jtrxlsMONq44rD3+49dtF+Q1WHOsXo89g6aqUtaMms6go1m/Jtrj/UsSAUg9muYOOBWIaVO3wmNxpqWiOp0V7i+EEEIIISqHBGqiwhSTicBnXiH4vc8wJTQmoUU0IbHRKGHhAMRxljYxDq393E/m+9zDc42avmMXAHIxMTO7nVe7xF3JHNl/ptJfg2dGzd/jqlDWjBpA8yubasdH1x8nP6vAa9pjVNPIEoO9Esfxyw9k3Xwt2Q/ejmqv+syiEEIIIYRwk0BNVDpdXAPt+Mrjs7Xjg6kG9m7yzqp5Vn00tOsAej1z9D3JUgJ97rto1q5KH6tnNsv1uBoCNa/y/CUHWXEd6xMQ4Zoi6nQ4Sfz7UJGNriu+Pq1g9nRQVZyHErFv2Vjh+wghhBBCiPMngZqodJ6BWoJ6mqZO955o8/5vtXas5uejZmW6+9WL5UBEO9brWmnnWngEHuuXHyIjNbdSx1o0MCsauF1oTrsTa65Ne1xaRk2n19G8v/f0R8+Kj7HtKhaoqaqKmuou+a8WU41TCCGEEEJUDQnURKXTxTf0ejzAuVk73n0wl8W/7yInq8C74qNOhzUwhF/zOmunGkQqPPraYMIjXdk1h93Jsnl7XcfHj5L/40QciSVvqF2aooFZfhVPfbTmej9/SeX5C3lNf9x4nIwkd7Ab26GChUTy88DqUVjlbOVPMxVCCCGEEGVnqO4BiEuPvmVr94PAINpe3ZcGv5/mmK4eAFO+XMuv366nbbMgmurakEUgKZZ6nHxsLqlWV6CiUx3c2ioLk9nAlcNaM/N711S8ZfP2MGR0WwqeewLnyRNYp/9MyOTpKMEhFRqrb0atagM1r0BRAVNgyRk1cE1vDKwTQG5qHqrDXUkzpF4wIXWDKzQONT3d+/GZFP8NhRBCCCFElZCMmqh0hi7dMN9xL4a+Awj+4AsCxt3LEGWTV
xuH3cm2PVnM1Pdlof4yttriSU7K0q5f6dxK/WxXef++17TAaHJt4JyVns+6qStxnnRNp1SzMrEtW1ThsfqsUavijJpXIZEgE4pOKbWPTq+jmcf0x0Lnsz7NmZHm/fisBGpCCCGEENVJAjVR6RRFwXL7vQS99Bb6Zi1QgoJp07E+j9mm09uxg2BTyeXfGzpPM9C5CWfScQCCQy1095jut/ivQ3juyGZdMK/CYy0oWp6/ygO1spXmL6qFx8+jUP0Krk8DUDPSvR5LoCaEEEIIUb1k6qOoEobe/YnfuJZ45xmuCz5E0hOfsPqL2Zw6nkEE2cS0iqPByKupa8wl7OV70QHOkydwpiSji67LgOtas+KvfQAcT1c4pMTQRHUV0XDs3IbjxDH0HkVMPNntTlBVDEa9zzVrNRcTKU9pfk/128UQFBlIzll3cZUKr08D1HTvjJoqa9SEEEIIIaqVZNRElTD26qcd604n0So8j1vrH+ARx2/c5ljIsN6hdOvfhEY926KPiXU1VFWsi/4EIL5xHVp1cgci0/X9yMWdgbIt+MPr+VRV5cDO03zxxmIeHvE9j47+kS/fWsLmVUew2dx7u1V3MZH8cpTm91R0+qMp2ERkQkSFx1E0o6ampcpeakIIIYQQ1UgCNVEldJFR6Fu7N7G2rVyKM8U9vU6Jruv6v6Jgumqou938uaiqa6LjkNHttfPJSgST9Fdjw5Ulsy6ch+p0Yj9xgjXvfs/bD/3Kf/49j82rjqCqYLc52LTiMF+8sZh/j53CTxNWU5BvoyAr32ucVZ1Rs3pOfSxDxUdPba5phaJ3rWlr2icBnb7iv85F16ihqqhpqf4bCyGEEEKIC06mPooqY+zdH8fuHQAU/PIDngvNdNH1tGPToKEUTP4WAOexIzj27MTQuh1tusRxdYM0/jrmyhwd0tVnCgMY61iAcvoUG1/5jFkb7JwmAsgpdhy52VaWztlD+tlc7FkFgLuAR7UWEynH1EeAui2iuOG/wzl7KJVWg5uf1zjUjAyfc84zrmmnQgghhBCi6l00GbW8vDzeeecdunTpQnBwMCEhIXTo0IHXXnuN9CKlw8srLS2Nt956i549e1KnTh1MJhMxMTEMGzaMX3/9VcvYiAvLeOVVYLG4HhQUeO3b5RkQ6OrHou/g3k/Ndq5YiOpwMCj5Ty537tGubdM14Vf9FXylH8bXG0LOBWluzdvV46EXB/CPVwbRrX8TTGb3dxNbVh8lW+f9K5BfJMMG4DyVRM7Tj5L7+vOoeXkVeOXFK8iqWDGRQvGdYuk4sl2F+npSi2bUkHVqQgghhBDV6aLIqCUlJTF48GB27drldX779u1s376diRMnMnfuXNq0aVPue69evZpRo0Zx8uRJr/OnT59m7ty5zJ07lyFDhvDLL78QElKxvbhE2ejq1iPo1ffIef05yHaX4kdRUOpEebU1Db6WvG2ujbKtSxZgefAxHAf2Q3Ymo/ibDCWIfYqreMgGXUuvvoqq0kFNpL9xL21fnoQS5NpbrEO3BuTn2fjohb84uNs17fKUyUCTfJuWUytIz6Wo/P99h33TOgD0LVphvvn28/5ZFDqfjFplKrqPGoBT9lITQgghhKg21Z5Rs9vtjBgxgl27dqEoCuPHj2fx4sUsXbqURx99FJ1Ox+HDh7nuuuvIzMws172PHDnCtddey8mTJzEYDDz44IP88ccfrF27lh9//JEePXoA8OeffzJ27NgL8fJEEYYulxP86Xfo4htq53QxsSgG7+8MjP2udGffsrOwr16Bff1qAPQ4ubPlaRo0reNz/5ZBGTxm+oPbHItokH8c67xZXtctAUZuuq+b9jhfryPD4P41sNvBduqUVx/7NvcecLaNa8v5it1Uq5Wc154l66Fx2PfsBCpenr+yFS0mApJRE0IIIYSoTtUeqH3zzTds2LABgI8//pgJEyZw5ZVX0r9/fz7++GN+/PFHABITE/nggw/Kde8XXniBtLQ0FEXh119/5YsvvmDIkCF069aNMWPGsGLFCkaPHg3A7NmzmTt3buW+OOGXPr4hwZ98i3HQNejqx2G592GfNkpgEMY+V2qPc99/nYKp/9Meh/ToziOvDqZ+w3AA4gLyeHiohSenPEKT4Vdo7QpmTvWpXtikVV269musPT5tNOC5s1vmV19px86zZ1BPubOxjp3bUa0VKzhinTcL+99LcO7fS+7LT+PMzLhoMmo+xUSQvdSEEEIIIapTtQdqn3zyCQCtW7fm4Yd9P7DfcsstDBs2DIBPP/0UexlLhufk5DB9+nQARo0axfXXX+/TRq/X88UXX2AyuT4g//DDDxV5CaIClJBQAp9+mZDJ0zH2G+C3jemqa90P8vPB5g6QDN16El4nkBc/G8HbE2/kxV/H0/Eft6Lo9ZiuvxH0rmqQakoytuWLfO498pa26FRXeGbXKZz12GMt5++V2PfuBsCxa7t3R2sBjnPZsPI6tXw9p4jAgYKaeob8zz/wDtT8VH20b1pP/uRvcKYkV+g5y0K1WiHXd8qnKoGaEEIIIUS1qdZAbc+ePezZ4yoMMWbMGHQ6/8MZN24c4CoKsmTJkjLde/PmzeSdK/wwfPjwYttFRUXRvr2r7Pv27duLbSeqnr5jF/SduvqcN3S5HF3jZq5jg47IesHodO7Kjbqouq7CJecU/PKjTxGQ8JRE2juOao/PGPXYzt3CqpqwLfkLAPu5KpX5GLWtAOxbN5b7tcybvJ43d7fiA+NNvGS4iy/0w5m5NIOkM1kU7upWdOqj81QSOc8/QcH/viP39efK/Zxl5W/aI7iyiUIIIYQQonpUazGRVatWacf9+vUrtl2fPn2042XLljF48OBS712vXj1efvlljh8/TseOHUts63S6MisFBVVbml2UTNHpCHr3E5xHDqHm54HDAUYT+mYtUBSlxL7m0WOwLXRtgu1M3EfWPTcT8ODjGPpeiaIo2Devp539ODv0jXAoCk5F4ZjZSJTNQR5m7KtX4LjvUXavP8JK/WB2KgkYcHC5cw/91+0goZR6Iqrd7iqSotfz95/7+G3KDjg3Zqti5KASy0FiwQlKoI5gh5Pdu5OJaBZJYJArw2tbt8q1aA5w7N6B48A+9M1anOdP1c9Y/Ux7BCkmIoQQQghRnao1UNu9e7d23KxZs2LbxcTEEBAQQF5enlefkjRv3pxXXnml1Hapqans2OHKmjRo0KBM9xZVR9Hp0DduWu5++qbNMfYbqE17VFOSyX39OQzdexP43GvYN61HwUK01c4psxGAPL2OY3od36hX0f70QQ7fPZWUlLZa3tmKjpX69qxKVOn46gIGj2pPs7b1fIJG++4d5L7+HGp2NntufJ4fphylJKqikGXQ88vkTcz4aQtd+iQweGRbos9VvSxkXTiPgAsQqDk9Kz5aLK5ppgA52ah5eSgBAZX+nEIIIYQQomTVGqglJSW5BmEwUL9+/RLbxsbGkpiYqPWpLB9++CE2mw2Aq666qpTWZZOcnExKSvmyEQcOHKiU5xZuAU+9iC4unoJpP8G599i+diXZ/xyPM3EfVrUrdRxOsvUO
sg3uNWr5ion1SitI8d1TDUBFYcva42xZe5y6sSF0v7Ip3a9sSt3YUBwH95Pz3BOQncVupQGTfj6Mei7SMzhVGuXbsIToaHF5IPtW7mW3LgGnR6BntztZt/Qg65YepIkulP5KI1qpR9ABtkV/Ybn3Hz4VMs+XZ0ZN3yABR+I+OJdldp5NQe9RoVMIIYQQQlSNag3U0tJcHxCDgoJKncoWFBQEQEZGRqU9/6ZNm3jvvfe0+99zzz2Vct8JEybw6quvVsq9RMUpZguWu8djvOpa8j99X9sLzXlgHwBWTChAwwI7OTYH6UY9mXodqsefRUVVaaUepVd8Hhm6YJYdCyZFCdeuJydlMfvHLcz+cQsNG4UQcHwfqq0v6OGgUh9nYZCmOmiU78CiqpDp4IpHR9Ip/12mrLSSo1fI0uuwBhvI8Zh9e9BZl4OGIcQ7Uxjn+Ivw9DTsG9di7N67Un9OnmvUlDqRKKmRWiER9ewZkEBNCCGEEKLKVWsxkcI1YZbC/bJKEHBu+lVlrSM7duwY119/vXa/V199lejo6Eq5t7i46OMbEvj2R5iuG+113qq61oIpQLBTJb7ATstcK12th7nMuZdBjo08a/+Jux1/0qFbPP0HJvAv+1Tusv9B88B0n+c5eiSLvY767NM1YJ+uAXbF9T2IWbXSIj/LFaSdc+ZgGsqt96MDQhwqja05vJj9DXdfH01CC+/Nv4/rovnMcD0nqYNt/jy/r1E9lzGsCDXdnVFTwsLRRbqfX0r0CyGEEEJUj2oN1AqrPJaWTQNQz33ILa4yZHmcOHGCQYMGcezYMQCGDh3Kk08+ed73FRcvRafD8o9/Yr7lDu2cFfe+ZYV7mOmBBgYbtziWcrVzAxFkA2Bo0x59p8vQAW3Uo9yfPY3X7qrDsIRU6uG/GAeAQbVzl+NPTE691/mU/WcoyHfv3mbCih4nbZZ+zTPvDeHR9ido6zykXc9QgplguI7dq/ehZrk3fledTnLffIHMoX3J/e9bqA6H1/OcPp7Bolm72LvdexNvT2qmO0uthIejRLm/sJBNr4UQQgghqke1Tn0MDg4GID/f/1ogT4VtzGbfvabK4+DBgwwaNIhDh1wfgrt168bUqVPLFCyW1UMPPcSNN95Yrj4HDhzwu9ebqDyKomC55yGUsHDy//ctNms4nNs+LLR+KCn7XUGJLaI+FIlP9G3ao4SFQWAQ5OaAw0HA12/TH+gHnKQOR3T1UBUdpisGYejQGcffi2i44Xci1Uz+xnsz6+R9KYTUDdYem3SujJiakox96QIaHl7NnY6zrFZbM9PQD1WFfMXMt+rV3PLVPHo9OhqjyYB95TJsSxe6xv3H7yhGE5Z//JODe1KYP307W1YfRVVBp1N48u0htGgf4/NzcXpk1HRhEeCxlYFUfhRCCCGEqB7VGqiFhIQAkOtns92icnJyAIiIiKjw823cuJGhQ4eSnOzaPLhr16789ddfWsBYWerWrUvdunUr9Z6i8phHj8E08mZst0+FXFd2KjQmxB2oBUV4BWpKTCy6OpEAGDp0wr5mpdf9FCAuxE7CoA6Yho5An9AEAGevemTdMZ38XO8gDSBl/1niO8W6x1QnBM4lyvK/nYCaehaAns7d1Hvy33z74RpsTgWHoufHxXn8vOwHYhuGU//0TqJ0nXCgw4YB25wkkjZM4mCR+MrpVJkxcQNPf3Ctz5cSXmvUwsJRPKZRyqbXQgghhBDVo1oDtUaNGgFgtVpJSUkpcY1YYbXH2NjYYtuUZP78+YwaNYrsbNdUtj59+jBnzhzCwsIqdD9Rsyl6PdYcq/Y4NCZEO7Zi9ipTb2jbXrtmGjYK+7o14HSgxNTH0PlyjF17YOjRB8XkHZDpwiMw3ziGjO9/8Xn+tGPpZJ/J0R4HNG4AW13Haqo7StQlNKHzwDY87sxjwn/XkqO41mo6HSrHD6VxnFjQF/mdKCa2Orgnha1rjtKpZyOv816BWngEOvdSOsmoCSGEEEJUk2pdo9a6dWvtODExsdh2J0+eJO/cdCzPPmU1a9Yshg8frgVpQ4cOZf78+RKk1WKqqlKQ7T9QK8i1Ybi8l/bY0PEy7djYvRchP/5GyI+/Efq/mQQ++RzGfgN8grRC5lvGoV5xre/zO1VObD2pPbbUjcDQw7eao6FDFwCaD76Mx7un09J5FINqL9NrbBKaz0NP9aRDN/f+gDO/34TT4fRq51VMJDTMa42aU9aoCSGEEEJUi2rNqHXr1k07XrlyJT169PDbbsWKFdpxr169/LYpzoIFC7jpppuwWl0fyseNG8e3336LoZL3ohI1i73AjtPuDlhC63sEalkFBDz4KHlOB7o6kRgHD/Xqq4sqe3VQxWhEHTQSFvhWazy587R2bA42Yx5ym8+0SkOHztpxw38/xn0P3ob91J8kE8EJJYrjShSZShAB3XtgDjTBqsWY87NorR6l4dlklK//IOKOp9i+HlQVTh5NZ83iRHoNbg6A6rB7FSfRhUegWtwbXKtnz6CqaqWu4RRCCCGEEKWr1oxas2bNaN/eNa1s8uTJWmXHor7//nvAtT6tf//+Zb7/0aNHufHGG7Ug7R//+AcTJ06UIE14ZdOgSEYt24oSVY+gV94l4NGnznuD6fws/1tKOKzuCo3mYBP69p3Qt2rr1UbfoZN2rAQFEfj0K+h1CvVJpau6j+udq7int5N7X72W258ezNivHmFoGxsNVdc6TDUlmfAPn6JrY3eg9fsPm7FZXVk5z4qP4FqjdjrfzHzdZSRRB2xWr0BOCCGEEEJUjWoN1ADGjx8PwLZt27TNpz1NnTqVuXPnAnD//feXq+rjXXfdpW2Qfeutt/Lpp59KZkAAUJDtDp70Jj1BkYHaY9WpYs2r+L5kPs9VTKDmyRxsRlEUzDfdpp3TNWmOLiLSq52hXUfMt45zn1AUzLe7N2rXxcQS9P4ELPc8BIUBptPJoL0/oldcX4SkpuSw8JVJ2PfuQk1Pd9/LaOLE6QLeeXEpC/Rd+cxwPYeUeqhnksv/ooUQQgghxHmp9tTS/fffzzfffMPmzZt5+umn2blzJ7fffjsmk4mZM2fy6aefApCQkMCzzz7r1ffw4cM0btwYgP79+7N06VLt2uLFi1m8eDHgysSNHz+eLVu2lDqeTp06VcrrEhc3z4yaOdiEOdj7C4CCrALMQf7XnZWXZ0YtomE4aUfTfdqYQ1zPZehzBeY77sWxczvmcff5vZ/5tntwnjiGbeVyzGPv1KpMFlL0esy33IHhsm7kvPYs6qmT1CGLHvYdrNS7Mth/bbbSefsT1Hn0ca1fZmgMn72ykPxcV5BqU4z8n/4aHt+YSJMmzc/nRyCEEEIIIcqp2gM1vV7PnDlzGDhwIHv27GHy5MlMnjzZq01cXFy5KzR+99132nFaWhr9+vUrU7/ipl+KS4tnRs0cbEZn0GEMMGI7l0lzXQ8ppnc5n8sjUIttH0NGUqbX+rjCMcC5vd5uv7fE+ykGAwHPvU4AJW8Wr2/eiuDPJ5H3zsvY169hkHMT63WtsCpGcpUAPrNfza0/zCIeyMfId3l9SMvM8bp
HvmLmsx+O8XSvDOrFSfEdIYQQQoiqUu1TH8FVcn/z5s28++67dOnShZCQEMxmM61bt+bZZ59l27ZttG3btvQbedi6desFGq24FFiLZNQAzCHurFpZpiuWVX6We0P3oMhAIhN89wIsHENZKYpSpmm8utAwAl//APOd9xMSHcaVoUe1a8lKBJ8mX86fusv5UT+IJJs7MO3eMQSd6goms20GPnz2T9LO5Pjcv5DqsGP7ezH2LRvL9TqEEEIIIYR/1Z5RK2SxWHjqqad46qmnytwnISGh2AzYjh07Kmto4hLkOfXRdG6KoznYRPa55Vj5WVZ/3Sr2XFmeQaGZ6OZRpBw469Wm6NTLyqTo9VjG3o1l7N2MdDhxTPibRfMSURUFp6Jjkb6LV/vBI9sy+q4uNB71MFNsrkqsqWdy+c+/5zH05g50u6IpZov3Xx3WaT+T/+3nAAS+8i7G3mUv+iOEEEIIIXxdFBk1Iapa0amPABbPjFp2ZWbUPJ4rxBWoFVXejFpF6fQ6bnqkP/9od4I6aobP9S69GzHqnstRDAZ6DmzGCId7u4Czp7P53yerePqOqUz7bj1nTmVp16yL/9KO8yd9jer0ntoJkJqcRfqcuVj/muP3uhBCCCGEcJNATdRKRYuJuP5/YaY+FnhMfbSEmqnbwl+gduEyav60vvVqnrRPo6djp3aucTTc/a9+6HSuKZXGQUPo49zBNY61KB6Z69xsK/On7+CFe6czd8pW7GfP4jx4QLvuPJyIfY1770OAOT9t4dk7p/HM5yf57MMNrHxnMgX5lVdZsziqqnLyWDrWgrJtEi6EEEIIcbG4aKY+ClGV/GXUCisvuq5X3tRHz4yaJcRMVNMoUIBzsY9Or8NgqdpfRX2nrgTExnBD0gp6OHeRrERw+ejRmMzucehbt0NXP44BJ7fQznmYNe1vZd0RHQV5rqDH6VSZNXkTe5bt5CYCCCVP61vw0yQMPfuiKArL5u7h9x82A6AqevYojdizQmXK+p+5rG9jRt55GeF1Aqls1gI7X765mB0bThCXEMHTH1yLJcBY6c8DYN+9g/wvPkLfpDmWR/+NopPvwIQQQghxfiRQE7WS34zahSomkuk59dGCKdBIeHwY6ccyzp0zVfn+fopOh2noCPK//ZxYUolVUzHWqePdRlEwDriagh//j7qkM+LUdK5p1Ih1ibCUdqTbXD+3vUcK+NAwmlsdi2mhngDAsXcX9k3r2Gmrz09frPE7BmuBg9ULD7B7cxLjXxhA45bR2jWnU2XN4gOsXXKQ3KwCbDYH1gIHdpuDRs2juOvJPgSWkIW0We1MeH0RuzYlAXDicBqLftvJtbd2Op8fW7HyP/sAx77dOHbvwHDZ5Rj7DrggzyOEEEKI2kO+9hW1kjXHu8CH5/+h8tao2QvsOKwO7bHlXNaursc6taqe9ljIeNW17k2xAaVOpG+bgVdrx2pKMsbt6+mdu54nc3+kYwP3dMhsJZBv9dfyvWkIu5WGOFDY++2vfPPuUlSnq12Yms0d9vlc7tyDRXX/fNPP5vLeU3+wepFr+uThfSm8+8+5TPrvCnZvTuLIgbMkHUnnzKks0s/msnXNUb79zzKcDv/r3Gw2B1+8uUQL0gr9NW0HWRn5fvucD+fZMzj27XY//7rVlf4cQgghhKh9JFATtZL31EdX8GTxCJjyKylQKxrwFWbtPAuKmCppY+3y0kXUwXzbPQDo23dC37KNTxt9g0boW7fzOR+AldvSZnDLLS3Qq65AVFUUdqiN+D/DNbxlGMu3h5tiK3Bds5gU7rH/QXv1EDc5lvGS/X9c51hF4QxBu83BxA/+5r1/z+PtJ+ZwaG9KiWPfseEEs/632ee83ebg67eXsGP9cZ9r+Xk25k2p/G077Ou9M4b2jetkP0YhhBBCnDeZ+ihqJe+pj4UZNY81apU09dFz2qPepMdwbg1Y076NWf3depx2J40uj6+U56oIy9i7MN9wM1gCip1+GfDUS1rpfX2L1hT88B3YbJB2hm7bfybGfoifDANJVtz7w2UqQdqx3qDj3s4Z1F+R6jqhKBhVB32d24kLM/A/R1+yz/2c9u887fXcl/VJ4LK+CZhMBgwmPUtm72brGtdecH/8so0GTerQtV9jAJJPZvLzhDXs3HhC69+ldyMaNY9i5iTX/m5L5+5h4Ig2RMVUzmbmAPYN3hk0NeU0zmNH0DdMqLTnEEIIIUTtI4GaqJU8M10mf2vUKqmYiFdp/mCzFgxFNAhnzLejyUjKJKF7w0p5ropSAkou5KGPb0jQK+9qj53Hj2JbMA8Ax5YNxAFP2n/lYM9b2GBuz9bVR/CclXjnY71oMvNtCieAmkbejHXmVFBVmiRv5qnHRvLNn3kcO5iq9anfMJxbH+xOq06xXmNp0jKat5+cw8mj6QBM+nAFgcEmNq8+yt9/7sXpcGeyOvZoyL1PX4HqdLJ07h7SUnJw2J3M+t8m7vl35ezzpjrs5G9YzzJdJ9II5krnFuqQjX3jWgnULiFOhxOdXiagCCGEqFryL4+olaqqPL/nfSyh3mvRIhvXoUnvBHSGmvVraL7+Jp9zelQ6Xt2JB58fwH++GcF1rKGj8wB32OfTJfAUjgN7tbamwdd4FdsImvEdT717NQOua02jZpHceN/lvPjZCJ8gDcASaOThlwYSeO49sxbY+eiF+Sybu6dIkNaA+5+9AoNBh9Fk4Lrrm2nX1i45yNHEsz73BsjPtbF51REWzNhB2pmcUn8Wjt07+SuvFX/ou7NG35ZPDSM5pkRj37iu1L7i4qeqKl+/s5RHRv/Awt92lt5BCCGEqESSURO1TtECH+7y/Bc4oxZSPUVDKpu+RSv0bdrj2LXdfVKnw9CxCwAh9etw5WUh2NcsAiB/4pfgOPfztgSga9wU89g7sf29GFQV57EjsGgOtzw4GuepJHJff57cKckEPv8Ghg6dfZ6/bmwo9/y7H5+9spCiS8HqRFq4/q5udLuiibYfnJqXS9vpbxCj9uKU4iqY8tNTPzDwulaYW7VBb9Bx8lg629cdZ9+OUzjsrnTgn79u5x+vDPKqRlnUpt/WskTvHmO2EsgX+uHctuVvethsOJNPkf/dBHR1Y7Dc/4iU7a9hDuw8zYblhwCY+f1G+l/bCqNRX82jqjyqqoLTiaK/dF6TEEJcSuRTg6h1igZhWjERj33U7AV27B7BXIWfyzOjVk3VHS8E00jvrJq+RSuUYPe6L2PfK7Vj56FEd7uWrVH0BvRNmmO8eph2vmDytziSjpPz3BM49u1GTT1L3kfvoDp9KzuqdjtN5n3ONXZ3EY8gNY/rHCt5rvkOegxoqgVpALbli1FOJzHU4c5yHcwL5ZupSXz26kI+fnE+v3y9jt1bkrQgDSArI5/3n/mDLefWxBV1+kQGP6z2XddnU4xMclzB4i//JPfZx7H/vQTr9J+xLV3g9z7i4rV7y0nt2Fbg4PC+M9U4msrlzMwg+/7byLp1OPad26p7OEIIIfyQQE3UOp6l+XUGnVbgo2iZ/MqY/uhZTKTo1MeazNjnSpRId6bJ0Oly7+s9+4Kfb+
kNrdtrx5Y7HwBLAABqRjrZD9zuyq6d4zx2BPuaFT73KPh5EvYVS7nSuZV77PO4mb95xv4zfZ07UP9ehH3HFq/21r/mANBKPUoTZ5LP/YoK0DvQn5uOaitw8MUbi1k6Z7dXm4J8O1++toB89dyfHdXKmDHNMesKK2DqmPrHGX5Kbk4qrgDWsd17XCWx2RxM/Xotz909jUWzdpW5n6hcuzd7/3nZt+1kMS1rHtuCeTgPJ6KmpVLw64/VPRwhhBB+SKAmap2ipfkLC3wYLAav9WLl3UvN375eBZfg1EcAxWDActcDrgcBgRivGe59PSQUQ5fLffrpW7fVjnWRUZhvud19MT/Pp33BtJ+8HjsO7KPgx4na47ad6nHl5HcIatncfZsJH2mZOMfxo1qApAB3P38VHeoVEO9Mob56lnrGHOrGhtIwPogrHZt5yD6Ll/O/Y/xABUugEQDVqfLThDV88vICZkzcwPI/9jLpv39z4liW9pw3BW2h/9jePHGdhTA1Wzu/UdeSdw0386u+H8m7D/v5SbqoNhuq3Q5AZloe/33mTxb9toszp7L45eu1HDlw6WRyaoq8XKvPNhH7tp8upnXN4zx2lBTC+EXfn1V7z3/2gBBCiMongZqodfyV5gdQFKXCJfoXvb+ML675P9b9b5PXec/92CyXUKAGYLp6GCE//kboT7PQx/puMWDsc6XPuaJ7splHjUGJ8l4DZujSTTt2bN+CffcOwBXM5L73urbeTakbQ9DL76KLqotl/OPuPvv3YJs/F0D7P4AuoSnRfboy/pWrecwxgyft0/hX7g+8+kw7nohZz1DnOhqrp9Cj0mj+d/zr35cREeWuiLlj/XH+/HU7P3y6io0rDmvn+zm20rVPIxRFodHgnvzD/huxqjuwcip61ula8/bRrkz6YDlJR9K8Xq99724yb7yGrJuv5fDfW3nzsd9J3J2sXVdVmPLlWtmbrYrt234Kp9P7Z35g92nstksjqHGePM5U/RWs17Xi14y2l1S2UAghLhUSqIlax6s0f5HNpitSUCTzZCY7Zu/GXmBn3fcbcHh8kMvPzPd770uFrm6M19o0T4be/UHnnv6oi41HF1HHq41isWC5/1F3n179CHzrQ/RtO2jnCn79CVVVKfhxIs6D+7Xzgf98DiXItV+boW0HjFdepV3L+/y/2Hduwzp/nnbOdPW1KIqCvlFj9G3cUzDzvvoE++q/vQdvLaDOtM/5172NqG/ILPb1N3aeZKhzHYZuvVyvMaEJEXUCeNQ+kxvtS4lQ3Nk1JzpWLUrklfG/8clL89mzJQmHw8mpb/6Pg7nBLM9uwHvvbiDtTK7P8yTuSmbd0oPFjqM8nBnp2LdsRLVWTsGcvFwrv36zjilfriGnkqqlXgyKTnuES2udWtrxVI7oYrTHq+btqMbRCCGE8EeqPopax19p/kKeWa/8Mn7ozDzt/jDusDnJOp1NeHyY67my3M91qWXUSqMLC0ffsTOOzRsA32xaIdOVg1FCQlEz0zH2H4ii12O+cSy55woc2FcuJfuum3CeOObuc+1Ir8wbgOXeh7GtWQF5uZCfR86/H3ZtzA2g12McOMTdf8hw8s5VrXRs2aidV0LDUDMzzp3fgGnLBh5Fxy6lEclKOGeVMFIJIVUJIUrN4FbHYsy9+2Lo0cfVX1EwDRuJOvkbuhkO0e+Vh/j7o+ksSI0jTQnVnmfHhhPs2HACRQFVbQOGNq4L52bP6nQKN93fjd1bTmobfE//vw106tkQs8VY9jehCDUrk+wHbkc9m4Kh7wCCXnqrwvcC11q6z19bxL5tpwA4mpjKE29dfUlURvQsJKLoFNRz2bV9O07RrG296hpWpVDtdnadNXt9VbtpbRJjrXaMJvlYIIQQFwvJqIlax3uNmnfw5LWXWhnXqOUU2W8r46Q7A+P1XJdQMZGyMo8eox2bhgwvtp2xa3dMA65G0bs+JBp69kUX18B10en0CtKUejFY7v+Hzz10desR+NLbYDj3QbMwSAMMPfp4ZfOM/QdqhUw8BTz+NIaefb3OGXDSQT3EIOdmbnYsZbxjNs/bf+IB3XzqPng/gS+97VV23zzmToLe/5zgb37EfHl3enUO5Wn7VG62LyY2xO51b3+zGQPMCo++fhUDrmvDjfddjuHcusn0s7n8MdW7Op9qt5P39afkvPwUjqQT7N1+inVLD3I2Odv3xoB16QLUs651V/a/F2MvR4GTopxOlYkf/K0FaeAqZz/5o5U1fppm+tlcbVN1gB5XNtGO93q83ppKTTnNLhp4ncsvcLJt3fFqGpEQQgh/5KszUeuUlFGrUKB21nuqWkaSO1DznPp4KZXnLytjt16ETJ4BCuhifDewLo6i02G+cSx5H73jdV7friMBjz+DEhjk//m6difwudfIfeMF8Cjtb/LYCgBACQzCeMUgbH/O1s7pEppg6H0F+tbtyNq6CXJdAbiuSXMCHnsaNTMd6+/TsW9ci65pcwL/9QL6Js0pStHrMXS8zD3mZi3R/zWHrup+usUEcOzpl5k/Ywe7Nrmn1gWruUSoWdQnlUHN9CR0vhPVWkDoHz9yZSOVBYmuoHL+jJ30vqoF0fVd000Lpkwm59cpbFRasGLLb5y2udfUJbSIokvvRnTpnUDdWFc2z7bwT6+x5k38ihPjXkSn09G8XT2tsE5ZTP9uvbbHmKe1SxKpFxfKsDGdynyvi83uLe73JiommF5XtWD1Itc2E4m7krHbHBhqcNYw78hR9itxPufXLU3ksj4JVT8gIYQQfkmgJmodryxXSNGMmmcxkbKt4ckuklHLPBeoqarqdY9LcY1aWejqlz1A82S85jqcKck4Dh3A0KEzxr4D0NUtfcqZse8AAp54jrwP3gBcRUcM3Xr6tDMNGe4VqJlvvRNFp0OJqkvQ6+9T8NuvGDp0wjTsBpRzWTpjjz6oNhsYDGUOavTNW2rHzoP7ad0xhjZd4jjz91pSX3uZCLIx4ZFp267HmZlB/tefYvtrDv0xsC7wDjJsRuw2B1+8uZiEZpEo1nwKlu5np2EsOUoA2Lyf9/C+Mxzed4YZEzfSon0MVw+MpcGu7RSO+oASy9xdCRx/9i8A2nSJZezDvbQgsFBGai4H96RgDjASFhFAaEQAqxcdYMHMnVqblg2M2JNOkOioC8DvP2wmun4I3a9sWqaf0cVmj0eg1rpTLE1aRmEw6rHbHFgL7BzZf4ambWru9Me9G49hU3yn0G5fd5ycrAKCaunfVUIIcbGRQE3UOiVm1EJ8M2o5qbks/3QV5mATfR/uibHIGqGcIsUfMk66yrbb8uxeJfstoZbKeQG1hKLTYbnz/gr1NQ0ZhhIZiX3dKkxDR2hTKj3p27TH0OcK7CuWYrisu2s65DmGDp0xdOjsf1zG8q0R0zdpDjqdK8OXn4fzxDH0DRMIXP0XRtJdbTp3xXnkMGrqGXA4yHv3VezrVgFgxs61ecv5yeAa3/GDqRw/mHru7i2gSLwYERXoU5Bk3/ZT7Nt+ivqGUfR07mKXksAeXUOvNrs2JfHKQzMZPqYzA0e0ZtfmJFb8tY/t6
477VD/01KBBCLcd+Qynw8lnhutJUSIA+P7DFWxedQRLoAlLgIGAIBMNmtShRfsYgi/i3wVVVb3Wp7XuFIvRZKBJq2j2bXdNe9y3/XSNDtR27EoDXH/3tXEe5ohSjxwlALvdyaaVR+g7pEX1DlAIIQQggZqohUpeo+YO3AqLiSz8z1IOr3YVdIhuHkX769p49ck5W2SN2rmMWn5Wvtd5c4h3UCguLOPlPTFe7ptJK6QoCoHPv4F6JhklMhrFzwbdlUEJCEDXoBHOI65pgo4De9FFRWP7e7HWxnTNCBx7d2KdPgVAC9IKdVIPsEZpx0HVf3BgVG10de6jLztp/J/vODzpFzYt2c82XRNOK+61eSeVKGbo+xU7VluBgxkTN/D7/zZht/vuC1hUVEww90ZuwnLQ9Wf9bvuffGYeTY7TqH3o9/l5KBDXuA4tO8QQ1yiCoBAzgSFmgkPM5OVYOXE4zfXfkTRsVgedejbkimGtCQy6ML8/TocTRadoGdJTxzJI95jO3LJjfQBatI/RArW9209yzc0dfG9WA6iqys4T7ui+rfMwEUo2K/WuYj9rlyZKoCaEEBcJCdRErWMtY9XHgqwCkraf1II0gDMHz/rcr+jUx4ykzHPTHt0BoTHAiN5Qc9e0XKoUgwGlHGvnKkrfrKU7UNu/FzUnG/LPBfLBIRh790MXG68FapqAQFAUlNwc7rTOY3PDweQ3bIFt104c6ek40BEZFUAX5RCW04cByHvjBaL27uIq4CrnRg4pMSwJ6M5uWwxFJThPcq1zLekEM8vUj2yn6/ehaJAWGhGA0aQnMy0Pm9W1/UREVCAPj4kn8N0PtHZRZDLOOo+vLddjt/vPwqlq0axgyQ7vO8Ofv27nyuGtGXR9W0LCKicbl5Gay5/TtrPir32ERQRw2z960apTrNf6tAZN62jP17JDDHPO7b+euCsZu92pFXqpSU4cTiPN6s4Kt1KPUo80VuIK1PZvP0XamRwiovyvAxVCCFF1JFATtY7n1EdTUJGMWpFAbdW367yuZ53yrqanqqpPMRFrjpWCrALyM4tfCydqF33zltgWuQp52NeswHkmRbtmuvIqFJMZfYtW6OIaeFW4DHjkX66pkB+8SQBWeh2dC0fnet076NkvcezbS/4XHwLg2LvL63pj9RSNc2dxigiW6DuxTd+Meo0iGTYgiqZffXVu5uRpWhQcY56+B2t1rV1j1it07NmIvle3oHWn+uj0OhwpyWQtWUJueCyRPbtge/phim7/3Fg9xb9a7uPYFXeQl2sjP9dKfp6N9NQ89u84RVZ6PuWVn2vjj6nbWPjbTrr0akTrzrG06RxHeKSreEramRwSdydzaE8KTlWlbZc4WnWs77fgR1ZGPn9N286S2bu1oDM5L4sPn/+Lq0a14/gh94bkrTvFoubl4kxJJqF5LAaDDrvdSUG+naMHztCkVV2ve6uqyp5PJ7Ny4QEK6jXi2n+PIKGF94buF5LqcJSaGd62zv3nq4EzmVDyCFHzqKNmkKqEoaqwftlBrhrVvoS7CCGEqAoSqIlax3vqY/FVH88eSvNaYwaQVaTsuTXbij3fu+Q6uLJqns9T2/ZQE970zTwKihx3Z2gJCsZ801jg3B5sQ4aT/90EAIxXDMY46BoAbMsXY1+/2ue+xoFXY2jXCX2T5uR//7VWqbLw3oYu3bCfm2IZQxq3OpYw7oEuBIy8HgBr4LMU/PIDzhPHCMTKaMdyejh2cVqJoKXtGHWCr8XSqR+KXofz5AlyHrsPJS2VICDfZAarx+/S2Lso+HEiAJFbFtPoH/ehb9QK59kzOJNPoaYW4OxoIVmpz4GCCBL3pZOelktuVgE5WQVkZxVgMOiJbRhGXKNw4hpHkpGay5I5e8jLcX25YitwsHbJQdYucW3+Xb9BGAX5dlJTvLPai37bRWCwiQ7dGtCqU30y0/JIPpHJ6aRMjuw/i7XA93dWVeGvad6bPrduXYfsxx/AeXA/xiHDadyqM/t3nAZcZfoLA7Xc7ALWLE5k2cwtnDztBJrACdj2xBwG39CO4WM7Y7YYPJ5LJTUlB6NJT3CoBZ3OPRWxIN/O6RMZnDqWQXZWPkajAZNZj9GkJzDIRONmESgrFqFE1MHYvZfWz751E7lvPA9GE0Hvf44+Nt7nNQJsX3NYO26lHkWpEwWpZ+jsPMAivata6dolEqgJIcTFQAI1UesU5HhOfSw+o1Y0SAPIPJ3l9Ti7yPq0QhlJmVjz3GX4JFCr3fTN/K/5CXzmFa9tC0w3uoI21eHAfOMYbd1UwL9eIPeN53EeO4KuXn109ePQNW2OecSNgGu7AdM112Gd/rP73k+9hOGybmSfOIrz4AHXSZ0e85WD3M83dISruuaRQ9hWLMX6x+/EJ58iXj0DgHX2dNSsDCwPPk7Os4+jpnlMV/QI0oz9BmIedz+2lctxHnaVsc994wXU/DzUU+6phAChQBeDgctbtcU0ZDjGq0Zor9O2dhW5b70A2/NQwsJR6kTRv2dH1sQMYOHsvWRleGfjTh7LKPZnnpttZc3iRNYsTiy2TbCaSy/nTrbompF8rghKIYNBR8Mjq3Ee3O8a25+zaT68O/vPxXKbVx3BYXeyZ+tJDu5O9rumT1Vh/vQdbFl9hJvu705ejpVdm06wa3MSGal5gGtz85BwC6HhAeTmWElNzva7v16hEIONKwvW08O5i5D7H8J841hS9hzmrxd/YZNtGE4UWj87k673Dadd1zivDdKzMvI5uM/9HrYxJ6Nv1Qb7quV08QjUjh1M5cThNOISInye//C+M2xde5SW7WNo2bF+qdVPnU6V5fP2cPTAWQaMaEN84zolthdCCOEmgZqoVRw2h1cGzDejVnLBAmu2lYLsAi3AK1rxsVDGySyvb8ll6mPtpgQFo4uNx5nk3lDYfNs9GHv08W6n12O+5Q6f/ro6kQT/98sSn8N841hsK5agJp/GfOcDGHu5ioYEvvAmOU88gJqRjunaEejCvT98K4qCPqEJ+oQmmG+5A9vSBRRMmaytqbMtXYhtzUrIz3N38symWSxY7n0IRVEwj76VvPdd2yIUBmx+2e04dmwlb8dWnGmpWG65A+fJE+S+9SLkun6n1PQ01PQ0dAf3c8VNZgZMHM+ODcfZvTmJXZtPcKbINOTIesE0bV0Xa4GdneuPYStmjRxAoJrHlc6t9HLuxISd/s5tzNb1YI2+rdamSaso1N8mePVLOLEBcBV0Kdz+wJ+mzhNkKEGcUcIBSE7K4rNXFvpt63SqZKTmaYFbabLsRn7X92KJriN9v13OqVV6tuzNxUlLrQLoxhTY+NYSjCY9bbrE0rlXIzp2b8jOjSe0IDBUzSG+fgC6+q791OqSTnyIleNZrr8DP35xPvc/cwXN2rper8PhZN6Urcz5eSuqU2Xuz1tp3rYew8d2KjZgs9kcTPzgb22/vXXLDvHAc1fQ/vIGPm2FEEL4kkBN1Cqe2TTwzaj5y3w17tmIoxuO4bC5vjHPOp3tEaj5z6hlJmVi8Sh6YAmVQK2207frqAVqhm69MN9+T6XeXxcZRch3U1Cz
s9FFRrmft0Ejgr+b4toWoEXrEu+hGAyYBl2Dsc+V5L7yNPaNa10XPII0810PYr7hFuyb1+M4lIixaw/tw75xwNXkT/oK1WMNHoASEopSJxIlJBTHoUTIcQdZBf/3Bbr6sVh/+dF76qYH68xfCRlxI5f1SdA2ZE5OymDPxJkoyxfQyHmSCHM8pqbXYN+5jZy8v9mjNGS7rjFnjFFEdmlLvfhwwveuJWLbUhqppzFhxzTqFgztOsEbzzPKsYJW6jFmGvtRYAnlqsY5qJu9iwfFb52HwXSP3+xZEPl0deyhu3M30WRiQ898XVeWGTqVmCHzx2DUExMfSnhkIHabE2uBnYKkU5zOULErrn+2s5Qg5ul7wN48fPZoOMdmdbB1zTG2rjmGTqcQ4PFFVCv1KIa4OHSx7o2v+4Yc5+esJgCkn83l/af/4Ia7u3JZ7wS+e385B3ae9rr//p2n+e9zf9G8XT2uuakDbTrHotO7Cqzk59r44o3FXsVZrAV2Pn91Ebc90os+V0tlSSGEKI0EaqJW8Vw3pugVjAHevwImPyXAe97XjdSjaWSccJXdzzydTVTTSACyzxaXUctEkYya8GC58wHUnGyU0DAC7n8ERVf5FQMVswXF7FsVURcWji4svOz3sVgIfO09ct94Hvvqv7XzpuGjMN86DkVRMPbsi7FnX+9+RiNBr71H/k+TUIKCMXS6DEOny9BFuYtuqA47jr17yHvnZZwnT4CqkvfGC173CXjiGXQNE8h940XUsylgs5I/8SsCn37ZdY+cbIK+epOOq5ZrfZyJ+8hP3AeAGeioHqSj4yA4wNL6IQy9W5I9bQqorgIi5rF3a/v0Bb74FrmvP0dbxxFaW/+H0qQDulUpFI2vjA4rfRLyWXrAhMGop1mburRsE0XC1tnU3zofPSoEh6CPbQ37dnOtcy2X9WrMLycbc+xgKhHRQbTpHEvrzrG0ahyIMTiQzDyVzLQ8MtLyMJkNxDQII6pusBbwAORPmUzBd9+SSSBLdJ1YY2iLXfX+81NHzaCfcT8h+ans1CWwS9+YfNU97dHpVMnxKHDU2nkEXf0+6Oq717J1ydmG/b5bmP5/63E6VJxOlWnfrmfGxA04He6fhiXQSH6ue2r3/h2n2b9jARFRgfQY2IyO3Rvw84Q1HDngWyXX6VSZ/PFK0s7kMGxMpzJvHC+EELWRBGqiVinI8l6fVvRDgk6vwxRkwnou89ZiQDOim0YSGhOiBWpZHuvUPDNqYXGhWpuMpEyvDa6LZu5E7aOLrkvQK+9W9zDKTDGZCHzpbfI+fQ/b/LkYBw/F8vCTpX6w1jdvRdDL7xR/X70BQ5t2BL7+PtmP3uuTRTNdOxLT0OsBsNx5P3kfvAmAbdGfOEbdgjMtjfzP3veaRur3ecLCUTPSAcj/+XsMm9eD0xWk6erHYR57l9bW2Ls/loeeIP/T99EB7NnmDtJ0ekwjRmOdORWAa4/8wpAvpxBcLwJl23ryPn4D9fQp7V6WOx9AMRrJ27cbgPpb/uL5n38nJ8dBULARx6rlFPz8Go79e7Dq9QS1bENYxy40rBPpWtSWpMPZui26lq79Gq1/zqbgXIGZUHIZ2fQsw58bwR/fr2H9yqPUdabT17md9rEOgl9+m+wHbqOD4xB2xzJO3PsG21MsbF59lMw0d1bUpNporp5AVz/eK6PG2WQGXducJq2i+frtJdrG6Z5BWvcrmzDm4Z4cP5jK7B+3sGere3PwtDO5/DF1G39M3eb1XnTr34SB17fh89cWaeOY/eMW0s7kMvYfPdHra942B0IIURUkUBO1SkkVHwvVb1uPI+uOYbAY6HF3VwBC6gZr17NOu6dteZbmj21f3x3MJWcTWi9EuyZTH0VNpBgMBD7xLDzxbKXfW9+oMYEvvEHuC/8Ep2sqoa5JMyzjH9PaGAcPpWD6zzgPHwRVJftfD3tNmwQwXN4T8x33Yl+5DOvi+WAtwHzHfRj7DSDrzhshOwtyc7BvWq/1Md8zHsVo9LqPafgoHPv3Yvtzttd548Crsdx5P9b5c13PnZuD8c0nKSjI99pKAVzTW03DrofcXPI+fQ/sdtT0NBwb12POSCNnymScxzw2AXc4cOzajmPXdp+fj+HyHhi69iT/q4+1c7rYeILe/BBdRB3GPD+MG/9eQu57r6MEBhH00n/RN26K4fKe2NetwoCTZvsW0eHFN7n1oZ4c2pvC5lVHODprIT3yt2DGjq5+LLp69UGndwWxqorzVBJNWyfwwqcjmPjBcnZsOOH6mQUYGPNQT3oObAZA83YxPPn2EPZtP8VfP65n5/YUnKpvED/gmqbc9HBfdDqFZ/57LZ+8tIBT54rArPhrH5F1g7j21k4+/YQQQkigJmoZq2fFRz/THAGGvDiQvQsPUL9dPSIahAMQEuMOuorLqMW2q8fuv/aCCqpDJSXRPe1Hqj4K4ct4eU8CHnuKvE/fR4mIJPCFN72mbip6PZZ7/0HuC0+6TngGaYqC+fZ7MI+9G0Wnw9CqLZZ7HkJVVS3rZxlzJ/lff+r1nPpWbTH2G+gzFkVRCHjkXzgPJ+LYs8v9HDffjhIYhHnYSAqm/g9AqwTpvqke8023YR57F4reACGhGLr2wL5mBQC5L/1by+aVlX39Guzr17jHVyeSoHc+QRfhrppo7Hslod17g17nel7ANGwk9nWrALCtXIrjwF50TVvQtHVdmjQNI3PqoxQumtPHxrs2fa9bT6vO6Tx5An3DBELCLPzjlcGsWXyAk0fT6TukJXVjQ73GqDocNNqziDu2fUWWTc9mXTPW61pySolEUVWudq5n4KqZOAfVQde6HVH1Qnj6/Wv57MU/SdyXSqPmUQy8vi1CCCH8k0BN1Cpem10XMx3REmqh4w3tvM6F1nNn1DI9qs15lucPrR9KcHQw2ef2WivIkg2vhSiNaej1GPsPBqMRxeT75YmhW0/0nS7DsWWjdk7f6TIC7nsEfYtWPu09p2aaRoymYNav3lMTH3ik2OmbislM4EvvkPOvh3AmHcc04kb0jRq77nX9jRT8Ph3yvNel6lu3I+CJZ9A3buZ13njlVVqg5hWkKQrGvgMw3XQbAI5tm7Dv3gFWKyjgPJOC88A+74EFBRP09kfo6sdSVNGfmaFbT5TouqgpyeBwkD1+HLrYeAx9+mNo1U4L0tDpUaJdFR31sfHYCwM1jymlOp1Cr0HNtcdqTg6OfbtR7Taw2SiY9jOO7ZsBCAH6ObfTr85pzoY1Qk3cSxSZcMYVqIZMno4SEEhQiJkH4nby+75UhjRugMmaAwHhft8PIYSo7SRQE9VGVVVseTZMgSWXxK9MeenudRrlyXKF1PPNqKmq6lWePygqkLDYEC1Q8yQZNSGKpwQFFX9NUQh44lny3n7JtQ/cmDsxdOtVpiIUismM5b5/aMVKDH0HuKo8lkAXXZfgr37AeSYZncem0bqougT951Psq5ajhIahi41HF98QXYNGfsdi7NmXvMBAbbsBDAaMg67BfNNt6Bs00toZWrbG828HVVWxr11J/sQvXfvfBQQS9Np76Js0pywUvQHTdaO
1dW3gCr6sv/yIZ81bpW49FIPrI4AuNg42uc7bVi1H36gJ+rbtteymqqrY5s0i7+tP3K+n6PPWicJy70MYr7yKUEXB+uuP5E/6ChwO1PQ0rPNmYR51K46jh1EWzmGE0wlzwdasIeZhI8v02oQQoraRQE1UC9WpMuOJ2ZzYfpI+D/Sgy80dq+R5c1M9AqvIwDL3C/HIqOWczcVudWDLteL0KNMdHBlEWP1QTmw56dNfMmpCVJw+Np7gT/+vQn1N/Qeh6A04T5/ENPyGMvVRLBb08Q19zhtatcXQqmxT9ZSAAAKfepmC6T+jb9EK8w23oqtbr/R+ioKxRx8M3XrhPHgAJTLKa7pjWZhH3Yqi02FdMt83O3eO3qOIiGdA6tiykZwtG8FoRN+6HYZOl2HfvgXH5g3FPp/xikFYHvk3utAw9xhuuQNnRjrWaT8BUDDtJ0zDR5E/8Uv3msS4BpiGDC/XaxNCiNpEAjVRLU7tOs3xc/vrbJyytcoCtRyPTWUD65QjUKsb7Nqq6NysoeyUbGx57vLUBrMBU7CJsCJrOAp5VoAUQlQtY58rqud5e/fH2Lt/hfoqOh36ZhXba0wxGl1r5m66DWfyKWwrl2Fb9BeOvbu0NnqPgNPQvTdM/ApsHjk3mw3Hts04tm32vrklAF10XdDpUELDMV13A6YrBvsdh3nUrVhn/Qo2G+qZFPInfIh9xVL39bse0LJ6QgghfMnfkKJaZHlMD8xNy8Vhd6A36C/483pl1MoRqOmNeoIiA7Wpjlmns3FY3etOgqICURSF0Pp+AjWl+MIlQghxIenqxmAeeTPmkTfjOHII29KFoKqYbxyrtdE3TCDkuynYVizBvnkD9h1bfdbiARh69SPg0ae8NlQv8bmjojEOHopt3iwArHNnup+zeSuMfQec56sTQohLmwRqolrkemS2UCEvLY/g6ODiO1SSHI9ALTAyoFx9Q+qGeARqWTid7r2FgiJda2z8ZdTMQSavza+FEKI66Bs1Rj/uPr/XdPVjMd84FvONY1Htdhz7dmPfshH7lg2QnY1p9BiMVw4u9wbV5ptuc2154HR6nbfc+9AF2fRdCCEuJRKoiWrhGTC5HldNoOYZIJYnowYQGhPMqV2nAcg8ne31gSU4ynUvf4GaTHsUQtQkisGAoU17DG3aw5g7z+te+rgGGPsNcGXyzjF0uRxDl27nOUohhLj0yddZolp4bhTt7/GFYMu3ee2jVp5iIlCk8uOpLK891IKiXBm1gHALxgDv7z+kkIgQojYz33y712PLPQ9X00iEEKJmkUBNVIvcIhm1oo8vzHN6TLdUICCsnFMfPSo/ZiVnewWXwecCNX/r1KQ0vxCiNtM3a4nlgUfRNWqM5eEn/e5/J4QQwpdMfRTVoujUx6oI1DyfMyA8AJ2hfN9TeGbUMk9lYfbYMNszOxcWG8rZg6naY8moCSFqO/PoMZhHj6nuYQghRI0igZqoFkUDs6qY+ljRio+FQj0yatnJ2djy7O77Rbk37A0rklGTQE0IIYQQQpSXTH0UVc5pd5Kblud1rmiG7ULwDAYD65Rv2iNASIw7o+awOb0DvyjvjJonmfoohBBCCCHKSwI1UeXyMvK0jaMLVfUatYpk1MxBJkzB/vdDC6rjzqiFxoZ4XZNATQghhBBClJcEaqLK+ZvmWNVTHwMrEKgBhNYL8TlnCjJhCjRqj32mPoZKoCaEEEIIIcpHAjVR5fxNc8xNzUNVVT+tL8zzVmTqI3hXfixUtMx/aEwIeOwJKxk1IYQQQghRXhKoiSrnL3tmL7BjzbVd0Oc932Ii4D+jFhwd5PXYYDZ4tavocwkhhBBCiNpLAjVR5Ypbj5Z7gac/5nisUavo1Ef/GbUgn3Pd7+qKKchEk94JxLStV6HnEkIIIYQQtZeU5xdVrrj1aDmpuUQ0DL8gz6mqaqVk1PwGalG+92ozpCWtBjdHp5fvQoQQQgghRPnJp0hR5TyrL3qfv3AZtYKsApx2p/Y4MLKCUx9j/Ex99JNRAyRIE0IIIYQQFSafJEWVyzmbU8z5CxeoeRYS0Rt1mIsps1+aED9r1Pxl1IQQQgghhDgfEqiJKueZUQuu655KeCE3vc4tsj5NUZQSWhcvMCIAvdH71yYoyn9GTQghhBBCiIqSQE1UKVVVvTJndZtHaccXspiIZxavooVEABSd4hVcAgRXcBqlEEIIIYQQxZFATVQpa64Ne4FdexzVLFI7royMWubpLNZOhoGUrgAAJ49JREFU2kDSjlNe5z0zaudbLr/oOrXAYtaoCSGEEEIIUVESqIkq5Zk1U/QKUY3ruK8VU2SkrJx2J7/9ay5rJm5g5j/nkJvuvl9lbHZdKMQjo2YJs2Aw6c/rfkIIIYQQQhQlgZqoUl4BU0QAQR6bRRdXZKSs9i1JJO1oOgD2fDsnt7uzap4BYtB5TlUM8cioBcv6NCGEEEIIcQFIoCaqlNdeZpFBXtMQ8zLycdgdFbqv6lTZ8OMmr3PJ+89ox94ZtfML1KKbuqdr1kmIOK97CSGEEEII4Y9seC2qlGchkcA6Ad7TEFXIS8+vUJbq4MrDnD2U5nUuxSNQq8w1ak16J9D5pg5kJmXS4+6u53UvIYQQQggh/JFATVQpz8xWUJ1AjBYjpiAT1hyr6/rZ3HIHaqqqsv6HTT7nvQO1ysuoKTqFfg/3Oq97CCGEEEIIURKZ+iiqlL+1Yp5ZtdwKVH48uuE4p/ek+JzPTskhLz0Ph91BXka++3nPs5iIEEIIIYQQF5oEaqJK+Vsr5lncoyIl+j2zaQ0vjycg3KI9Tt5/hrw072qS55tRE0IIIYQQ4kKTQE1UqRw/GTXPNWPl3fQ6aftJTmw5qT3udnsXoj020U7Zf4Ycj/VppmATBrPM+BVCCCGEEBc3CdRElcotskYNvDNc5c2orf9hs3Yc2z6GuI6xRQK1s36fUwghhBBCiIuZBGqiyjjsDvLS3WvFAiN9pz6WZ9PrtOPpHF5zVHt8+e1dAKjrGagdOFOpm10LIYQQQghRFSRQE1Wm6Foxvxm1cmx6fXi1O0irkxBBo24NAIhu5g7U0o6lk3Ei0+c5hRBCCCGEuJhJoCaqjOf6NM+1Yl5r1MqRUTu81h2oNe7VCEVRAAiPD8MYcG4dmuqqCllICokIIYQQQoiaQAI1UWVyitl0OtCz6uPZXFRVLfVetnwbJ7a6i4gkdG+oHSs6haimkdrj5L3u0v0SqAkhhBBCiJpAAjVRZXI9pjV6Bmqex/YCO9ZcW6n3Or4lCYfVAYApyET9dvW8rnsWFPEke6gJIYQQQoiaQAI1UWU8M2qema2AMAuKXtEel2XT6yNrjmnHDS6LQ2/Qe10vLlCTjJoQQgghhKgJLppALS8vj3feeYcuXboQHBxMSEgIHTp04LXXXiM9Pf287z9lyhQGDhxInTp1sFgsNG3alPHjx7Nv377zH7woE89CIZ6VHhWdQmBEgEe70gO1w+vc69M8pz0WqltsRk0CNSGEEEIIcfG7KHb+TUpKYvDgwezatcvr/Pbt29m+fTsTJ05k7ty5tGnTptz3ttlsjB
kzhmnTpnmdP3jwIF9++SWTJ0/m+++/Z/To0ef1GkTpcr0yat5TEIPqBJJzJvdcu5IDtbTj3pUcC6s9eqqTUAedXofT4fQ6Lxk1IYQQQghRE1R7Rs1utzNixAh27dqFoiiMHz+exYsXs3TpUh599FF0Oh2HDx/muuuuIzMzs/QbFvHvf/9bC9IGDx7MrFmzWLVqFe+//z7h4eHk5uZy2223sXHjxsp+abWKvcDO4TVHycvIL7aN18bTkUFe17wKipQSqHlOe4xsXIeQusE+bQwmPXUSIrzOKTqFgHBLifcWQgghhBDiYlDtGbVvvvmGDRs2APDxxx/zyCOPaNf69+9Pz549ufXWW0lMTOSDDz7g1VdfLfO9d+zYwaeffgrAqFGj+PXXX7US7j179uTaa6+lZ8+epKen8+STT7Js2bJKfGW1hzXXxrRHfiPlwFlC6gVzy9ejCAz3LdrhOaXRX0bNXzt/PMvyJ3T3zaYVim4eyZnEs9rjgPAAdPpq/25CCCGEEEKIUlX7p9ZPPvkEgNatW/Pwww/7XL/lllsYNmwYAJ9++il2u73M9/7ss89wOp0YjUY+/PBDLUgr1KpVK1544QUAli9fLlm1ClBVlYX/WUrKAVdAlHU6m4XvLPUpsa+qqlemzCej5rWXWvGBmr3AzvEtSdrjRn7WpxUqWlBEKj4KIYQQQoiaoloDtT179rBnzx4AxowZg07nfzjjxo0DIC0tjSVLlpT5/rNmzQKgX79+NGjgP/Nyxx13aMczZswo872Fy+ZftrF/SaLXuUOrj7B9lvd6Q2u2VSunD75Bk2dxkZI2vfYsy28MMBLbPqbYtkULisj6NCGEEEIIUVNUa6C2atUq7bhfv37FtuvTp492XNbpiQcPHuTUqVOl3js6OpqWLVuW697C5dimE6z4co32WG90/3Fa/vkqzh5O1R57ZtN0Bh2WUO+1YoFlnPp4xGPaY8PL4tAb9cW2jWomgZoQQgghhKiZqjVQ2717t3bcrFmzYtvFxMQQEBDg06cy7g3QuHHjct1bQObpLP54ZQGq0zXFMaReMLd+M1or1uGwOvjz9UXYz2W/vNanRQSg6LynoXpm2EoqJnJ4rbuQSEnTHgHMQSbC4kLdzxEpgZoQQgghhKgZqrWYSFKSa62RwWCgfv36JbaNjY0lMTFR61PWewM0bFjyB/q4uDgAUlNTKSgowGw2l+k5ipOcnExKSkq5+hTdmuDAgQPnNYZCZxLPkHYs47zvo6pQkF1AdnI22WdyST+WTn5WAQB6o55Od7TlVO5JGt0Ux5IP/wbg5NYkMh9LI6JhBKlH0jiZ43pPIuvWYefOnV73zzqTrV0nF6Z//BuoKqpTxZ5vpyDHSn5mAYf2HtH65IXn+NynqILIPE7uc933jCOu1PZCCCFqvqZNm2KxSJVfIUTNVq2BWlpaGgBBQUE+hT6KCgpyFZ/IyChb0FF4b4DgYN/y7f7uXXj/unXrluk5ijNhwoRyVaf05/rrrz+v/lXtreGv+L/grz7LRnhqmp/znjaU/pzv9n+99EaljUUIIcQlZ8eOHbRt27a6hyGEEOelWqc+FhS4MjJl+darcOpjYZ+y3rss9y+8d3nuL4QQQgghhBAXSrUGaoVVHkvLpgFauffiKkMWd++y3N+zlHxZ7y+EEEIIIYQQF0q1Tn0snJKYn59fatvCNmVdP+Y53bG0+3teP9/1aQAPPfQQN954Y7n6ZGZmsmHDBkJDQwkPD6dBgwYVGsuBAwe8pk3+9ttvpRZTETWLvMe1g7zPtYO8zxdG06ZNq3sIQghx3qo1UAsJCQEgN7f4Kn+FcnJyAIiIiCjXvT37lnZvRVEIDw8v0/1LUrdu3Qqtc+vZs+d5P3dRzZo1k3n6lzh5j2sHeZ9rB3mfhRBCFKrWeX6NGjUCwGq1llolsbCKY2xsbLnuDXDixIkS2xZej46OxmCo1thVCCGEEEIIIao3UGvdurV2nJiYWGy7kydPkpeX59OnMu4Nrs2xy3NvIYQQQgghhLiQqjVQ69atm3a8cuXKYtutWLFCO+7Vq1eZ7h0bG0t8fHyp905JSWHfvn3lurcQQgghhBBCXEjVGqg1a9aM9u3bAzB58mSv6ouevv/+e8C1Pq1///5lvv/IkSMBWLBgQbEbZRfeG2re3mVCCCGEEEKIS1O116IfP348ANu2beO9997zuT516lTmzp0LwP3331+uSoj3338/er2egoICHnzwQRwOh9f1vXv38uabbwLQvXt3rwyfEEIIIYQQQlSXag/U7r//fjp37gzA008/zbhx41i4cCHLly/niSeeYOzYsQAkJCTw7LPPevU9fPgwiqKgKApXXHGFz73btWvHQw89BMDs2bPp168fM2bMYPXq1Xz44Yf07NmT9PR0jEYjn3322YV9oUIIIYQQQghRRtVe4lCv1zNnzhwGDhzInj17mDx5MpMnT/ZqExcXx5w5cwgLCyv3/T/44AOSkpKYPn06q1atYtWqVV7XzWYz33//PV27dj2v1yGEEEIIIYQQlaXaM2rgKvyxefNm3n33Xbp06UJISAhms5nWrVvz7LPPsm3btgrvK2M0Gpk2bRpTpkxh0KBBREZGYjQaiY+P584772TTpk3cfPPNlfyKhBBCCCGEEKLiqj2jVshisfDUU0/x1FNPlblPQkJCsQVIirr55pslIBNCCCGEEELUCBdFRk0IIYQQQgghhJsEakIIIYQQQghxkblopj6KyhEdHc3LL7/s9VhcWuQ9rh3kfa4d5H0WQghRHEUt6yIvIYQQQgghhBBVQqY+CiGEEEIIIcRFRgI1IYQQQgghhLjISKAmhBBCCCGEEBcZCdSEEEIIIYQQ4iIjgZoQQgghhBBCXGQkUBNCCCGEEEKIi4wEakIIIYQQQghxkZFATQghhBBCCCEuMhKoCSGEEEIIIcRFRgI1IYQQQgghhLjISKAmhBBCCCGEEBcZCdSEEEIIIYQQ4iIjgZoQQgghhBBCXGQkULtE5OXl8c4779ClSxeCg4MJCQmhQ4cOvPbaa6Snp1f38GqNAwcO8Nhjj9GuXTtCQkKwWCw0btyYO+64g/Xr15faf8qUKQwcOJA6depgsVho2rQp48ePZ9++fWV6/rVr13LTTTcRGxuLyWQiLi6OG264gUWLFpWp/7Fjx3jkkUdo3rw5FouFqKgo+vbty3fffYfT6SzTPWqj//znPyiKgqIo2O32EtvKe1wzzJ8/n5tuuokGDRpgNpuJjo5m6NChzJ49u9S+8h4LIYSoFKqo8U6cOKG2adNGBfz+l5CQoO7cubO6h3nJ++qrr1STyVTs+wCozz33nN++VqtVHT16dLH9AgMD1V9//bXE53///fdVnU5X7D3+/e9/l9h/6dKlalhYWLH9BwwYoGZlZVX453OpOnDggBoQEKD9nGw2m9928h7XDDabTb3jjjtK/D2+//77VafT6dNX3mMhhBCVSQK1Gs5ms6ldu3ZVAVVRFHX8+PHq4sWL1aVLl6qPPvqo9g9+06ZN1YyMjOoe7iXrt99+UxVFUQG1Tp066uuvv64uW
[base64-encoded PNG output removed: training-metrics figure]
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Evaluate results\n", + "model_version = 0 # 'v_num' from training output above\n", + "model_path = models_dir / 'lightning_logs' / f'version_{model_version}'\n", + "metrics = pd.read_csv(model_path / 'metrics.csv', index_col=1)\n", + "\n", + "fig, ax = plt.subplots(1,1, figsize=(2,2))\n", + "\n", + "for col in metrics.columns.difference(['epoch']):\n", + " metric = metrics[col].dropna()\n", + " ax.plot(metric.index, metric.values, label=col)\n", + "\n", + "ax.legend(loc=(1, 0.33))\n", + "ax.set_ylim(0, 1)\n", + "ax.set_xlabel('Step')" + ] + }, + { + "cell_type": "markdown", + "id": "e73687e1-ee8f-46e9-8bd2-1ddc571ef94b", + "metadata": { + "id": "e73687e1-ee8f-46e9-8bd2-1ddc571ef94b" + }, + "source": [ + "# **3. Make Predictions**\n", + "\n", + "Once the Segger model is trained, it can be used to make predictions on seen or unseen data. This step involves using a trained checkpoint to predict cell boundaries and refine transcript-nuclei associations.\n", + "\n", + "Key parameters for making predictions:\n", + "- **`--checkpoint_path`**: Path to the trained model checkpoint, which stores the learned weights.\n", + "- **`--batch_size`**: Batch size used during inference.\n", + "- **`--score_cut`**: Defines the score threshold for classifying predictions. Higher values of `score_cut` make the model more conservative in associating transcripts with nuclei.\n", + "- **`--receptive_field`**: These parameters once again define the nearest neighbors for nuclei (`k_bd`) and transcripts (`k_tx`) and their distances (`dist_bd` and `dist_tx`) during the prediction stage.\n", + "- **`--use_cc`**: Used when some **transcripts are not directly associated with any nucleus**—a common scenario when a nucleus isn't captured on the slide or within the field of view. In these cases, Segger uses **connected components (CC)** to group such \"nucleus-less\" transcripts into distinct cells. Even though these transcripts lack a directly associated nucleus, they likely still represent a real cell, and grouping them together ensures that these cells are not discarded.\n", + "\n", + "The predictions can be saved and visualized to assess the segmentation quality.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "d4279c71-4660-46fc-a9e5-834e25d31f53", + "metadata": { + "id": "d4279c71-4660-46fc-a9e5-834e25d31f53" + }, + "outputs": [], + "source": [ + "# Checkpoint directory for Lightning model above\n", + "model_version = 0\n", + "\n", + "# Load in latest checkpoint\n", + "model_path = models_dir / 'lightning_logs' / f'version_{model_version}'\n", + "model = load_model(model_path / 'checkpoints')\n", + "dm.setup()\n", + "\n", + "receptive_field = {'k_bd': 4, 'dist_bd': 12,'k_tx': 15, 'dist_tx': 3}\n", + "\n", + "# Perform segmentation (predictions)\n", + "segmentation = predict(\n", + " model,\n", + " dm.train_dataloader(),\n", + " score_cut=0.33,\n", + " receptive_field=receptive_field,\n", + " use_cc=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9807abf3", + "metadata": { + "id": "9807abf3" + }, + "source": [ + "### **3.2 Faster Prediction with Segger**\n", + "We introduce a faster and more efficient pipeline for making predictions using a segger model. 
This new method accelerates the segmentation process with a CUDA-accelerated **nearest-neighbors search** based on [CAGRA](https://docs.rapids.ai/api/cuvs/stable/python_api/neighbors_cagra/) and **parallel processing**.\n",
Larger batch sizes speed up inference but require more GPU memory (default: 1).\n",
We expect most scores to cluster toward higher values, which indicates strong overall performance of the model in associating transcripts with nuclei.\n",
[base64-encoded PNG output removed: Segger similarity score histogram figure]
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(1,1, figsize=(2,2))\n", + "sns.histplot(\n", + " segmentation['score'],\n", + " bins=50,\n", + " ax=ax,\n", + ")\n", + "ax.set_ylabel('Count')\n", + "ax.set_xlabel('Segger Similarity Score')\n", + "ax.set_yscale('log')" + ] + }, + { + "cell_type": "markdown", + "id": "5492fb96-bf8e-49d5-b40e-7e6b3f871bbe", + "metadata": { + "execution": { + "iopub.execute_input": "2024-09-11T22:34:15.990223Z", + "iopub.status.busy": "2024-09-11T22:34:15.988880Z" + }, + "id": "5492fb96-bf8e-49d5-b40e-7e6b3f871bbe" + }, + "source": [ + "#### The Importance of the Receptive Field in Segger\n", + "\n", + "The **receptive field** is a critical parameter in Segger, as it directly influences how the model interprets the spatial relationships between **transcripts** and **nuclei**. In the context of spatial transcriptomics, the receptive field determines the size of the neighborhood that each node (representing transcripts or nuclei) can \"see\" during graph construction and model training. Segger is particularly sensitive to the size of the receptive field because it affects the model's ability to propagate information across the graph. If the receptive field is too small, the model may fail to capture sufficient context for correct cell boundary delineation. Conversely, a very large receptive field may introduce noise by linking unrelated or distant nodes, reducing segmentation accuracy.\n", + "\n", + "#### Parameters affecting the receptive field in Segger:\n", + "- **`--r`**: This parameter defines the radius used when connecting transcripts to nuclei. A larger `r` expands the receptive field, linking more distant nodes. Fine-tuning this parameter helps ensure that Segger captures the right level of spatial interaction in the dataset.\n", + "- **`--k_bd` and `--k_tx`**: These control the number of nearest neighbors (nuclei and transcripts, respectively) considered in the graph. By increasing these values, the receptive field is effectively broadened, allowing more nodes to contribute to the information propagation.\n", + "- **`--dist_bd` and `--dist_tx`**: These parameters specify the maximum distances used to connect nuclei (`dist_bd`) and transcripts (`dist_tx`) to their neighbors during graph construction. They directly affect the receptive field by defining the cut-off distance for forming edges in the graph. Larger distance values expand the receptive field, connecting nodes that are further apart spatially. Careful tuning of these values is necessary to ensure that Segger captures relevant spatial relationships without introducing noise." + ] + }, + { + "cell_type": "markdown", + "id": "7ece1ac0-0708-45e2-87fc-1b25782831f8", + "metadata": { + "id": "7ece1ac0-0708-45e2-87fc-1b25782831f8" + }, + "source": [ + "# **4. Tune Parameters**" + ] + }, + { + "cell_type": "markdown", + "id": "896b8288-5287-4d10-a206-e68c0e4731c6", + "metadata": { + "id": "896b8288-5287-4d10-a206-e68c0e4731c6" + }, + "source": [ + "### Evaluating Receptive Field Parameters with Grid Search\n", + "\n", + "To evaluate the impact of different receptive field parameters in Segger, we use a **grid search** approach. The parameters `k_bd`, `k_tx`, `dist_bd`, and `dist_tx` (which control the number of neighbors and distances for nuclei and transcripts) are explored through various configurations defined in `param_space`. 
Each combination of these parameters is passed to the `trainable` function, which creates the dataset, trains the model, and makes predictions using the specified receptive field.\n",
"id": "fbd831c9-3a50-4e3b-97d3-3c152ae01188" + }, + "outputs": [], + "source": [ + "def trainable(config):\n", + "\n", + " receptive_field = {k: config[k] for k in ['k_bd', 'k_tx', 'dist_bd', 'dist_tx']}\n", + "\n", + " # Dataset creation\n", + " xs = XeniumSample(verbose=False)\n", + " xs.set_file_paths(transcripts_path, boundaries_path)\n", + " xs.set_metadata()\n", + " try:\n", + " xs.save_dataset_for_segger(\n", + " processed_dir=config['data_dir'],\n", + " receptive_field=receptive_field,\n", + " **dataset_kwargs,\n", + " )\n", + " except:\n", + " pass\n", + "\n", + " # Model training\n", + " ls = LitSegger(**model_kwargs)\n", + " dm = SeggerDataModule(\n", + " data_dir=config['data_dir'],\n", + " batch_size=2,\n", + " num_workers=dataset_kwargs['num_workers'],\n", + " )\n", + " trainer = Trainer(\n", + " default_root_dir=config['model_dir'],\n", + " logger=CSVLogger(config['model_dir']),\n", + " **trainer_kwargs,\n", + " )\n", + " trainer.fit(model=ls, datamodule=dm)\n", + "\n", + " segmentation = predict(\n", + " load_model(config['model_dir']/'lightning_logs/version_0/checkpoints'),\n", + " dm.train_dataloader(),\n", + " receptive_field=receptive_field,\n", + " **predict_kwargs,\n", + " )\n", + "\n", + " metrics = evaluate(segmentation)\n", + "\n", + "\n", + "def evaluate(segmentation: pd.DataFrame, score_cut: float) -> pd.Series:\n", + "\n", + " assigned = segmentation['score'] > score_cut\n", + " metrics = pd.Series(dtype=float)\n", + " metrics['frac_assigned'] = assigned.mean()\n", + " cell_sizes = segmentation.groupby(assigned)['segger_cell_id'].value_counts()\n", + " assigned_avg = 0 if True not in cell_sizes.index else cell_sizes[True].mean()\n", + " cc_avg = 0 if False not in cell_sizes.index else cell_sizes[False].mean()\n", + " metrics['cell_size_assigned'] = assigned_avg\n", + " metrics['cell_size_cc'] = cc_avg\n", + " return metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba2dcc9a-3a06-4b84-a487-59a768eed5d5", + "metadata": { + "execution": { + "iopub.execute_input": "2024-09-12T01:16:35.184598Z", + "iopub.status.busy": "2024-09-12T01:16:35.184180Z", + "iopub.status.idle": "2024-09-12T01:19:55.171470Z", + "shell.execute_reply": "2024-09-12T01:19:55.170810Z", + "shell.execute_reply.started": "2024-09-12T01:16:35.184582Z" + }, + "scrolled": true, + "id": "ba2dcc9a-3a06-4b84-a487-59a768eed5d5" + }, + "outputs": [], + "source": [ + "param_space = {\n", + " \"k_bd\": [3, 5, 10],\n", + " \"dist_bd\": [5, 10, 15, 20],\n", + " \"k_tx\": [3, 5, 10],\n", + " \"dist_tx\": [3, 5, 10],\n", + "}\n", + "\n", + "metrics = []\n", + "\n", + "for params in itertools.product(*param_space.values()):\n", + "\n", + " config = dict(zip(param_space.keys(), params))\n", + "\n", + " # Setup directories\n", + " trial_dir = tuning_dir / '_'.join([f'{k}={v}' for k, v in config.items()])\n", + "\n", + " data_dir = trial_dir / 'segger_data'\n", + " data_dir.mkdir(exist_ok=True, parents=True)\n", + " config['data_dir'] = data_dir\n", + "\n", + " model_dir = trial_dir / 'models'\n", + " model_dir.mkdir(exist_ok=True, parents=True)\n", + " config['model_dir'] = model_dir\n", + "\n", + " segmentation = trainable(config)\n", + " trial = evaluate(segmentation, predict_kwargs['score_cut'])\n", + " trial = pd.concat([pd.Series(config), trial])\n", + " metrics.append(trial)\n", + "\n", + "metrics = pd.DataFrame(metrics)" + ] + }, + { + "cell_type": "markdown", + "id": "dcfa5570-ada2-4102-aae0-a3830d304c5f", + "metadata": { + "id": "dcfa5570-ada2-4102-aae0-a3830d304c5f" + }, + 
"source": [ + "### Interpreting Output Metrics\n", + "\n", + "The key output metrics include:\n", + "- **`frac_assigned`**: The fraction of transcripts that were successfully assigned to a cell. A higher value indicates that the model is doing a good job associating transcripts with nuclei, which is a strong indicator of successful segmentation.\n", + "- **`cell_size_assigned`**: The average size of cells that have assigned transcripts. This helps assess how well the model is predicting cell boundaries, with unusually large or small values indicating potential issues with segmentation accuracy.\n", + "- **`cell_size_cc`**: The average size of connected components that were not assigned to a cell (i.e., nucleus-less regions). Large values here may suggest that transcripts are being incorrectly grouped together in the absence of a nucleus, which could indicate problems with the receptive field parameters or the segmentation process.\n", + "\n", + "These metrics illuminate the effectiveness of the model by highlighting both the success in associating transcripts with cells and potential areas where the model may need further tuning.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a89aed4-c53b-460f-8a6f-f690920b6829", + "metadata": { + "execution": { + "iopub.status.busy": "2024-09-12T01:19:55.171961Z", + "iopub.status.idle": "2024-09-12T01:19:55.172161Z", + "shell.execute_reply": "2024-09-12T01:19:55.172071Z", + "shell.execute_reply.started": "2024-09-12T01:19:55.172062Z" + }, + "id": "1a89aed4-c53b-460f-8a6f-f690920b6829" + }, + "outputs": [], + "source": [ + "metrics" ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "fig, ax = plt.subplots(1,1, figsize=(2,2))\n", - "sns.histplot(\n", - " segmentation['score'],\n", - " bins=50,\n", - " ax=ax,\n", - ")\n", - "ax.set_ylabel('Count')\n", - "ax.set_xlabel('Segger Similarity Score')\n", - "ax.set_yscale('log')" - ] - }, - { - "cell_type": "markdown", - "id": "5492fb96-bf8e-49d5-b40e-7e6b3f871bbe", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-11T22:34:15.990223Z", - "iopub.status.busy": "2024-09-11T22:34:15.988880Z" - } - }, - "source": [ - "#### The Importance of the Receptive Field in Segger\n", - "\n", - "The **receptive field** is a critical parameter in Segger, as it directly influences how the model interprets the spatial relationships between **transcripts** and **nuclei**. In the context of spatial transcriptomics, the receptive field determines the size of the neighborhood that each node (representing transcripts or nuclei) can \"see\" during graph construction and model training. Segger is particularly sensitive to the size of the receptive field because it affects the model's ability to propagate information across the graph. If the receptive field is too small, the model may fail to capture sufficient context for correct cell boundary delineation. Conversely, a very large receptive field may introduce noise by linking unrelated or distant nodes, reducing segmentation accuracy.\n", - "\n", - "#### Parameters affecting the receptive field in Segger:\n", - "- **`--r`**: This parameter defines the radius used when connecting transcripts to nuclei. A larger `r` expands the receptive field, linking more distant nodes. 
Fine-tuning this parameter helps ensure that Segger captures the right level of spatial interaction in the dataset.\n", - "- **`--k_bd` and `--k_tx`**: These control the number of nearest neighbors (nuclei and transcripts, respectively) considered in the graph. By increasing these values, the receptive field is effectively broadened, allowing more nodes to contribute to the information propagation.\n", - "- **`--dist_bd` and `--dist_tx`**: These parameters specify the maximum distances used to connect nuclei (`dist_bd`) and transcripts (`dist_tx`) to their neighbors during graph construction. They directly affect the receptive field by defining the cut-off distance for forming edges in the graph. Larger distance values expand the receptive field, connecting nodes that are further apart spatially. Careful tuning of these values is necessary to ensure that Segger captures relevant spatial relationships without introducing noise." - ] - }, - { - "cell_type": "markdown", - "id": "7ece1ac0-0708-45e2-87fc-1b25782831f8", - "metadata": {}, - "source": [ - "# **4. Tune Parameters**" - ] - }, - { - "cell_type": "markdown", - "id": "896b8288-5287-4d10-a206-e68c0e4731c6", - "metadata": {}, - "source": [ - "### Evaluating Receptive Field Parameters with Grid Search\n", - "\n", - "To evaluate the impact of different receptive field parameters in Segger, we use a **grid search** approach. The parameters `k_bd`, `k_tx`, `dist_bd`, and `dist_tx` (which control the number of neighbors and distances for nuclei and transcripts) are explored through various configurations defined in `param_space`. Each combination of these parameters is passed to the `trainable` function, which creates the dataset, trains the model, and makes predictions based on the specified receptive field.\n", - "\n", - "For each parameter combination:\n", - "1. A dataset is created with the specified receptive field.\n", - "2. The Segger model is trained on this dataset.\n", - "3. Predictions are made, and segmentation results are evaluated using the custom `evaluate` function. This function computes metrics like the fraction of assigned transcripts and average cell sizes.\n", - "\n", - "The results from each configuration are saved, allowing us to compare how different receptive field settings impact the model’s performance. This process enables a thorough search of the parameter space, optimizing the model for accurate segmentation." 
- ] - }, - { - "cell_type": "code", - "execution_count": 66, - "id": "b0c1a7a8-acb2-4aae-8ae4-8aa9a4196717", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-12T01:10:47.781418Z", - "iopub.status.busy": "2024-09-12T01:10:47.781067Z", - "iopub.status.idle": "2024-09-12T01:10:48.706615Z", - "shell.execute_reply": "2024-09-12T01:10:48.706194Z", - "shell.execute_reply.started": "2024-09-12T01:10:47.781401Z" - } - }, - "outputs": [], - "source": [ - "import itertools\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "id": "0bd0803c-e58d-4f43-9627-d2c1ab187d5e", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-12T01:16:31.976312Z", - "iopub.status.busy": "2024-09-12T01:16:31.975947Z", - "iopub.status.idle": "2024-09-12T01:16:33.168389Z", - "shell.execute_reply": "2024-09-12T01:16:33.167956Z", - "shell.execute_reply.started": "2024-09-12T01:16:31.976295Z" - } - }, - "outputs": [], - "source": [ - "tuning_dir = Path('path/to/tutorial/tuning/')\n", - "sampling_rate = 0.125" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "id": "b879a0b5-150c-4240-99ec-81075855aa52", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-12T01:16:33.169525Z", - "iopub.status.busy": "2024-09-12T01:16:33.169189Z", - "iopub.status.idle": "2024-09-12T01:16:34.147222Z", - "shell.execute_reply": "2024-09-12T01:16:34.146804Z", - "shell.execute_reply.started": "2024-09-12T01:16:33.169508Z" - }, - "jupyter": { - "source_hidden": true - } - }, - "outputs": [], - "source": [ - "# Fixed function arguments used for each trial\n", - "transcripts_path = xenium_data_dir / 'transcripts.parquet'\n", - "\n", - "boundaries_path = xenium_data_dir / 'nucleus_boundaries.parquet'\n", - "\n", - "dataset_kwargs = dict(\n", - " x_size=80, y_size=80, d_x=80, d_y=80, margin_x=10, margin_y=10,\n", - " num_workers=4, sampling_rate=sampling_rate,\n", - ")\n", - "\n", - "model_kwargs = dict(\n", - " metadata=(['tx', 'bd'], [('tx', 'belongs', 'bd'), ('tx', 'neighbors', 'tx')]),\n", - " num_tx_tokens=500, init_emb=8, hidden_channels=32, out_channels=8,\n", - " heads=2, num_mid_layers=2, aggr='sum',\n", - ")\n", - "\n", - "trainer_kwargs = dict(\n", - " accelerator='cuda', strategy='auto', precision='16-mixed', devices=1, \n", - " max_epochs=100,\n", - ")\n", - "\n", - "predict_kwargs = dict(score_cut=0.2, use_cc=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fbd831c9-3a50-4e3b-97d3-3c152ae01188", - "metadata": { - "jupyter": { - "source_hidden": true - } - }, - "outputs": [], - "source": [ - "def trainable(config):\n", - "\n", - " receptive_field = {k: config[k] for k in ['k_bd', 'k_tx', 'dist_bd', 'dist_tx']}\n", - "\n", - " # Dataset creation\n", - " xs = XeniumSample(verbose=False)\n", - " xs.set_file_paths(transcripts_path, boundaries_path)\n", - " xs.set_metadata()\n", - " try:\n", - " xs.save_dataset_for_segger(\n", - " processed_dir=config['data_dir'],\n", - " receptive_field=receptive_field,\n", - " **dataset_kwargs,\n", - " )\n", - " except:\n", - " pass\n", - " \n", - " # Model training\n", - " ls = LitSegger(**model_kwargs)\n", - " dm = SeggerDataModule(\n", - " data_dir=config['data_dir'],\n", - " batch_size=2,\n", - " num_workers=dataset_kwargs['num_workers'],\n", - " )\n", - " trainer = Trainer( \n", - " default_root_dir=config['model_dir'],\n", - " logger=CSVLogger(config['model_dir']),\n", - " **trainer_kwargs,\n", - " )\n", - " trainer.fit(model=ls, datamodule=dm)\n", - "\n", - " segmentation = 
predict(\n", - " load_model(config['model_dir']/'lightning_logs/version_0/checkpoints'),\n", - " dm.train_dataloader(),\n", - " receptive_field=receptive_field,\n", - " **predict_kwargs,\n", - " )\n", - "\n", - " metrics = evaluate(segmentation)\n", - "\n", - "\n", - "def evaluate(segmentation: pd.DataFrame, score_cut: float) -> pd.Series:\n", - " \n", - " assigned = segmentation['score'] > score_cut\n", - " metrics = pd.Series(dtype=float)\n", - " metrics['frac_assigned'] = assigned.mean()\n", - " cell_sizes = segmentation.groupby(assigned)['segger_cell_id'].value_counts()\n", - " assigned_avg = 0 if True not in cell_sizes.index else cell_sizes[True].mean()\n", - " cc_avg = 0 if False not in cell_sizes.index else cell_sizes[False].mean()\n", - " metrics['cell_size_assigned'] = assigned_avg\n", - " metrics['cell_size_cc'] = cc_avg\n", - " return metrics" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ba2dcc9a-3a06-4b84-a487-59a768eed5d5", - "metadata": { - "execution": { - "iopub.execute_input": "2024-09-12T01:16:35.184598Z", - "iopub.status.busy": "2024-09-12T01:16:35.184180Z", - "iopub.status.idle": "2024-09-12T01:19:55.171470Z", - "shell.execute_reply": "2024-09-12T01:19:55.170810Z", - "shell.execute_reply.started": "2024-09-12T01:16:35.184582Z" - }, - "scrolled": true - }, - "outputs": [], - "source": [ - "param_space = {\n", - " \"k_bd\": [3, 5, 10],\n", - " \"dist_bd\": [5, 10, 15, 20],\n", - " \"k_tx\": [3, 5, 10],\n", - " \"dist_tx\": [3, 5, 10],\n", - "}\n", - "\n", - "metrics = []\n", - "\n", - "for params in itertools.product(*param_space.values()):\n", - "\n", - " config = dict(zip(param_space.keys(), params))\n", - "\n", - " # Setup directories\n", - " trial_dir = tuning_dir / '_'.join([f'{k}={v}' for k, v in config.items()])\n", - " \n", - " data_dir = trial_dir / 'segger_data'\n", - " data_dir.mkdir(exist_ok=True, parents=True)\n", - " config['data_dir'] = data_dir\n", - "\n", - " model_dir = trial_dir / 'models'\n", - " model_dir.mkdir(exist_ok=True, parents=True)\n", - " config['model_dir'] = model_dir\n", - "\n", - " segmentation = trainable(config)\n", - " trial = evaluate(segmentation, predict_kwargs['score_cut'])\n", - " trial = pd.concat([pd.Series(config), trial])\n", - " metrics.append(trial)\n", - "\n", - "metrics = pd.DataFrame(metrics)" - ] - }, - { - "cell_type": "markdown", - "id": "dcfa5570-ada2-4102-aae0-a3830d304c5f", - "metadata": {}, - "source": [ - "### Interpreting Output Metrics\n", - "\n", - "The key output metrics include:\n", - "- **`frac_assigned`**: The fraction of transcripts that were successfully assigned to a cell. A higher value indicates that the model is doing a good job associating transcripts with nuclei, which is a strong indicator of successful segmentation.\n", - "- **`cell_size_assigned`**: The average size of cells that have assigned transcripts. This helps assess how well the model is predicting cell boundaries, with unusually large or small values indicating potential issues with segmentation accuracy.\n", - "- **`cell_size_cc`**: The average size of connected components that were not assigned to a cell (i.e., nucleus-less regions). 
Large values here may suggest that transcripts are being incorrectly grouped together in the absence of a nucleus, which could indicate problems with the receptive field parameters or the segmentation process.\n", - "\n", - "These metrics illuminate the effectiveness of the model by highlighting both the success in associating transcripts with cells and potential areas where the model may need further tuning.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1a89aed4-c53b-460f-8a6f-f690920b6829", - "metadata": { - "execution": { - "iopub.status.busy": "2024-09-12T01:19:55.171961Z", - "iopub.status.idle": "2024-09-12T01:19:55.172161Z", - "shell.execute_reply": "2024-09-12T01:19:55.172071Z", - "shell.execute_reply.started": "2024-09-12T01:19:55.172062Z" } - }, - "outputs": [], - "source": [ - "metrics" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + }, + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "accelerator": "GPU" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file From e11d8209a3c648387bcf243b865094007c550502 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Tue, 22 Oct 2024 08:51:03 +0200 Subject: [PATCH 136/156] None checking while processing embeddings --- src/segger/cli/create_dataset_fast.py | 1 + src/segger/data/parquet/sample.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/segger/cli/create_dataset_fast.py b/src/segger/cli/create_dataset_fast.py index 94ebe21..c3716a4 100644 --- a/src/segger/cli/create_dataset_fast.py +++ b/src/segger/cli/create_dataset_fast.py @@ -58,6 +58,7 @@ def create_dataset(args: Namespace): logging.basicConfig(level=logging.INFO, handlers=[ch]) # If scRNAseq file is provided, calculate gene-celltype embeddings + gene_celltype_abundance_embedding = None if args.scrnaseq_file: logging.info("Calculating gene and celltype embeddings...") scRNAseq = sc.read(args.scrnaseq_file) diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index 8c89d8d..06ccd7e 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -74,6 +74,7 @@ def __init__( self._boundaries_metadata = None # Setup default embedding for transcripts + self.emb_genes = None if weights is not None: self.emb_genes = weights.index.to_list() classes = self.transcripts_metadata["feature_names"] From 94e4d4446f5b09897c30039cec1cac8c0454b023 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Tue, 22 Oct 2024 08:53:20 +0200 Subject: [PATCH 137/156] Job submission hotfixes --- scripts/submit_job.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/submit_job.py b/scripts/submit_job.py index 0949ab3..0dead12 100644 --- a/scripts/submit_job.py +++ b/scripts/submit_job.py @@ -64,10 
+64,6 @@ def run_data_processing(): config["preprocessing"]["data_dir"], "--sample_type", config["preprocessing"]["sample_type"], - "--scrnaseq_file", - config["preprocessing"]["scrnaseq_file"], - "--celltype_column", - config["preprocessing"]["celltype_column"], "--k_bd", str(config["preprocessing"]["k_bd"]), "--dist_bd", @@ -95,6 +91,10 @@ def run_data_processing(): command.extend(["--tile_width", str(config["preprocessing"]["tile_width"])]) if config["preprocessing"].get("tile_height") is not None: command.extend(["--tile_height", str(config["preprocessing"]["tile_height"])]) + if config["preprocessing"].get("scrnaseq_file") is not None: + command.extend(["--scrnaseq_file", config["preprocessing"]["scrnaseq_file"]]) + if config["preprocessing"].get("celltype_column") is not None: + command.extend(["--celltype_column", config["preprocessing"]["celltype_column"]]) if config.get("use_lsf", False): command = [ @@ -179,8 +179,8 @@ def run_training(): "gpu", ] + command # only run training after data_processing - if "1" in config["pipelines"]: - command[4:4] = ["-w", f"done(job_data_processing_{time_stamp})"] + if 1 in config["pipelines"]: + command[3:3] = ["-w", f"done(job_data_processing_{time_stamp})"] try: print(f"Running command: {command}") @@ -252,10 +252,10 @@ def run_prediction(): "gpu", ] + command # only run prediction after training/data_processing - if "2" in config["pipelines"]: - command[4:4] = ["-w", f"done(job_training_{time_stamp})"] - elif "1" in config["pipelines"]: - command[4:4] = ["-w", f"done(job_data_processing_{time_stamp})"] + if 2 in config["pipelines"]: + command[3:3] = ["-w", f"done(job_training_{time_stamp})"] + elif 1 in config["pipelines"]: + command[3:3] = ["-w", f"done(job_data_processing_{time_stamp})"] try: print(f"Running command: {command}") From d8bb7dea18799cf48c43473ad32bd30b52e3d135 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Wed, 23 Oct 2024 09:37:00 +0200 Subject: [PATCH 138/156] Docs update --- docs/cli.md | 37 +++--- docs/installation.md | 7 +- docs/notebooks/segger_tutorial.ipynb | 175 +++++++++++++++------------ 3 files changed, 125 insertions(+), 94 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 6847a1c..889b445 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -11,6 +11,8 @@ python3 src/segger/cli/create_dataset_fast.py \ --base_dir /path/to/raw_data \ --data_dir /path/to/save/processed_data \ --sample_type xenium \ + --scrnaseq_file /path/to/scrnaseq_file \ + --celltype_column celltype_column_name \ --k_bd 3 \ --dist_bd 15.0 \ --k_tx 3 \ @@ -28,15 +30,17 @@ python3 src/segger/cli/create_dataset_fast.py \ | Parameter | Description | Default Value | |----------------------|-----------------------------------------------------------------------------------------|---------------| -| `base_dir` | Directory containing the raw dataset (e.g., transcripts, boundaries). | None | -| `data_dir` | Directory to save the processed Segger dataset (in PyTorch Geometric format). | None | -| `sample_type` | The sample type of the raw data, e.g., "xenium" or "merscope". | `xenium` | +| `base_dir` | Directory containing the raw dataset (e.g., transcripts, boundaries). | - | +| `data_dir` | Directory to save the processed Segger dataset (in PyTorch Geometric format). | - | +| `sample_type` | The sample type of the raw data, e.g., "xenium" or "merscope". | None | +| `scrnaseq_file` | Path to the scRNAseq file. | None | +| `celltype_column` | Column name for cell type annotations in the scRNAseq file. 
| None | | `k_bd` | Number of nearest neighbors for boundary nodes. | `3` | | `dist_bd` | Maximum distance for boundary neighbors. | `15.0` | | `k_tx` | Number of nearest neighbors for transcript nodes. | `3` | | `dist_tx` | Maximum distance for transcript neighbors. | `5.0` | -| `tile_width` | Width of the tiles in pixels (ignored if `tile_size` is provided). | `200` | -| `tile_height` | Height of the tiles in pixels (ignored if `tile_size` is provided). | `200` | +| `tile_width` | Width of the tiles in pixels (ignored if `tile_size` is provided). | None | +| `tile_height` | Height of the tiles in pixels (ignored if `tile_size` is provided). | None | | `neg_sampling_ratio` | Ratio of negative samples. | `5.0` | | `frac` | Fraction of the dataset to process. Useful for subsampling large datasets. | `1.0` | | `val_prob` | Proportion of the dataset used for validation split. | `0.1` | @@ -52,10 +56,11 @@ python3 src/segger/cli/create_dataset_fast.py \ - **val_prob, test_prob**: Control the dataset portions for validation and testing. Adjust based on your dataset size and evaluation needs. - **frac**: Specifies the fraction of the dataset to process. Reducing `frac` can be useful when working with very large datasets, allowing for faster dataset creation by only processing a subset of the data. - !!! tip "Faster Dataset Creation" Increasing the number of workers (`n_workers`) can significantly accelerate the dataset creation process, especially for large datasets, by taking advantage of parallel processing across multiple CPU cores. +!!! tip "Enhancing Segmentation Accuracy with scRNA-seq" + Incorporating single cell RNA sequencing (scRNA-seq) data as features can provide additional biological context, improving the accuracy of the segger model. --- ### 2. Training a Model @@ -88,9 +93,9 @@ $ python3 src/segger/cli/train_model.py \ | Parameter | Description | Default Value | |--------------------|-----------------------------------------------------------------------------------------|---------------| -| `dataset_dir` | Directory containing the processed Segger dataset (in PyTorch Geometric format). | None | -| `models_dir` | Directory to save the trained model and training logs. | None | -| `sample_tag` | Tag used to identify the dataset during training. | None | +| `dataset_dir` | Directory containing the processed Segger dataset (in PyTorch Geometric format). | - | +| `models_dir` | Directory to save the trained model and training logs. | - | +| `sample_tag` | Tag used to identify the dataset during training. | - | | `init_emb` | Size of the embedding layer for input data. | `8` | | `hidden_channels` | Number of hidden units in each layer of the neural network. | `32` | | `num_tx_tokens` | Number of transcript tokens used during training. | `500` | @@ -146,10 +151,10 @@ $ python3 src/segger/cli/predict_fast.py \ | Parameter | Description | Default Value | |-----------------------|------------------------------------------------------------------------------------------|---------------| -| `segger_data_dir` | Directory containing the processed Segger dataset (in PyTorch Geometric format). | None | -| `models_dir` | Directory containing the trained models. | None | -| `benchmarks_dir` | Directory to save the segmentation results, including cell boundaries and associations. | None | -| `transcripts_file` | Path to the transcripts.parquet file. | None | +| `segger_data_dir` | Directory containing the processed Segger dataset (in PyTorch Geometric format). 
| - | +| `models_dir` | Directory containing the trained models. | - | +| `benchmarks_dir` | Directory to save the segmentation results, including cell boundaries and associations. | - | +| `transcripts_file` | Path to the transcripts.parquet file. | - | | `batch_size` | Number of samples to process per batch during prediction. | `1` | | `num_workers` | Number of workers for parallel data loading. | `1` | | `model_version` | Model version number to load for predictions, corresponding to the version from training logs. | `0` | @@ -205,7 +210,7 @@ For users who want a portable, containerized environment, segger supports both D You can pull the segger Docker image from Docker Hub with this command: ```console -docker pull danielunyi42/segger_dev:latest +docker pull danielunyi42/segger_dev:cuda121 ``` To run the pipeline in Docker, make sure your YAML configuration includes the following settings: @@ -219,13 +224,13 @@ Afterwards, run the pipeline inside the Docker container with the same `submit_j For a Singularity environment, pull the image with: ```console -singularity pull docker://danielunyi42/segger_dev:latest +singularity pull docker://danielunyi42/segger_dev:cuda121 ``` Ensure `use_singularity: true` in the YAML file and specify the Singularity image file (e.g., `segger_dev_latest.sif`) in the `singularity_image` field. !!! note "Containerization" - - The segger Docker image currently supports CUDA 12.1. A CUDA 11.8 compatible version will be added soon. + - The segger Docker image currently supports CUDA 11.8 and CUDA 12.1. ### 6. HPC Environments diff --git a/docs/installation.md b/docs/installation.md index 470cba6..ec2f8e2 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -23,15 +23,16 @@ pip install . === ":whale: Docker Installation" ```bash -docker pull danielunyi42/segger_dev:latest +docker pull danielunyi42/segger_dev:cuda121 ``` -The Docker image comes with all required packages pre-installed, including PyTorch, RAPIDS, and PyTorch Geometric. The current image supports CUDA 12.1, and we are working on another image that will support CUDA 11.8 soon. +The Docker image comes with all required packages pre-installed, including PyTorch, RAPIDS, and PyTorch Geometric. +The current images support CUDA 11.8 and CUDA 12.1, which can be specified in the image tag. 
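+For example, assuming the CUDA 11.8 image follows the same tag naming scheme as the CUDA 12.1 image above (the tag below is hypothetical), it would be pulled with:
+
+```bash
+# Hypothetical tag, assuming it mirrors the cuda121 naming scheme shown above
+docker pull danielunyi42/segger_dev:cuda118
+```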
For users who prefer Singularity: ```bash -singularity pull docker://danielunyi42/segger_dev:latest +singularity pull docker://danielunyi42/segger_dev:cuda121 ``` === ":octocat: Github Installation" diff --git a/docs/notebooks/segger_tutorial.ipynb b/docs/notebooks/segger_tutorial.ipynb index 680495e..6b358bf 100644 --- a/docs/notebooks/segger_tutorial.ipynb +++ b/docs/notebooks/segger_tutorial.ipynb @@ -30,74 +30,76 @@ }, { "cell_type": "markdown", - "source": [ - "Installing segger from the GitHub repository:" - ], + "id": "XEY6CTzK0648", "metadata": { "id": "XEY6CTzK0648" }, - "id": "XEY6CTzK0648" + "source": [ + "Installing segger from the GitHub repository:" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "TIQnPzfx08Zr", + "metadata": { + "id": "TIQnPzfx08Zr" + }, + "outputs": [], "source": [ "!git clone https://github.com/EliHei2/segger_dev.git\n", "%cd segger_dev\n", "!pip install \".[rapids12]\" -q" - ], - "metadata": { - "id": "TIQnPzfx08Zr" - }, - "id": "TIQnPzfx08Zr", - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "Downloading the [Xenium Human Pancreatic Dataset](https://www.10xgenomics.com/products/xenium-human-pancreatic-dataset-explorer):" - ], + "id": "q3SNnImS09_N", "metadata": { "id": "q3SNnImS09_N" }, - "id": "q3SNnImS09_N" + "source": [ + "Downloading the [Xenium Human Pancreatic Dataset](https://www.10xgenomics.com/products/xenium-human-pancreatic-dataset-explorer):" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "Qjdt3f-U0_i9", + "metadata": { + "id": "Qjdt3f-U0_i9" + }, + "outputs": [], "source": [ "!mkdir data_xenium\n", "%cd data_xenium\n", "!wget https://cf.10xgenomics.com/samples/xenium/1.6.0/Xenium_V1_hPancreas_Cancer_Add_on_FFPE/Xenium_V1_hPancreas_Cancer_Add_on_FFPE_outs.zip\n", "!unzip Xenium_V1_hPancreas_Cancer_Add_on_FFPE_outs.zip\n", "%cd .." - ], - "metadata": { - "id": "Qjdt3f-U0_i9" - }, - "id": "Qjdt3f-U0_i9", - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, + "id": "trM8h-Ek16sJ", + "metadata": { + "id": "trM8h-Ek16sJ" + }, + "outputs": [], "source": [ "from segger.data.io import XeniumSample\n", - "from segger.training.train import LitSegger\n", "from segger.training.segger_data_module import SeggerDataModule\n", + "from segger.training.train import LitSegger\n", "from segger.prediction.predict import predict, load_model\n", + "from segger.data.utils import calculate_gene_celltype_abundance_embedding\n", "from lightning.pytorch.loggers import CSVLogger\n", "from pytorch_lightning import Trainer\n", "from pathlib import Path\n", "import pandas as pd\n", "from matplotlib import pyplot as plt\n", - "import seaborn as sns" - ], - "metadata": { - "id": "trM8h-Ek16sJ" - }, - "id": "trM8h-Ek16sJ", - "execution_count": null, - "outputs": [] + "import seaborn as sns\n", + "import scanpy as sc" + ] }, { "cell_type": "markdown", @@ -129,7 +131,24 @@ "To create the dataset, you need to specify the path to the **transcripts** file and the **nuclei boundaries** file. These are typically downloaded from a spatial transcriptomics dataset like the [Xenium Human Pancreatic Dataset](https://www.10xgenomics.com/products/xenium-human-pancreatic-dataset-explorer).\n", "\n", "- **`--transcripts_path`**: Path to the transcripts file, which contains single-cell transcriptomic data.\n", - "- **`--boundaries_path`**: Path to the boundaries file, most often representing the nuclei boundaries in the imaging dataset." 
+ "- **`--boundaries_path`**: Path to the boundaries file, most often representing the nuclei boundaries in the imaging dataset.\n", + "\n", + "If single cell RNA sequencing results are available, you can incorporate them as features for segger:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "598b4b16", + "metadata": {}, + "outputs": [], + "source": [ + "# scrnaseq_file = Path('my_scRNAseq_file.h5ad')\n", + "# celltype_column = 'celltype_column'\n", + "# gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(\n", + "# sc.read(scrnaseq_file),\n", + "# celltype_column\n", + "# )" ] }, { @@ -153,7 +172,10 @@ "segger_data_dir = Path('data_segger')\n", "\n", "# Setup Xenium sample to create dataset\n", - "xs = XeniumSample(verbose=False)\n", + "xs = XeniumSample(\n", + " verbose=False,\n", + " # embedding_df=gene_celltype_abundance_embedding # uncomment if gene-celltype embeddings are available\n", + ")\n", "xs.set_file_paths(\n", " transcripts_path=xenium_data_dir / 'transcripts.parquet',\n", " boundaries_path=xenium_data_dir / 'nucleus_boundaries.parquet',\n", @@ -194,8 +216,8 @@ "shell.execute_reply": "2024-09-12T00:49:07.234925Z", "shell.execute_reply.started": "2024-09-12T00:49:06.357975Z" }, - "scrolled": true, - "id": "c8cf7102-ad9c-4bd0-bbd7-61a1d73abccd" + "id": "c8cf7102-ad9c-4bd0-bbd7-61a1d73abccd", + "scrolled": true }, "outputs": [], "source": [ @@ -244,15 +266,15 @@ }, { "cell_type": "code", - "source": [ - "from segger.data.parquet.sample import STSampleParquet" - ], + "execution_count": 10, + "id": "vlDtoWZb24FJ", "metadata": { "id": "vlDtoWZb24FJ" }, - "id": "vlDtoWZb24FJ", - "execution_count": 10, - "outputs": [] + "outputs": [], + "source": [ + "from segger.data.parquet.sample import STSampleParquet" + ] }, { "cell_type": "code", @@ -269,7 +291,8 @@ "sample = STSampleParquet(\n", " base_dir=xenium_data_dir,\n", " n_workers=4,\n", - " sample_type='xenium'\n", + " sample_type='xenium',\n", + " # weights=gene_celltype_abundance_embedding, # uncomment if gene-celltype embeddings are available\n", ")\n", "\n", "sample.save(\n", @@ -300,6 +323,8 @@ "- **--base_dir**: Directory containing the raw spatial transcriptomics dataset.\n", "- **--data_dir**: Directory where the processed Segger dataset (in PyG format) will be saved.\n", "- **--sample_type**: (Optional) Specifies the type of dataset (e.g., \"xenium\" or \"merscope\"). 
Defaults to None.\n", + "- **--scrnaseq_file**: Path to the scRNAseq file (default: None).\n", + "- **--celltype_column**: Column name for cell type annotations in the scRNAseq file (default: None).\n", "- **--k_bd**: Number of nearest neighbors for boundary nodes (default: 3).\n", "- **--dist_bd**: Maximum distance for boundary neighbors (default: 15.0).\n", "- **--k_tx**: Number of nearest neighbors for transcript nodes (default: 3).\n", @@ -419,8 +444,8 @@ "execution_count": null, "id": "207864b8-7e52-4add-a4a2-e95a4debdc06", "metadata": { - "scrolled": true, - "id": "207864b8-7e52-4add-a4a2-e95a4debdc06" + "id": "207864b8-7e52-4add-a4a2-e95a4debdc06", + "scrolled": true }, "outputs": [], "source": [ @@ -591,6 +616,24 @@ }, { "cell_type": "code", + "execution_count": null, + "id": "PEOtAs-t9CiY", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PEOtAs-t9CiY", + "outputId": "8b7a5375-9ebc-4bb4-9421-254410319120" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting segmentation for segger_embedding_1001...\n" + ] + } + ], "source": [ "dm = SeggerDataModule(\n", " data_dir='data_segger',\n", @@ -619,24 +662,6 @@ " knn_method='cuda',\n", " verbose=True,\n", ")\n" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "PEOtAs-t9CiY", - "outputId": "8b7a5375-9ebc-4bb4-9421-254410319120" - }, - "id": "PEOtAs-t9CiY", - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Starting segmentation for segger_embedding_1001...\n" - ] - } ] }, { @@ -832,10 +857,10 @@ "shell.execute_reply": "2024-09-12T01:16:34.146804Z", "shell.execute_reply.started": "2024-09-12T01:16:33.169508Z" }, + "id": "b879a0b5-150c-4240-99ec-81075855aa52", "jupyter": { "source_hidden": true - }, - "id": "b879a0b5-150c-4240-99ec-81075855aa52" + } }, "outputs": [], "source": [ @@ -868,10 +893,10 @@ "execution_count": null, "id": "fbd831c9-3a50-4e3b-97d3-3c152ae01188", "metadata": { + "id": "fbd831c9-3a50-4e3b-97d3-3c152ae01188", "jupyter": { "source_hidden": true - }, - "id": "fbd831c9-3a50-4e3b-97d3-3c152ae01188" + } }, "outputs": [], "source": [ @@ -941,8 +966,8 @@ "shell.execute_reply": "2024-09-12T01:19:55.170810Z", "shell.execute_reply.started": "2024-09-12T01:16:35.184582Z" }, - "scrolled": true, - "id": "ba2dcc9a-3a06-4b84-a487-59a768eed5d5" + "id": "ba2dcc9a-3a06-4b84-a487-59a768eed5d5", + "scrolled": true }, "outputs": [], "source": [ @@ -1015,6 +1040,11 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", "name": "python3" @@ -1030,13 +1060,8 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" - }, - "colab": { - "provenance": [], - "gpuType": "T4" - }, - "accelerator": "GPU" + } }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} From 924e3d2b7de63e50905c1c73e72fd9b70841199f Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Wed, 23 Oct 2024 13:35:58 +0200 Subject: [PATCH 139/156] Update segger_tutorial.ipynb --- docs/notebooks/segger_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/notebooks/segger_tutorial.ipynb b/docs/notebooks/segger_tutorial.ipynb index 6b358bf..ee4cebc 100644 --- a/docs/notebooks/segger_tutorial.ipynb +++ b/docs/notebooks/segger_tutorial.ipynb @@ -611,7 +611,7 @@ }, "outputs": [], "source": [ - "from 
segger.prediction.predict import segment, load_model" + "from segger.prediction.predict_parquet import segment, load_model" ] }, { From 24b3d9486c37e6c011c5ca0aeae557152987a4f1 Mon Sep 17 00:00:00 2001 From: daniel-unyi-42 <63173826+daniel-unyi-42@users.noreply.github.com> Date: Wed, 23 Oct 2024 23:55:21 +0200 Subject: [PATCH 140/156] Update pyproject.toml --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c073c3e..33a0664 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,8 @@ dependencies = [ "pyarrow>=16.1.0,<16.2.0", "dask_geopandas>=0.4.0", "torch-geometric>=2.2.0", - "pqdm>=0.2.0" + "pqdm>=0.2.0", + "rtree>=1.3.0", ] [project.optional-dependencies] From d8083f9c0c55f15c761a4d10d6d5264990957f2f Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Thu, 24 Oct 2024 10:02:39 +0200 Subject: [PATCH 141/156] added multi-gpu prediction, needs testing --- src/segger/prediction/predict_multigpu.py | 777 ++++++++++++++++++++++ 1 file changed, 777 insertions(+) create mode 100644 src/segger/prediction/predict_multigpu.py diff --git a/src/segger/prediction/predict_multigpu.py b/src/segger/prediction/predict_multigpu.py new file mode 100644 index 0000000..faaad3b --- /dev/null +++ b/src/segger/prediction/predict_multigpu.py @@ -0,0 +1,777 @@ +import os +import torch +import cupy as cp +import pandas as pd +import numpy as np +import torch.nn.functional as F +import torch._dynamo +import gc +import rmm +import re +import glob +from pathlib import Path +from torch_geometric.loader import DataLoader +from torch_geometric.data import Batch +from segger.data.utils import ( + get_edge_index_cuda, + get_edge_index, + format_time, + create_anndata, + coo_to_dense_adj, +) +from segger.training.train import LitSegger +from segger.training.segger_data_module import SeggerDataModule +from scipy.sparse.csgraph import connected_components as cc +from typing import Union, Dict +import dask.dataframe as dd +from dask import delayed +from dask.diagnostics import ProgressBar +import time +import dask +from rmm.allocators.cupy import rmm_cupy_allocator +from cupyx.scipy.sparse import coo_matrix +from torch.utils.dlpack import to_dlpack, from_dlpack + +from dask.distributed import Client, LocalCluster +import cupy as cp +import numpy as np +import pandas as pd +from cupyx.scipy.sparse import coo_matrix +from cupyx.scipy.sparse import find # To find non-zero elements in sparse matrix +from scipy.sparse.csgraph import connected_components as cc +from scipy.sparse import coo_matrix as scipy_coo_matrix +from dask.distributed import get_client +from pqdm.processes import pqdm +from tqdm import tqdm +import json +from datetime import datetime +import dask_geopandas as dgpd # Assuming dask-geopandas is installed +import cudf +import dask_cudf +import cupy as cp +import cupyx +import warnings +import shutil +from time import time +from cupyx.scipy.sparse import coo_matrix as cp_coo_matrix +from cupyx.scipy.sparse.csgraph import connected_components as cp_cc +from dask.distributed import LocalCluster, Client, progress +import dask +import random +from dask_cuda import LocalCUDACluster +# Setup Dask cluster with 3 workers + + +# CONFIG +torch._dynamo.config.suppress_errors = True +os.environ["PYTORCH_USE_CUDA_DSA"] = "1" +os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + + +# Function to zero out diagonal of sparse COO matrix +def zero_out_diagonal_gpu(sparse_matrix): + """ + Zero out the diagonal elements of a sparse CuPy COO matrix while keeping it sparse 
on the GPU. + + Args: + sparse_matrix (cupyx.scipy.sparse.coo_matrix): Input sparse matrix. + + Returns: + cupyx.scipy.sparse.coo_matrix: Matrix with diagonal elements zeroed out. + """ + # Filter out the diagonal (where row == col) + non_diagonal_mask = sparse_matrix.row != sparse_matrix.col + + # Create a new sparse matrix without diagonal elements + sparse_matrix_no_diag = cupyx.scipy.sparse.coo_matrix( + ( + sparse_matrix.data[non_diagonal_mask], + (sparse_matrix.row[non_diagonal_mask], sparse_matrix.col[non_diagonal_mask]), + ), + shape=sparse_matrix.shape, + ) + + return sparse_matrix_no_diag + + +# Function to subset rows and columns of a sparse matrix +def subset_sparse_matrix(sparse_matrix, row_idx, col_idx): + """ + Subset a sparse matrix using row and column indices. + + Parameters: + sparse_matrix (cupyx.scipy.sparse.spmatrix): The input sparse matrix in COO, CSR, or CSC format. + row_idx (cupy.ndarray): Row indices to keep in the subset. + col_idx (cupy.ndarray): Column indices to keep in the subset. + + Returns: + cupyx.scipy.sparse.spmatrix: A new sparse matrix that is a subset of the input matrix. + """ + + # Convert indices to CuPy arrays if not already + row_idx = cp.asarray(row_idx) + col_idx = cp.asarray(col_idx) + + # Ensure sparse matrix is in COO format for easy indexing (you can use CSR/CSC if more optimal) + sparse_matrix = sparse_matrix.tocoo() + + # Create boolean masks for the row and column indices + row_mask = cp.isin(sparse_matrix.row, row_idx) + col_mask = cp.isin(sparse_matrix.col, col_idx) + + # Apply masks to filter the data, row, and column arrays + mask = row_mask & col_mask + row_filtered = sparse_matrix.row[mask] + col_filtered = sparse_matrix.col[mask] + data_filtered = sparse_matrix.data[mask] + + # Map the row and col indices to the new submatrix indices + row_mapped = cp.searchsorted(row_idx, row_filtered) + col_mapped = cp.searchsorted(col_idx, col_filtered) + + # Return the new subset sparse matrix + return coo_matrix((data_filtered, (row_mapped, col_mapped)), shape=(len(row_idx), len(col_idx))) + + + + +def load_model(checkpoint_path: str) -> LitSegger: + """ + Load a LitSegger model from a checkpoint. + + Parameters + ---------- + checkpoint_path : str + Specific checkpoint file to load, or directory where the model checkpoints are stored. + If directory, the latest checkpoint is loaded. + + Returns + ------- + LitSegger + The loaded LitSegger model. + + Raises + ------ + FileNotFoundError + If the specified checkpoint file does not exist. + """ + checkpoint_path = Path(checkpoint_path) + msg = f"No checkpoint found at {checkpoint_path}. Please make sure you've provided the correct path." 
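+    # Note: checkpoint files are expected to follow the default Lightning naming
+    # pattern, e.g. "epoch=99-step=12345.ckpt"; when a directory is passed, the
+    # checkpoint with the highest epoch (and then step) is selected below.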
+ + # Get last checkpoint if directory is provided + if os.path.isdir(checkpoint_path): + checkpoints = glob.glob(str(checkpoint_path / "*.ckpt")) + if len(checkpoints) == 0: + raise FileNotFoundError(msg) + + # Sort checkpoints by epoch and step + def sort_order(c): + match = re.match(r".*epoch=(\d+)-step=(\d+).ckpt", c) + return int(match[1]), int(match[2]) + + checkpoint_path = Path(sorted(checkpoints, key=sort_order)[-1]) + elif not checkpoint_path.exists(): + raise FileExistsError(msg) + + # Load model from checkpoint + lit_segger = LitSegger.load_from_checkpoint( + checkpoint_path=checkpoint_path, + ) + + return lit_segger + + +def get_similarity_scores( + model: torch.nn.Module, + batch: Batch, + from_type: str, + to_type: str, + receptive_field: dict, + knn_method: str = 'cuda', + gpu_id: int = 0 # Added argument for GPU ID +) -> coo_matrix: + """ + Compute similarity scores between embeddings for 'from_type' and 'to_type' nodes + using sparse matrix multiplication with CuPy and the 'sees' edge relation. + + Args: + model (torch.nn.Module): The segmentation model used to generate embeddings. + batch (Batch): A batch of data containing input features and edge indices. + from_type (str): The type of node from which the similarity is computed. + to_type (str): The type of node to which the similarity is computed. + knn_method (str, optional): The method to use for nearest neighbors. Defaults to 'cuda'. + gpu_id (int, optional): The GPU ID to use for the computations. Defaults to 0. + + Returns: + coo_matrix: A sparse matrix containing the similarity scores between + 'from_type' and 'to_type' nodes. + """ + + with cp.cuda.Device(gpu_id): + # Move the batch to the specified GPU + batch = batch.to(f'cuda:{gpu_id}') + + # Step 1: Get embeddings from the model (on GPU) + shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] + # Compute edge indices using knn method (still on GPU) + edge_index = get_edge_index( + batch[to_type].pos[:, :2].cpu(), # 'tx' positions + batch[from_type].pos[:, :2].cpu(), # 'bd' positions + k=receptive_field[f'k_{to_type}'], + dist=receptive_field[f'dist_{to_type}'], + method=knn_method + ) + + # Convert to dense adjacency matrix (on GPU) + edge_index = coo_to_dense_adj( + edge_index.T, + num_nodes=shape[0], + num_nbrs=receptive_field[f'k_{to_type}'] + ) + + with torch.no_grad(): + embeddings = model(batch.x_dict, batch.edge_index_dict) + + + def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: + m = torch.nn.ZeroPad2d((0, 0, 0, 1)) # pad bottom with zeros + + similarity = torch.bmm( + m(embeddings[to_type])[edge_index], # 'to' x 'from' neighbors x embed + embeddings[from_type].unsqueeze(-1) # 'to' x embed x 1 + ) # -> 'to' x 'from' neighbors x 1 + del embeddings + # Sigmoid to get most similar 'to_type' neighbor + similarity[similarity == 0] = -torch.inf # ensure zero stays zero + similarity = F.sigmoid(similarity) + # Neighbor-filtered similarity scores + # shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] + indices = torch.argwhere(edge_index != -1).T + indices[1] = edge_index[edge_index != -1] + rows = cp.fromDlpack(to_dlpack(indices[0,:].to('cuda'))) + columns = cp.fromDlpack(to_dlpack(indices[1,:].to('cuda'))) + # print(rows) + del indices + values = similarity[edge_index != -1].flatten() + sparse_result = coo_matrix((cp.fromDlpack(to_dlpack(values)), (rows, columns)), shape=shape) + return sparse_result + # Free GPU memory after computation + # Call the sparse multiply function + sparse_similarity = 
sparse_multiply(embeddings, edge_index, shape) + + return sparse_similarity + + +def predict_batch( + lit_segger: torch.nn.Module, + batch: Batch, + score_cut: float, + receptive_field: Dict[str, float], + use_cc: bool = True, + knn_method: str = 'cuda', + edge_index_save_path: Union[str, Path] = None, + output_ddf_save_path: Union[str, Path] = None, + gpu_id: int = 0 # Added argument for GPU ID +): + """ + Predict cell assignments for a batch of transcript data using a segmentation model. + Writes both the assignments and edge_index directly into Parquet files incrementally. + + Args: + lit_segger (torch.nn.Module): The lightning module wrapping the segmentation model. + batch (Batch): A batch of transcript and cell data. + score_cut (float): The threshold for assigning transcripts to cells based on similarity scores. + receptive_field (Dict[str, float]): Dictionary defining the receptive field for transcript-cell + and transcript-transcript relations. + use_cc (bool, optional): If True, perform connected components analysis for unassigned transcripts. + Defaults to True. + knn_method (str, optional): The method to use for nearest neighbors. Defaults to 'cuda'. + edge_index_save_path (str, optional): Path to the Parquet file where edge indices are saved incrementally. + output_ddf_save_path (str, optional): Path to the Parquet file where transcript assignments (`output_ddf`) + are saved incrementally. + gpu_id (int, optional): The GPU ID to use for the computations. Defaults to 0. + """ + + def _get_id(): + """Generate a random Xenium-style ID.""" + return "".join(np.random.choice(list("abcdefghijklmnopqrstuvwxyz"), 8)) + "-nx" + + print(gpu_id) + with cp.cuda.Device(gpu_id): + # Move the batch to the specified GPU + batch = batch.to(f'cuda:{gpu_id}') + lit_segger.model = lit_segger.model.to(f'cuda:{gpu_id}') + + # Extract transcript IDs and initialize a dictionary for assignments + transcript_id = batch["tx"].id.cpu().numpy().astype("str") + assignments = {"transcript_id": transcript_id} + + if len(batch["bd"].pos) >= 10: + while True: + try: + # Step 1: Compute similarity scores between 'tx' (transcripts) and 'bd' (boundaries) + scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd", receptive_field, knn_method=knn_method, gpu_id=gpu_id) + break + except Exception as e: + print(f"This weird error: {e}.") + torch.cuda.empty_cache() + + # Convert sparse matrix to dense format (on GPU) + dense_scores = scores.toarray() # Convert to dense NumPy array + del scores # Remove from memory + cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory + + # Step 2: Maximize score and assign transcripts based on score threshold + belongs = cp.max(dense_scores, axis=1) # Max score per transcript + assignments["score"] = cp.asnumpy(belongs) # Move back to CPU + + mask = assignments["score"] >= score_cut # Mask for assigned transcripts + all_ids = batch["bd"].id # Boundary IDs as NumPy array + assignments["segger_cell_id"] = np.where(mask, all_ids[cp.argmax(dense_scores, axis=1).get()], None) + # Clear memory after score processing + del dense_scores + cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory + torch.cuda.empty_cache() + + assignments["bound"] = np.where(mask, 1, 0) # Mark as 'bound' (1 if assigned, 0 if unassigned) + + # Step 3: Handle unassigned transcripts with connected components (if use_cc=True) + if use_cc: + scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx", receptive_field, knn_method=knn_method, gpu_id=gpu_id) + + # Stay on GPU 
and use CuPy sparse matrices + no_id_scores = cupyx.scipy.sparse.coo_matrix( + (scores_tx.data, (scores_tx.row, scores_tx.col)), shape=scores_tx.shape + ) + + # Apply threshold on GPU + no_id_scores.data[no_id_scores.data < score_cut] = 0 # Apply threshold + + # Zero out the diagonal on GPU + no_id_scores = zero_out_diagonal_gpu(no_id_scores) + no_id_scores.eliminate_zeros() # Remove zero entries to keep the matrix sparse + + # Find unassigned transcripts (those with no segger_cell_id) + no_id = cp.where(cp.asarray(assignments['segger_cell_id'] == None))[0] # Using CuPy to handle None values + + if len(no_id) > 0: # Only compute if there are unassigned transcripts + # Apply score cut-off to unassigned transcripts + no_id_scores = subset_sparse_matrix(no_id_scores, no_id, no_id) + no_id_scores.data[no_id_scores.data < score_cut] = 0 # Apply threshold + no_id_scores.eliminate_zeros() # Clean up zeros + + # Find the non-zero entries in the no_id_scores to construct edge_index + non_zero_rows, non_zero_cols, _ = find(no_id_scores) + unassigned_ids = transcript_id[no_id.get()] # Unassigned transcript IDs + + # Construct edge index (source, target) based on non-zero connections in no_id_scores + source_nodes = unassigned_ids[non_zero_rows.get()] + target_nodes = unassigned_ids[non_zero_cols.get()] + if len(source_nodes) > 10: + # while True: + try: + # # Save edge_index using CuDF and Dask-CuDF for GPU acceleration + edge_index_ddf = delayed(dd.from_pandas)(pd.DataFrame({'source': source_nodes, 'target': target_nodes}), npartitions=1) + # Use delayed for asynchronous disk writing of edge_index in Dask DataFrame + delayed_write_edge_index = delayed(edge_index_ddf.to_parquet)(edge_index_save_path, append=True, ignore_divisions=True) + delayed_write_edge_index.compute() # Schedule writing + # break + except Exception as e: + print(f"resource access fault, trying again.") + + + assignments = { + "transcript_id": assignments["transcript_id"].astype("str"), + "score": assignments["score"].astype("float32"), + "segger_cell_id": assignments["segger_cell_id"].astype("str"), # Ensure 'string' dtype + "bound": assignments["bound"].astype("int8"), # Ensure 'int64' dtype + } + # Step 4: Convert assignments to Dask-CuDF DataFrame for this batch + # batch_ddf = dask_cudf.from_cudf(cudf.DataFrame(assignments), npartitions=1) + if len(assignments['transcript_id']) > 10: + # while True: + try: + batch_ddf = delayed(dd.from_pandas)(pd.DataFrame(assignments), npartitions=1) + + # Save the updated `output_ddf` asynchronously using Dask delayed + delayed_write_output_ddf = delayed(batch_ddf.to_parquet)( + output_ddf_save_path, append=True, ignore_divisions=True + ) + delayed_write_output_ddf.compute() # Schedule writing + # Free memory after computation + cp.get_default_memory_pool().free_all_blocks() # Free CuPy memory + torch.cuda.empty_cache() + # break + except Exception as e: + print(f"resource access fault, trying again.") + + +def segment( + model: LitSegger, + dm: SeggerDataModule, + save_dir: Union[str, Path], + seg_tag: str, + transcript_file: Union[str, Path], + score_cut: float = 0.5, + use_cc: bool = True, + save_transcripts: bool = True, + save_anndata: bool = True, + save_cell_masks: bool = False, # Placeholder for future implementation + receptive_field: dict = {"k_bd": 4, "dist_bd": 10, "k_tx": 5, "dist_tx": 3}, + knn_method: str = "cuda", + verbose: bool = False, + gpu_ids: list = ['0'], + cleanup: bool = True, + **anndata_kwargs +) -> None: + """ + Perform segmentation using the model, save 
transcripts, AnnData, and cell masks as needed, + and log the parameters used during segmentation. + + Args: + model (LitSegger): The trained segmentation model. + dm (SeggerDataModule): The SeggerDataModule instance for data loading. + save_dir (Union[str, Path]): Directory to save the final segmentation results. + seg_tag (str): Tag to include in the saved filename. + transcript_file (Union[str, Path]): Path to the transcripts Parquet file. + score_cut (float, optional): The threshold for assigning transcripts to cells based on + similarity scores. Defaults to 0.5. + use_cc (bool, optional): If True, perform connected components analysis for unassigned + transcripts. Defaults to True. + save_transcripts (bool, optional): Whether to save the transcripts as Parquet. Defaults to True. + save_anndata (bool, optional): Whether to save the results in AnnData format. Defaults to True. + save_cell_masks (bool, optional): Save cell masks as Dask Geopandas Parquet. Defaults to False. + receptive_field (dict, optional): Defines the receptive field for transcript-cell and + transcript-transcript relations. Defaults to + `{'k_bd': 4, 'dist_bd': 10, 'k_tx': 5, 'dist_tx': 3}`. + knn_method (str, optional): The method to use for nearest neighbors (`'cuda'` or `'kd_tree'`). + Defaults to 'cuda'. + verbose (bool, optional): Whether to print verbose status updates. Defaults to False. + cleanup (bool, optional): To clean up intermediate files, i.e., `transcripts_df.parqeut` and `edge_index.parquet`. Defaults to True. + **anndata_kwargs: Additional keyword arguments passed to the `create_anndata` function. + + Returns: + None. Saves the result to disk in various formats and logs the parameter choices. + """ + + start_time = time() + + # Create a subdirectory with important parameter info (receptive field values) + sub_dir_name = f"{seg_tag}_{score_cut}_{use_cc}_{receptive_field['k_bd']}_{receptive_field['dist_bd']}_{receptive_field['k_tx']}_{receptive_field['dist_tx']}_{datetime.now().strftime('%Y%m%d')}" + save_dir = Path(save_dir) / sub_dir_name + save_dir.mkdir(parents=True, exist_ok=True) + + # Paths for saving the output_ddf and edge_index Parquet files + output_ddf_save_path = save_dir / "transcripts_df.parquet" + edge_index_save_path = save_dir / "edge_index.parquet" + + if output_ddf_save_path.exists(): + warnings.warn(f"Removing existing file: {output_ddf_save_path}") + shutil.rmtree(output_ddf_save_path) + + if use_cc: + if edge_index_save_path.exists(): + warnings.warn(f"Removing existing file: {edge_index_save_path}") + shutil.rmtree(edge_index_save_path) + + if verbose: + print(f"Assigning transcripts to nuclei for {seg_tag}...") + + # Step 1: Load the data loaders from the SeggerDataModule + step_start_time = time() + # train_dataloader = dm.train_dataloader() + # val_dataloader = dm.val_dataloader() + # test_dataloader = dm.test_dataloader() + + # # Initialize Dask DataFrame for assignments + # output_ddf = None + cluster = LocalCUDACluster( + CUDA_VISIBLE_DEVICES=gpu_ids, + n_workers=len(gpu_ids), # Number of workers to use (adjust based on your available CPUs/GPUs) + threads_per_worker=1, # Each worker will use 1 thread + ) + + # # Connect the client to the cluster + client = Client(cluster) + + @dask.delayed + def process_batch(batch, gpu_id): + # Assume you're using CuPy, and you need to use a specific GPU + predict_batch( + model, + batch, + score_cut, + receptive_field, + use_cc=use_cc, + knn_method=knn_method, + edge_index_save_path=edge_index_save_path, + 
output_ddf_save_path=output_ddf_save_path, + gpu_id=gpu_id + ) + + # this is to off-load some tasks from the computation graph + for j in range(len(dm.train) // 100): + print(len(dm.train)) + ind_min = j * 100 + ind_max = min((j+1) * 100, len(dm.train)) + delayed_tasks = [process_batch(batch, gpu_ids[i % len(gpu_ids)]) for i, batch in enumerate(dm.train[ind_min:ind_max])] + future = dask.persist(*delayed_tasks) + progress(future) + dask.compute(*future) + torch.cuda.empty_cache() + cp.get_default_memory_pool().free_all_blocks() + + for j in range(len(dm.val) // 50): + ind_min = j * 50 + ind_max = min((j+1) * 50, len(dm.val)) + delayed_tasks = [process_batch(batch, gpu_ids[i % len(gpu_ids)]) for i, batch in enumerate(dm.val[ind_min:ind_max])] + future = dask.persist(*delayed_tasks) + progress(future) + dask.compute(*future) + torch.cuda.empty_cache() + cp.get_default_memory_pool().free_all_blocks() + + for j in range(len(dm.test) // 50): + ind_min = j * 50 + ind_max = min((j+1) * 50, len(dm.test)) + delayed_tasks = [process_batch(batch, gpu_ids[i % len(gpu_ids)]) for i, batch in enumerate(dm.test[ind_min:ind_max])] + future = dask.persist(*delayed_tasks) + progress(future) + dask.compute(*future) + torch.cuda.empty_cache() + cp.get_default_memory_pool().free_all_blocks() + + client.close() + + # pqdm(delayed_tasks, n_jobs=len(gpu_ids), argument_type='delayed', progress_bar=True) + # dask.compute(*delayed_tasks) + # delayed_tasks = [process_batch(batch, gpu_ids[i % len(gpu_ids)]) for i, batch in enumerate(batches)] + + # # Use tqdm for progress bar + # with ProgressBar(): + # # Execute the delayed tasks with a Dask compute call + # dask.compute(*delayed_tasks) + + # Loop through the data loaders (train, val, and test) + # for loader_name, loader in zip( + # ["Train", "Validation", "Test"], [train_dataloader, val_dataloader, test_dataloader] + # ): + # # for loader_name, loader in zip(['Test'], [test_dataloader]): + # if verbose: + # print(f"Processing {loader_name} data...") + + # for batch in tqdm(loader, desc=f'Processing {loader_name} batches'): + # gpu_id = random.choice(gpu_ids) + # # Call predict_batch for each batch + # predict_batch( + # model, + # batch, + # score_cut, + # receptive_field, + # use_cc=use_cc, + # knn_method=knn_method, + # edge_index_save_path=edge_index_save_path, + # output_ddf_save_path=output_ddf_save_path, + # gpu_id=gpu_id + # ) + + # if verbose: + # elapsed_time = time() - step_start_time + # print(f"Batch processing completed in {elapsed_time:.2f} seconds.") + if verbose: + elapsed_time = time() - step_start_time + print(f"Assigned transcripts to nuclei in {elapsed_time:.2f} seconds.") + # Load the full saved segmentation results + seg_final_dd = dd.read_parquet(output_ddf_save_path, ignore_metadata_file=True) + seg_final_dd = seg_final_dd.set_index("transcript_id", sorted=False) + + step_start_time = time() + if verbose: + print(f"Applying max score selection logic...") + + # Step 1: Find max bound indices (bound == 1) and max unbound indices (bound == 0) + max_bound_idx = seg_final_dd[seg_final_dd["bound"] == 1].groupby("transcript_id")["score"].idxmax() + max_unbound_idx = seg_final_dd[seg_final_dd["bound"] == 0].groupby("transcript_id")["score"].idxmax() + + # Step 2: Combine indices, prioritizing bound=1 scores + final_idx = max_bound_idx.combine_first(max_unbound_idx).compute() + + # Step 3: Use the computed final_idx to select the best assignments + # Make sure you are using the divisions and set the index correctly before loc + # seg_final_dd = 
seg_final_dd.set_index('transcript_id', sorted=True) + seg_final_filtered = seg_final_dd.loc[final_idx].compute() + + if verbose: + elapsed_time = time() - step_start_time + print(f"Max score selection completed in {elapsed_time:.2f} seconds.") + + # Step 3: Load the transcripts DataFrame and merge results + + if verbose: + print(f"Loading transcripts from {transcript_file}...") + + transcripts_df = dd.read_parquet(transcript_file) + transcripts_df["transcript_id"] = transcripts_df["transcript_id"].astype(str) + + step_start_time = time() + if verbose: + print(f"Merging segmentation results with transcripts...") + + # Outer merge to include all transcripts, even those without assigned cell ids + transcripts_df_filtered = transcripts_df.merge(seg_final_filtered, on="transcript_id", how="outer") + + if verbose: + elapsed_time = time() - step_start_time + print(f"Merging segmentation results with transcripts completed in {elapsed_time:.2f} seconds.") + + # Step 4: Handle unassigned transcripts using connected components (if use_cc=True) + if use_cc: + + step_start_time = time() + if verbose: + print(f"Regrouping the unassigned transcripts...") + # Load edge indices from saved Parquet + edge_index_dd = dd.read_parquet(edge_index_save_path) + + # Step 2: Get unique transcript_ids from edge_index_dd and their positional indices + transcript_ids_in_edges = dd.concat([edge_index_dd["source"], edge_index_dd["target"]]).unique().compute() + + # Create a lookup table with unique indices + lookup_table = pd.Series(data=range(len(transcript_ids_in_edges)), index=transcript_ids_in_edges).to_dict() + + # Map source and target to positional indices + edge_index_dd["index_source"] = edge_index_dd["source"].map(lookup_table) + edge_index_dd["index_target"] = edge_index_dd["target"].map(lookup_table) + # Step 3: Compute connected components for transcripts involved in edges + source_indices = np.asarray(edge_index_dd["index_source"].compute()) + target_indices = np.asarray(edge_index_dd["index_target"].compute()) + data_cp = np.ones(len(source_indices), dtype=cp.float32) + + # Create the sparse COO matrix + coo_cp_matrix = scipy_coo_matrix( + (data_cp, (source_indices, target_indices)), + shape=(len(transcript_ids_in_edges), len(transcript_ids_in_edges)), + ) + + # Use CuPy's connected components algorithm to compute components + n, comps = cc(coo_cp_matrix, directed=True, connection="weak") + + # Step 4: Map back the component labels to the original transcript_ids + comp_labels = pd.Series(comps, index=transcript_ids_in_edges) + # Step 5: Handle only unassigned transcripts in transcripts_df_filtered + unassigned_mask = transcripts_df_filtered["segger_cell_id"].isna() + + unassigned_transcripts_df = transcripts_df_filtered.loc[unassigned_mask, ["transcript_id"]] + + # Step 6: Map component labels only to unassigned transcript_ids + new_segger_cell_ids = unassigned_transcripts_df["transcript_id"].map(comp_labels) + + # Step 7: Create a DataFrame with updated 'segger_cell_id' for unassigned transcripts + unassigned_transcripts_df = unassigned_transcripts_df.assign(segger_cell_id=new_segger_cell_ids) + + # Step 8: Merge this DataFrame back into the original to update only the unassigned segger_cell_id + # We perform a left join so that only the rows in unassigned_transcripts_df are updated + # transcripts_df_filtered = transcripts_df_filtered.drop(columns='segger_cell_id') + + # Merging the updates back to the original DataFrame + transcripts_df_filtered = transcripts_df_filtered.merge( + 
unassigned_transcripts_df[["transcript_id", "segger_cell_id"]], + on="transcript_id", + how="left", # Perform a left join to only update the unassigned rows + suffixes=("", "_new"), # Suffix for new column to avoid overwriting + ) + + # Step 9: Fill missing segger_cell_id values with the updated values from the merge + transcripts_df_filtered["segger_cell_id"] = transcripts_df_filtered["segger_cell_id"].fillna( + transcripts_df_filtered["segger_cell_id_new"] + ) + + # Step 10: Clean up by dropping the temporary 'segger_cell_id_new' column + transcripts_df_filtered = transcripts_df_filtered.drop(columns=["segger_cell_id_new"]) + + # Fill the NaN values in segger_cell_id with the already existing (assigned) values + # transcripts_df_filtered['segger_cell_id'] = transcripts_df_filtered['segger_cell_id'].fillna(transcripts_df_filtered['segger_cell_id_target']) + + # Drop any temporary columns used during the merge + # transcripts_df_filtered = transcripts_df_filtered.drop(columns=['segger_cell_id_target']) + + if verbose: + elapsed_time = time() - step_start_time + print(f"Regrouped the unassigned transcripts in {elapsed_time:.2f} seconds.") + + # Step 5: Save the merged results based on options + + if save_transcripts: + if verbose: + step_start_time = time() + print(f"Saving transcripts.parquet...") + transcripts_save_path = save_dir / "segger_transcripts.parquet" + # transcripts_df_filtered = transcripts_df_filtered.repartition(npartitions=100) + transcripts_df_filtered.to_parquet( + transcripts_save_path, + engine="pyarrow", # PyArrow is faster and recommended + compression="snappy", # Use snappy compression for speed + write_index=False, # Skip writing index if not needed + append=False, # Set to True if you're appending to an existing Parquet file + overwrite=True + ) # Dask handles Parquet well + if verbose: + elapsed_time = time() - step_start_time + print(f"Saved transcripts.parquet in {elapsed_time:.2f} seconds.") + + if save_anndata: + if verbose: + step_start_time = time() + print(f"Saving anndata object...") + anndata_save_path = save_dir / "segger_adata.h5ad" + segger_adata = create_anndata(transcripts_df_filtered.compute(), **anndata_kwargs) # Compute for AnnData + segger_adata.write(anndata_save_path) + if verbose: + elapsed_time = time() - step_start_time + print(f"Saved anndata object in {elapsed_time:.2f} seconds.") + + if save_cell_masks: + if verbose: + step_start_time = time() + print(f"Computing and saving cell masks...") + # Placeholder for future cell masks implementation as Dask Geopandas Parquet + cell_masks_save_path = save_dir / "segger_cell_boundaries.parquet" + if verbose: + elapsed_time = time() - step_start_time + print(f"Saved cell masks in {elapsed_time:.2f} seconds.") + + + if cleanup: + if verbose: + step_start_time = time() + print(f"Cleaning up intermediate files...") + shutil.rmtree(output_ddf_save_path) + shutil.rmtree(edge_index_save_path) + + + + # Step 6: Save segmentation parameters as a JSON log + log_data = { + "seg_tag": seg_tag, + "score_cut": score_cut, + "use_cc": use_cc, + "receptive_field": receptive_field, + "knn_method": knn_method, + "save_transcripts": save_transcripts, + "save_anndata": save_anndata, + "save_cell_masks": save_cell_masks, + "timestamp": datetime.now().isoformat(), + } + + log_path = save_dir / "segmentation_log.json" + with open(log_path, "w") as log_file: + json.dump(log_data, log_file, indent=4) + + # Step 7: Garbage collection and memory cleanup + torch.cuda.empty_cache() + gc.collect() + + # Total time 
taken for the segmentation process + if verbose: + total_time = time() - start_time + print(f"Total segmentation process completed in {total_time:.2f} seconds.") From b1da3e6a6d11273479a68ea99b2790c2145a83ac Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Thu, 24 Oct 2024 10:10:08 +0200 Subject: [PATCH 142/156] Updated generate_boundaries in prediction.boundary --- src/segger/prediction/boundary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segger/prediction/boundary.py b/src/segger/prediction/boundary.py index e1865f9..f528588 100644 --- a/src/segger/prediction/boundary.py +++ b/src/segger/prediction/boundary.py @@ -328,7 +328,7 @@ def get_cycles(graph: dict): def generate_boundaries(df, x="x_location", y="y_location", cell_id="segger_cell_id"): res = [] group_df = df.groupby(cell_id) - for cell_id, t in tqdm(group_df, total=len(group_df)): + for cell_id, t in tqdm(group_df, total=group_df.ngroups): res.append({"cell_id": cell_id, "length": len(t), "geom": generate_boundary(t, x=x, y=y)}) return gpd.GeoDataFrame( From d5022196a478345a480f95d3246715959ed54b76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Fri, 25 Oct 2024 00:32:23 +0200 Subject: [PATCH 143/156] Handling bytes-based gene names --- src/segger/data/parquet/sample.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index 06ccd7e..6fb7249 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -74,9 +74,9 @@ def __init__( self._boundaries_metadata = None # Setup default embedding for transcripts - self.emb_genes = None + self._emb_genes = None if weights is not None: - self.emb_genes = weights.index.to_list() + self._emb_genes = weights.index.to_list() classes = self.transcripts_metadata["feature_names"] self._transcript_embedding = TranscriptEmbedding(np.array(classes), weights) @@ -171,9 +171,9 @@ def transcripts_metadata(self) -> dict: # Get filtered unique feature names table = pq.read_table(self._transcripts_filepath) names = pc.unique(table[self.settings.transcripts.label]) - if self.emb_genes is not None: + if self._emb_genes is not None: # Filter substring is extended with the genes missing in the embedding - missing_genes = list(set(names.to_pylist()) - set(self.emb_genes)) + missing_genes = list(set(names.to_pylist()) - set(self._emb_genes)) self.settings.transcripts.filter_substrings.extend(missing_genes) pattern = "|".join(self.settings.transcripts.filter_substrings) mask = pc.invert(pc.match_substring_regex(names, pattern)) @@ -531,6 +531,9 @@ def _load_transcripts(self, path: os.PathLike, min_qv: float = 30.0): bounds=bounds, extra_columns=self.settings.transcripts.columns, ) + transcripts[self.settings.transcripts.label] = transcripts[self.settings.transcripts.label].apply( + lambda x: x.decode('utf-8') if isinstance(x, bytes) else x + ) transcripts = utils.filter_transcripts( transcripts, self.settings.transcripts.label, From f203c1f92646b016e9f9823f021e6b5c83cf7b50 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Oct 2024 22:33:24 +0000 Subject: [PATCH 144/156] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/segger/data/parquet/sample.py | 2 +- src/segger/prediction/predict_multigpu.py | 133 ++++++++++++---- 2 files changed, 71 insertions(+), 64 
deletions(-) diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index 6fb7249..5a84810 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -532,7 +532,7 @@ def _load_transcripts(self, path: os.PathLike, min_qv: float = 30.0): extra_columns=self.settings.transcripts.columns, ) transcripts[self.settings.transcripts.label] = transcripts[self.settings.transcripts.label].apply( - lambda x: x.decode('utf-8') if isinstance(x, bytes) else x + lambda x: x.decode("utf-8") if isinstance(x, bytes) else x ) transcripts = utils.filter_transcripts( transcripts, diff --git a/src/segger/prediction/predict_multigpu.py b/src/segger/prediction/predict_multigpu.py index faaad3b..0ac992f 100644 --- a/src/segger/prediction/predict_multigpu.py +++ b/src/segger/prediction/predict_multigpu.py @@ -59,6 +59,7 @@ import dask import random from dask_cuda import LocalCUDACluster + # Setup Dask cluster with 3 workers @@ -107,11 +108,11 @@ def subset_sparse_matrix(sparse_matrix, row_idx, col_idx): Returns: cupyx.scipy.sparse.spmatrix: A new sparse matrix that is a subset of the input matrix. """ - + # Convert indices to CuPy arrays if not already row_idx = cp.asarray(row_idx) col_idx = cp.asarray(col_idx) - + # Ensure sparse matrix is in COO format for easy indexing (you can use CSR/CSC if more optimal) sparse_matrix = sparse_matrix.tocoo() @@ -133,8 +134,6 @@ def subset_sparse_matrix(sparse_matrix, row_idx, col_idx): return coo_matrix((data_filtered, (row_mapped, col_mapped)), shape=(len(row_idx), len(col_idx))) - - def load_model(checkpoint_path: str) -> LitSegger: """ Load a LitSegger model from a checkpoint. @@ -182,13 +181,13 @@ def sort_order(c): def get_similarity_scores( - model: torch.nn.Module, + model: torch.nn.Module, batch: Batch, from_type: str, to_type: str, receptive_field: dict, - knn_method: str = 'cuda', - gpu_id: int = 0 # Added argument for GPU ID + knn_method: str = "cuda", + gpu_id: int = 0, # Added argument for GPU ID ) -> coo_matrix: """ Compute similarity scores between embeddings for 'from_type' and 'to_type' nodes @@ -209,7 +208,7 @@ def get_similarity_scores( with cp.cuda.Device(gpu_id): # Move the batch to the specified GPU - batch = batch.to(f'cuda:{gpu_id}') + batch = batch.to(f"cuda:{gpu_id}") # Step 1: Get embeddings from the model (on GPU) shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] @@ -217,48 +216,44 @@ def get_similarity_scores( edge_index = get_edge_index( batch[to_type].pos[:, :2].cpu(), # 'tx' positions batch[from_type].pos[:, :2].cpu(), # 'bd' positions - k=receptive_field[f'k_{to_type}'], - dist=receptive_field[f'dist_{to_type}'], - method=knn_method + k=receptive_field[f"k_{to_type}"], + dist=receptive_field[f"dist_{to_type}"], + method=knn_method, ) - + # Convert to dense adjacency matrix (on GPU) - edge_index = coo_to_dense_adj( - edge_index.T, - num_nodes=shape[0], - num_nbrs=receptive_field[f'k_{to_type}'] - ) - + edge_index = coo_to_dense_adj(edge_index.T, num_nodes=shape[0], num_nbrs=receptive_field[f"k_{to_type}"]) + with torch.no_grad(): embeddings = model(batch.x_dict, batch.edge_index_dict) - - + def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: m = torch.nn.ZeroPad2d((0, 0, 0, 1)) # pad bottom with zeros similarity = torch.bmm( - m(embeddings[to_type])[edge_index], # 'to' x 'from' neighbors x embed - embeddings[from_type].unsqueeze(-1) # 'to' x embed x 1 - ) # -> 'to' x 'from' neighbors x 1 + m(embeddings[to_type])[edge_index], # 'to' x 'from' neighbors x embed + 
embeddings[from_type].unsqueeze(-1), # 'to' x embed x 1 + ) # -> 'to' x 'from' neighbors x 1 del embeddings # Sigmoid to get most similar 'to_type' neighbor similarity[similarity == 0] = -torch.inf # ensure zero stays zero similarity = F.sigmoid(similarity) # Neighbor-filtered similarity scores # shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] - indices = torch.argwhere(edge_index != -1).T + indices = torch.argwhere(edge_index != -1).T indices[1] = edge_index[edge_index != -1] - rows = cp.fromDlpack(to_dlpack(indices[0,:].to('cuda'))) - columns = cp.fromDlpack(to_dlpack(indices[1,:].to('cuda'))) + rows = cp.fromDlpack(to_dlpack(indices[0, :].to("cuda"))) + columns = cp.fromDlpack(to_dlpack(indices[1, :].to("cuda"))) # print(rows) del indices values = similarity[edge_index != -1].flatten() sparse_result = coo_matrix((cp.fromDlpack(to_dlpack(values)), (rows, columns)), shape=shape) return sparse_result # Free GPU memory after computation + # Call the sparse multiply function sparse_similarity = sparse_multiply(embeddings, edge_index, shape) - + return sparse_similarity @@ -268,10 +263,10 @@ def predict_batch( score_cut: float, receptive_field: Dict[str, float], use_cc: bool = True, - knn_method: str = 'cuda', + knn_method: str = "cuda", edge_index_save_path: Union[str, Path] = None, output_ddf_save_path: Union[str, Path] = None, - gpu_id: int = 0 # Added argument for GPU ID + gpu_id: int = 0, # Added argument for GPU ID ): """ Predict cell assignments for a batch of transcript data using a segmentation model. @@ -299,8 +294,8 @@ def _get_id(): print(gpu_id) with cp.cuda.Device(gpu_id): # Move the batch to the specified GPU - batch = batch.to(f'cuda:{gpu_id}') - lit_segger.model = lit_segger.model.to(f'cuda:{gpu_id}') + batch = batch.to(f"cuda:{gpu_id}") + lit_segger.model = lit_segger.model.to(f"cuda:{gpu_id}") # Extract transcript IDs and initialize a dictionary for assignments transcript_id = batch["tx"].id.cpu().numpy().astype("str") @@ -308,10 +303,12 @@ def _get_id(): if len(batch["bd"].pos) >= 10: while True: - try: - # Step 1: Compute similarity scores between 'tx' (transcripts) and 'bd' (boundaries) - scores = get_similarity_scores(lit_segger.model, batch, "tx", "bd", receptive_field, knn_method=knn_method, gpu_id=gpu_id) - break + try: + # Step 1: Compute similarity scores between 'tx' (transcripts) and 'bd' (boundaries) + scores = get_similarity_scores( + lit_segger.model, batch, "tx", "bd", receptive_field, knn_method=knn_method, gpu_id=gpu_id + ) + break except Exception as e: print(f"This weird error: {e}.") torch.cuda.empty_cache() @@ -337,7 +334,9 @@ def _get_id(): # Step 3: Handle unassigned transcripts with connected components (if use_cc=True) if use_cc: - scores_tx = get_similarity_scores(lit_segger.model, batch, "tx", "tx", receptive_field, knn_method=knn_method, gpu_id=gpu_id) + scores_tx = get_similarity_scores( + lit_segger.model, batch, "tx", "tx", receptive_field, knn_method=knn_method, gpu_id=gpu_id + ) # Stay on GPU and use CuPy sparse matrices no_id_scores = cupyx.scipy.sparse.coo_matrix( @@ -352,11 +351,13 @@ def _get_id(): no_id_scores.eliminate_zeros() # Remove zero entries to keep the matrix sparse # Find unassigned transcripts (those with no segger_cell_id) - no_id = cp.where(cp.asarray(assignments['segger_cell_id'] == None))[0] # Using CuPy to handle None values - + no_id = cp.where(cp.asarray(assignments["segger_cell_id"] == None))[ + 0 + ] # Using CuPy to handle None values + if len(no_id) > 0: # Only compute if there are unassigned 
transcripts # Apply score cut-off to unassigned transcripts - no_id_scores = subset_sparse_matrix(no_id_scores, no_id, no_id) + no_id_scores = subset_sparse_matrix(no_id_scores, no_id, no_id) no_id_scores.data[no_id_scores.data < score_cut] = 0 # Apply threshold no_id_scores.eliminate_zeros() # Clean up zeros @@ -371,14 +372,17 @@ def _get_id(): # while True: try: # # Save edge_index using CuDF and Dask-CuDF for GPU acceleration - edge_index_ddf = delayed(dd.from_pandas)(pd.DataFrame({'source': source_nodes, 'target': target_nodes}), npartitions=1) + edge_index_ddf = delayed(dd.from_pandas)( + pd.DataFrame({"source": source_nodes, "target": target_nodes}), npartitions=1 + ) # Use delayed for asynchronous disk writing of edge_index in Dask DataFrame - delayed_write_edge_index = delayed(edge_index_ddf.to_parquet)(edge_index_save_path, append=True, ignore_divisions=True) + delayed_write_edge_index = delayed(edge_index_ddf.to_parquet)( + edge_index_save_path, append=True, ignore_divisions=True + ) delayed_write_edge_index.compute() # Schedule writing # break except Exception as e: print(f"resource access fault, trying again.") - assignments = { "transcript_id": assignments["transcript_id"].astype("str"), @@ -388,9 +392,9 @@ def _get_id(): } # Step 4: Convert assignments to Dask-CuDF DataFrame for this batch # batch_ddf = dask_cudf.from_cudf(cudf.DataFrame(assignments), npartitions=1) - if len(assignments['transcript_id']) > 10: + if len(assignments["transcript_id"]) > 10: # while True: - try: + try: batch_ddf = delayed(dd.from_pandas)(pd.DataFrame(assignments), npartitions=1) # Save the updated `output_ddf` asynchronously using Dask delayed @@ -420,9 +424,9 @@ def segment( receptive_field: dict = {"k_bd": 4, "dist_bd": 10, "k_tx": 5, "dist_tx": 3}, knn_method: str = "cuda", verbose: bool = False, - gpu_ids: list = ['0'], + gpu_ids: list = ["0"], cleanup: bool = True, - **anndata_kwargs + **anndata_kwargs, ) -> None: """ Perform segmentation using the model, save transcripts, AnnData, and cell masks as needed, @@ -493,7 +497,7 @@ def segment( # # Connect the client to the cluster client = Client(cluster) - + @dask.delayed def process_batch(batch, gpu_id): # Assume you're using CuPy, and you need to use a specific GPU @@ -506,43 +510,49 @@ def process_batch(batch, gpu_id): knn_method=knn_method, edge_index_save_path=edge_index_save_path, output_ddf_save_path=output_ddf_save_path, - gpu_id=gpu_id + gpu_id=gpu_id, ) - + # this is to off-load some tasks from the computation graph for j in range(len(dm.train) // 100): print(len(dm.train)) ind_min = j * 100 - ind_max = min((j+1) * 100, len(dm.train)) - delayed_tasks = [process_batch(batch, gpu_ids[i % len(gpu_ids)]) for i, batch in enumerate(dm.train[ind_min:ind_max])] + ind_max = min((j + 1) * 100, len(dm.train)) + delayed_tasks = [ + process_batch(batch, gpu_ids[i % len(gpu_ids)]) for i, batch in enumerate(dm.train[ind_min:ind_max]) + ] future = dask.persist(*delayed_tasks) progress(future) dask.compute(*future) torch.cuda.empty_cache() cp.get_default_memory_pool().free_all_blocks() - + for j in range(len(dm.val) // 50): ind_min = j * 50 - ind_max = min((j+1) * 50, len(dm.val)) - delayed_tasks = [process_batch(batch, gpu_ids[i % len(gpu_ids)]) for i, batch in enumerate(dm.val[ind_min:ind_max])] + ind_max = min((j + 1) * 50, len(dm.val)) + delayed_tasks = [ + process_batch(batch, gpu_ids[i % len(gpu_ids)]) for i, batch in enumerate(dm.val[ind_min:ind_max]) + ] future = dask.persist(*delayed_tasks) progress(future) dask.compute(*future) 
torch.cuda.empty_cache() cp.get_default_memory_pool().free_all_blocks() - + for j in range(len(dm.test) // 50): ind_min = j * 50 - ind_max = min((j+1) * 50, len(dm.test)) - delayed_tasks = [process_batch(batch, gpu_ids[i % len(gpu_ids)]) for i, batch in enumerate(dm.test[ind_min:ind_max])] + ind_max = min((j + 1) * 50, len(dm.test)) + delayed_tasks = [ + process_batch(batch, gpu_ids[i % len(gpu_ids)]) for i, batch in enumerate(dm.test[ind_min:ind_max]) + ] future = dask.persist(*delayed_tasks) progress(future) dask.compute(*future) torch.cuda.empty_cache() cp.get_default_memory_pool().free_all_blocks() - + client.close() - + # pqdm(delayed_tasks, n_jobs=len(gpu_ids), argument_type='delayed', progress_bar=True) # dask.compute(*delayed_tasks) # delayed_tasks = [process_batch(batch, gpu_ids[i % len(gpu_ids)]) for i, batch in enumerate(batches)] @@ -713,7 +723,7 @@ def process_batch(batch, gpu_id): compression="snappy", # Use snappy compression for speed write_index=False, # Skip writing index if not needed append=False, # Set to True if you're appending to an existing Parquet file - overwrite=True + overwrite=True, ) # Dask handles Parquet well if verbose: elapsed_time = time() - step_start_time @@ -740,15 +750,12 @@ def process_batch(batch, gpu_id): elapsed_time = time() - step_start_time print(f"Saved cell masks in {elapsed_time:.2f} seconds.") - if cleanup: if verbose: step_start_time = time() print(f"Cleaning up intermediate files...") shutil.rmtree(output_ddf_save_path) shutil.rmtree(edge_index_save_path) - - # Step 6: Save segmentation parameters as a JSON log log_data = { From cd10f93a1d88b2c52b3ae9d467bf85b72744ac1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Fri, 25 Oct 2024 13:04:09 +0200 Subject: [PATCH 145/156] Handling bytes-based gene names while filtering embeddings --- src/segger/data/parquet/sample.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index 6fb7249..d002a52 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -173,7 +173,8 @@ def transcripts_metadata(self) -> dict: names = pc.unique(table[self.settings.transcripts.label]) if self._emb_genes is not None: # Filter substring is extended with the genes missing in the embedding - missing_genes = list(set(names.to_pylist()) - set(self._emb_genes)) + names_str = [x.decode("utf-8") if isinstance(x, bytes) else x for x in names.to_pylist()] + missing_genes = list(set(names_str) - set(self._emb_genes)) self.settings.transcripts.filter_substrings.extend(missing_genes) pattern = "|".join(self.settings.transcripts.filter_substrings) mask = pc.invert(pc.match_substring_regex(names, pattern)) From a6187939a6b721fd430ad8bac3614b084ada5ef1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Fri, 25 Oct 2024 16:31:31 +0200 Subject: [PATCH 146/156] Handling bytes-based gene names assertion fix --- src/segger/data/parquet/sample.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index 1a4c3c8..98d7fbd 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -178,7 +178,8 @@ def transcripts_metadata(self) -> dict: self.settings.transcripts.filter_substrings.extend(missing_genes) pattern = "|".join(self.settings.transcripts.filter_substrings) mask = pc.invert(pc.match_substring_regex(names, pattern)) - metadata["feature_names"] = pc.filter(names, 
mask).tolist() + filtered_names = pc.filter(names, mask).to_pylist() + metadata["feature_names"] = [x.decode("utf-8") if isinstance(x, bytes) else x for x in filtered_names] self._transcripts_metadata = metadata return self._transcripts_metadata From f2df878ec2c45f31da744b58ff5ae59879121c11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Sat, 26 Oct 2024 11:36:15 +0200 Subject: [PATCH 147/156] Submit job fix --- scripts/submit_job.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/submit_job.py b/scripts/submit_job.py index 0dead12..139ed65 100644 --- a/scripts/submit_job.py +++ b/scripts/submit_job.py @@ -223,13 +223,13 @@ def run_prediction(): "--file_format", config["prediction"]["file_format"], "--k_bd", - str(config["preprocessing"]["k_bd"]), + str(config["prediction"]["k_bd"]), "--dist_bd", - str(config["preprocessing"]["dist_bd"]), + str(config["prediction"]["dist_bd"]), "--k_tx", - str(config["preprocessing"]["k_tx"]), + str(config["prediction"]["k_tx"]), "--dist_tx", - str(config["preprocessing"]["dist_tx"]), + str(config["prediction"]["dist_tx"]), ] ) From 1ef71697ed1facf8b6bdb0942d33e0162ad75012 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Mon, 28 Oct 2024 12:19:43 +0100 Subject: [PATCH 148/156] fixes #62 with knn_method = 'kd_tree'. --- src/segger/prediction/predict_parquet.py | 26 ++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/segger/prediction/predict_parquet.py b/src/segger/prediction/predict_parquet.py index ad235a3..d507599 100644 --- a/src/segger/prediction/predict_parquet.py +++ b/src/segger/prediction/predict_parquet.py @@ -213,15 +213,25 @@ def get_similarity_scores( # Step 1: Get embeddings from the model (on GPU) shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] - + + if knn_method == 'kd_tree': # Compute edge indices using knn method (still on GPU) - edge_index = get_edge_index( - batch[to_type].pos[:, :2], # 'tx' positions - batch[from_type].pos[:, :2], # 'bd' positions - k=receptive_field[f"k_{to_type}"], - dist=receptive_field[f"dist_{to_type}"], - method=knn_method, - ) + edge_index = get_edge_index( + batch[to_type].pos[:, :2].cpu(), # 'tx' positions + batch[from_type].pos[:, :2].cpu(), # 'bd' positions + k=receptive_field[f"k_{to_type}"], + dist=receptive_field[f"dist_{to_type}"], + method=knn_method, + ) + else: + edge_index = get_edge_index( + batch[to_type].pos[:, :2], # 'tx' positions + batch[from_type].pos[:, :2], # 'bd' positions + k=receptive_field[f"k_{to_type}"], + dist=receptive_field[f"dist_{to_type}"], + method=knn_method, + ) + # Convert to dense adjacency matrix (on GPU) edge_index = coo_to_dense_adj(edge_index.T, num_nodes=shape[0], num_nbrs=receptive_field[f"k_{to_type}"]) From dc0b63188364b4168cedf047d5785fd1f59125a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=81NIEL=20UNYI?= Date: Mon, 28 Oct 2024 12:24:58 +0100 Subject: [PATCH 149/156] Log number of missing genes if embeddings are available --- src/segger/data/parquet/sample.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py index 98d7fbd..a6d7db6 100644 --- a/src/segger/data/parquet/sample.py +++ b/src/segger/data/parquet/sample.py @@ -175,6 +175,7 @@ def transcripts_metadata(self) -> dict: # Filter substring is extended with the genes missing in the embedding names_str = [x.decode("utf-8") if isinstance(x, bytes) else x for x in names.to_pylist()] 
missing_genes = list(set(names_str) - set(self._emb_genes)) + logging.warning(f"Number of missing genes: {len(missing_genes)}") self.settings.transcripts.filter_substrings.extend(missing_genes) pattern = "|".join(self.settings.transcripts.filter_substrings) mask = pc.invert(pc.match_substring_regex(names, pattern)) From f31bbe222bc2046a28bca2da7a1730ac61451a83 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 11:25:57 +0000 Subject: [PATCH 150/156] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/segger/prediction/predict_parquet.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/segger/prediction/predict_parquet.py b/src/segger/prediction/predict_parquet.py index d507599..2fa21aa 100644 --- a/src/segger/prediction/predict_parquet.py +++ b/src/segger/prediction/predict_parquet.py @@ -213,9 +213,9 @@ def get_similarity_scores( # Step 1: Get embeddings from the model (on GPU) shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] - - if knn_method == 'kd_tree': - # Compute edge indices using knn method (still on GPU) + + if knn_method == "kd_tree": + # Compute edge indices using knn method (still on GPU) edge_index = get_edge_index( batch[to_type].pos[:, :2].cpu(), # 'tx' positions batch[from_type].pos[:, :2].cpu(), # 'bd' positions @@ -231,7 +231,6 @@ def get_similarity_scores( dist=receptive_field[f"dist_{to_type}"], method=knn_method, ) - # Convert to dense adjacency matrix (on GPU) edge_index = coo_to_dense_adj(edge_index.T, num_nodes=shape[0], num_nbrs=receptive_field[f"k_{to_type}"]) From 1165ec0c010cb03b2b3bd3f3cec4333de1d24243 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 23:50:12 +0000 Subject: [PATCH 151/156] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/asottile/blacken-docs: 1.19.0 → 1.19.1](https://github.com/asottile/blacken-docs/compare/1.19.0...1.19.1) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 598ff15..4620eaa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,6 +17,6 @@ repos: hooks: - id: prettier - repo: https://github.com/asottile/blacken-docs - rev: 1.19.0 + rev: 1.19.1 hooks: - id: blacken-docs From 51872ce8d5f7156300d897481c336b34ec4d48d6 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Tue, 29 Oct 2024 17:24:47 +0100 Subject: [PATCH 152/156] fixed the problem of keeping low scores. 
--- src/segger/prediction/predict_parquet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/segger/prediction/predict_parquet.py b/src/segger/prediction/predict_parquet.py index 2fa21aa..ab95dfa 100644 --- a/src/segger/prediction/predict_parquet.py +++ b/src/segger/prediction/predict_parquet.py @@ -394,7 +394,9 @@ def _get_id(): } # Step 4: Convert assignments to Dask-CuDF DataFrame for this batch # batch_ddf = dask_cudf.from_cudf(cudf.DataFrame(assignments), npartitions=1) - batch_ddf = delayed(dd.from_pandas)(pd.DataFrame(assignments), npartitions=1) + assignments = pd.DataFrame(assignments) + assignments = assignments[assignments['bound'] == 1] + batch_ddf = delayed(dd.from_pandas)(assignments, npartitions=1) # Save the updated `output_ddf` asynchronously using Dask delayed delayed_write_output_ddf = delayed(batch_ddf.to_parquet)( From 51cf0ce0c3a027f6893801897770969c406ba855 Mon Sep 17 00:00:00 2001 From: Elyas Heidari <55977725+EliHei2@users.noreply.github.com> Date: Fri, 29 Nov 2024 16:37:42 +0100 Subject: [PATCH 153/156] Update predict_parquet.py, median cut for tx-tx --- src/segger/prediction/predict_parquet.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/segger/prediction/predict_parquet.py b/src/segger/prediction/predict_parquet.py index ab95dfa..4d6b0d2 100644 --- a/src/segger/prediction/predict_parquet.py +++ b/src/segger/prediction/predict_parquet.py @@ -186,6 +186,7 @@ def get_similarity_scores( from_type: str, to_type: str, receptive_field: dict, + compute_sigmoid: bool = Ture, knn_method: str = "cuda", gpu_id: int = 0, # Added argument for GPU ID ) -> coo_matrix: @@ -248,7 +249,8 @@ def sparse_multiply(embeddings, edge_index, shape) -> coo_matrix: del embeddings # Sigmoid to get most similar 'to_type' neighbor similarity[similarity == 0] = -torch.inf # ensure zero stays zero - similarity = F.sigmoid(similarity) + if compute_sigmoid: + similarity = F.sigmoid(similarity) # Neighbor-filtered similarity scores # shape = batch[from_type].x.shape[0], batch[to_type].x.shape[0] indices = torch.argwhere(edge_index != -1).T @@ -342,7 +344,7 @@ def _get_id(): # Step 3: Handle unassigned transcripts with connected components (if use_cc=True) if use_cc: scores_tx = get_similarity_scores( - lit_segger.model, batch, "tx", "tx", receptive_field, knn_method=knn_method, gpu_id=gpu_id + lit_segger.model, batch, "tx", "tx", receptive_field, compute_sigmoid = False, knn_method=knn_method, gpu_id=gpu_id ) # Stay on GPU and use CuPy sparse matrices @@ -350,8 +352,10 @@ def _get_id(): (scores_tx.data, (scores_tx.row, scores_tx.col)), shape=scores_tx.shape ) + score_cut_tx = no_id_scores.data.median() + # Apply threshold on GPU - no_id_scores.data[no_id_scores.data < score_cut] = 0 # Apply threshold + no_id_scores.data[no_id_scores.data < score_cut_tx] = 0 # Apply threshold # Zero out the diagonal on GPU no_id_scores = zero_out_diagonal_gpu(no_id_scores) From 30218de34bbda7eac4eb151b0d85bdf3b2151b4f Mon Sep 17 00:00:00 2001 From: daniel-unyi-42 <63173826+daniel-unyi-42@users.noreply.github.com> Date: Sat, 30 Nov 2024 13:53:09 +0100 Subject: [PATCH 154/156] Fix typo in predict_parquet.py --- src/segger/prediction/predict_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segger/prediction/predict_parquet.py b/src/segger/prediction/predict_parquet.py index 4d6b0d2..24ea19d 100644 --- a/src/segger/prediction/predict_parquet.py +++ b/src/segger/prediction/predict_parquet.py @@ -186,7 +186,7 @@ def 
get_similarity_scores( from_type: str, to_type: str, receptive_field: dict, - compute_sigmoid: bool = Ture, + compute_sigmoid: bool = True, knn_method: str = "cuda", gpu_id: int = 0, # Added argument for GPU ID ) -> coo_matrix: From 6418d118f6034c8c5689c490ef5ea37ac8d806b4 Mon Sep 17 00:00:00 2001 From: daniel-unyi-42 <63173826+daniel-unyi-42@users.noreply.github.com> Date: Sat, 30 Nov 2024 13:55:48 +0100 Subject: [PATCH 155/156] Fix io.py --- src/segger/data/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segger/data/io.py b/src/segger/data/io.py index 3bb52e2..3e9c566 100644 --- a/src/segger/data/io.py +++ b/src/segger/data/io.py @@ -194,7 +194,7 @@ def load_transcripts( transcripts_df = transcripts_df[transcripts_df[self.keys.FEATURE_NAME.value].isin(valid_genes)] final_count = delayed(lambda df: df.shape[0])(transcripts_df) if self.verbose: - print(f"Dropped {initial_count - final_count} transcripts not found in {key} embedding.") + print(f"Dropped {initial_count - final_count} transcripts not found in embedding.") # Ensure that the 'OVERLAPS_BOUNDARY' column is boolean if it exists if self.keys.OVERLAPS_BOUNDARY.value in transcripts_df.columns: From 98861e2d4532e0a480879cdb86c8b8b292542112 Mon Sep 17 00:00:00 2001 From: daniel-unyi-42 <63173826+daniel-unyi-42@users.noreply.github.com> Date: Sat, 30 Nov 2024 14:39:04 +0100 Subject: [PATCH 156/156] Fix numpy error in predict_parquet.py --- src/segger/prediction/predict_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segger/prediction/predict_parquet.py b/src/segger/prediction/predict_parquet.py index 24ea19d..4288337 100644 --- a/src/segger/prediction/predict_parquet.py +++ b/src/segger/prediction/predict_parquet.py @@ -352,7 +352,7 @@ def _get_id(): (scores_tx.data, (scores_tx.row, scores_tx.col)), shape=scores_tx.shape ) - score_cut_tx = no_id_scores.data.median() + score_cut_tx = np.median(no_id_scores.data) # Apply threshold on GPU no_id_scores.data[no_id_scores.data < score_cut_tx] = 0 # Apply threshold
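For orientation, here is a minimal usage sketch of the multi-GPU `segment` entry point assembled by the patches above. It is illustrative only: the import path is inferred from the file location `src/segger/prediction/predict_multigpu.py`, all paths and the tag are placeholders, and a trained `LitSegger` checkpoint plus an already prepared `SeggerDataModule` are assumed to exist.

    from pathlib import Path
    from segger.prediction.predict_multigpu import load_model, segment  # import path inferred from the file location

    # Load the trained LitSegger model from a Lightning checkpoints directory (placeholder path)
    model = load_model("path/to/models/lightning_logs/version_0/checkpoints")

    dm = ...  # a SeggerDataModule prepared and set up beforehand (placeholder)

    segment(
        model,
        dm,
        save_dir=Path("path/to/segmentation_output"),
        seg_tag="segger_run",
        transcript_file=Path("path/to/xenium/transcripts.parquet"),
        score_cut=0.5,  # similarity threshold for assigning transcripts to cells
        use_cc=True,  # group remaining unassigned transcripts via connected components
        receptive_field={"k_bd": 4, "dist_bd": 10, "k_tx": 5, "dist_tx": 3},
        knn_method="cuda",  # or "kd_tree", as enabled by PATCH 148
        gpu_ids=["0"],  # one Dask-CUDA worker is started per listed GPU
        verbose=True,
    )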