From d841ac3a6987f4d502db67dd58d7b3f5ed99462e Mon Sep 17 00:00:00 2001
From: Iris Diana Yu <17606346+irisdianauy@users.noreply.github.com>
Date: Tue, 1 Oct 2024 16:33:19 +0100
Subject: [PATCH] Switch to using Python to convert cluster files from wide to
 long format (#76)

* Create Python version of wide-to-long cluster file conversion Rscript

* Call python version of script

* Add permission to execute python script

* Specify python3

* Update container used for tests
---
 Dockerfile                         |  2 +-
 bin/load_db_scxa_cell_clusters.sh  |  2 +-
 bin/wideSCCluster2longSCCluster.py | 30 ++++++++++++++++++++++++++++++
 3 files changed, 32 insertions(+), 2 deletions(-)
 create mode 100755 bin/wideSCCluster2longSCCluster.py

diff --git a/Dockerfile b/Dockerfile
index e1abbaf..826f165 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM quay.io/ebigxa/atlas-db-scxa-base:0.15.0.0
+FROM quay.io/ebigxa/atlas-db-scxa-base:0.15.1.0
 # debian
 
 ADD bin/* /usr/local/bin/
diff --git a/bin/load_db_scxa_cell_clusters.sh b/bin/load_db_scxa_cell_clusters.sh
index fcf0bd3..090fd87 100755
--- a/bin/load_db_scxa_cell_clusters.sh
+++ b/bin/load_db_scxa_cell_clusters.sh
@@ -39,7 +39,7 @@ cleanup() {
 cleanup
 
 print_log "Clusters: Create data file for $EXP_ID..."
-wideSCCluster2longSCCluster.R -c $EXPERIMENT_CLUSTERS_FILE -e $EXP_ID -o $clustersToLoad
+wideSCCluster2longSCCluster.py -c $EXPERIMENT_CLUSTERS_FILE -e $EXP_ID -o $clustersToLoad
 
 # Delete clusters table content for current EXP_ID
 print_log "clusters table: Delete rows for $EXP_ID:"
diff --git a/bin/wideSCCluster2longSCCluster.py b/bin/wideSCCluster2longSCCluster.py
new file mode 100755
index 0000000..5d5134c
--- /dev/null
+++ b/bin/wideSCCluster2longSCCluster.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+import csv
+
+# Parse command-line arguments. All three options are required: without them
+# pandas fails obscurely, no file is written, or the accession column is empty.
+parser = argparse.ArgumentParser(description="Convert a wide clusters file to long format")
+parser.add_argument("-c", "--clusters-file", dest="clusters_path", required=True, help="Path to clusters file")
+parser.add_argument("-e", "--experiment-accession", dest="exp_acc", required=True, help="Experiment accession")
+parser.add_argument("-o", "--output", dest="output_path", required=True, help="Output file path")
+args = parser.parse_args()
+
+# Read the tab-separated clusters file, skipping the "sel.K" column
+clusters_wide = pd.read_csv(args.clusters_path, sep='\t', header=0, usecols=lambda x: x != "sel.K")
+
+# Reshape data from wide to long format: every column other than 'K' becomes
+# a (cell_id, cluster_id) row for the corresponding K value
+clusters_long = pd.melt(clusters_wide, id_vars=['K'], var_name='cell_id', value_name='cluster_id')
+
+# Add experiment accession column and rename 'K' to 'k'
+clusters_long['experiment_accession'] = args.exp_acc
+clusters_long.rename(columns={'K': 'k'}, inplace=True)
+
+# Column selection and order for the output file
+columns = ['experiment_accession', 'cell_id', 'k', 'cluster_id']
+
+# Write output as CSV without the index, quoting all non-numeric fields
+clusters_long.to_csv(args.output_path, index=False, columns=columns, quoting=csv.QUOTE_NONNUMERIC)