Switch to using Python to convert cluster files from wide to long format (#76)

* Create Python version of wide-to-long cluster file conversion Rscript
* Call python version of script
* Add permission to execute python script
* Specify python3
* Update container used for tests
ebi-gene-expression-group · Oct 1, 2024 · d841ac3 · d841ac3
1 parent 72ffd49
commit d841ac3
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 2 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM quay.io/ebigxa/atlas-db-scxa-base:0.15.0.0
+FROM quay.io/ebigxa/atlas-db-scxa-base:0.15.1.0
 # debian
 
 ADD bin/* /usr/local/bin/

diff --git a/bin/load_db_scxa_cell_clusters.sh b/bin/load_db_scxa_cell_clusters.sh
@@ -39,7 +39,7 @@ cleanup() {
 cleanup
 
 print_log "Clusters: Create data file for $EXP_ID..."
-wideSCCluster2longSCCluster.R -c $EXPERIMENT_CLUSTERS_FILE -e $EXP_ID -o $clustersToLoad
+wideSCCluster2longSCCluster.py -c $EXPERIMENT_CLUSTERS_FILE -e $EXP_ID -o $clustersToLoad
 
 # Delete clusters table content for current EXP_ID
 print_log "clusters table: Delete rows for $EXP_ID:"

diff --git a/bin/wideSCCluster2longSCCluster.py b/bin/wideSCCluster2longSCCluster.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+import csv
+
+# Parse command-line arguments: input clusters file, experiment accession, output path
+parser = argparse.ArgumentParser()
+parser.add_argument("-c", "--clusters-file", dest="clusters_path", help="Path to clusters file")
+parser.add_argument("-e", "--experiment-accession", dest="exp_acc", help="Experiment accession")
+parser.add_argument("-o", "--output", dest="output_path", help="Output file path")
+args = parser.parse_args()
+
+# Read the tab-separated clusters file (first row is the header), skipping the "sel.K" column
+clusters_wide = pd.read_csv(args.clusters_path, sep='\t', header=0, usecols=lambda x: x != "sel.K")
+
+# Reshape wide to long: each column other than 'K' yields one (cell_id, cluster_id) row per K value
+clusters_long = pd.melt(clusters_wide, id_vars=['K'], var_name='cell_id', value_name='cluster_id')
+
+# Add a constant experiment_accession column holding the -e/--experiment-accession value
+clusters_long['experiment_accession'] = args.exp_acc
+
+# Rename the 'K' column to lowercase 'k' (in place)
+clusters_long.rename(columns={'K': 'k'}, inplace=True)
+
+# Columns to write, in output order
+columns = ['experiment_accession', 'cell_id', 'k', 'cluster_id']
+
+# Write the selected columns as CSV without the row index, quoting every non-numeric field
+clusters_long.to_csv(args.output_path, index=False, columns=columns, quoting=csv.QUOTE_NONNUMERIC)