From d841ac3a6987f4d502db67dd58d7b3f5ed99462e Mon Sep 17 00:00:00 2001
From: Iris Diana Yu <17606346+irisdianauy@users.noreply.github.com>
Date: Tue, 1 Oct 2024 16:33:19 +0100
Subject: [PATCH] Switch to using Python to convert cluster files from wide to long format (#76)

* Create Python version of wide-to-long cluster file conversion Rscript

* Call python version of script

* Add permission to execute python script

* Specify python3

* Update container used for tests
---
 Dockerfile                         |  2 +-
 bin/load_db_scxa_cell_clusters.sh  |  2 +-
 bin/wideSCCluster2longSCCluster.py | 30 ++++++++++++++++++++++++++++++
 3 files changed, 32 insertions(+), 2 deletions(-)
 create mode 100755 bin/wideSCCluster2longSCCluster.py

diff --git a/Dockerfile b/Dockerfile
index e1abbaf..826f165 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM quay.io/ebigxa/atlas-db-scxa-base:0.15.0.0
+FROM quay.io/ebigxa/atlas-db-scxa-base:0.15.1.0
 # debian
 
 ADD bin/* /usr/local/bin/
diff --git a/bin/load_db_scxa_cell_clusters.sh b/bin/load_db_scxa_cell_clusters.sh
index fcf0bd3..090fd87 100755
--- a/bin/load_db_scxa_cell_clusters.sh
+++ b/bin/load_db_scxa_cell_clusters.sh
@@ -39,7 +39,7 @@ cleanup() {
 cleanup
 
 print_log "Clusters: Create data file for $EXP_ID..."
-wideSCCluster2longSCCluster.R -c $EXPERIMENT_CLUSTERS_FILE -e $EXP_ID -o $clustersToLoad
+wideSCCluster2longSCCluster.py -c $EXPERIMENT_CLUSTERS_FILE -e $EXP_ID -o $clustersToLoad
 
 # Delete clusters table content for current EXP_ID
 print_log "clusters table: Delete rows for $EXP_ID:"
diff --git a/bin/wideSCCluster2longSCCluster.py b/bin/wideSCCluster2longSCCluster.py
new file mode 100755
index 0000000..5d5134c
--- /dev/null
+++ b/bin/wideSCCluster2longSCCluster.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+import csv
+
+# Parse command-line arguments
+parser = argparse.ArgumentParser()
+parser.add_argument("-c", "--clusters-file", dest="clusters_path", help="Path to clusters file")
+parser.add_argument("-e", "--experiment-accession", dest="exp_acc", help="Experiment accession")
+parser.add_argument("-o", "--output", dest="output_path", help="Output file path")
+args = parser.parse_args()
+
+# Read clusters file
+clusters_wide = pd.read_csv(args.clusters_path, sep='\t', header=0, usecols=lambda x: x != "sel.K")
+
+# Reshape data from wide to long format
+clusters_long = pd.melt(clusters_wide, id_vars=['K'], var_name='cell_id', value_name='cluster_id')
+
+# Add experiment accession column
+clusters_long['experiment_accession'] = args.exp_acc
+
+# Rename 'K' column to 'k'
+clusters_long.rename(columns={'K': 'k'}, inplace=True)
+
+# Select and reorder columns
+columns = ['experiment_accession', 'cell_id', 'k', 'cluster_id']
+
+# Write output to file
+clusters_long.to_csv(args.output_path, index=False, columns=columns, quoting=csv.QUOTE_NONNUMERIC)