picardmetrics

#!/usr/bin/env bash
# picardmetrics
# Kamil Slowikowski

# Version string (number and date) that main() prints on every invocation.
PICARDMETRICS_VERSION="0.2.4 2016-07-06"

# Main functions. ------------------------------------------------------------

# Entry point: print the version banner and dispatch to a subcommand.
#
# Arguments:
#   $1   - subcommand: run | collate | refFlat | rRNA
#   $2.. - arguments of the subcommand
# Outputs:
#   The version banner on stdout, then either the subcommand's output or a
#   usage summary when the subcommand is missing or unknown.
main() {
  echo "picardmetrics version $PICARDMETRICS_VERSION"

  # "$@" is quoted everywhere so that arguments containing whitespace
  # (for example a path such as "my data/sample.bam") reach the subcommand
  # as a single word.
  case "$1" in
    collate)
      shift
      main_collate "$@"
      ;;
    run)
      shift
      main_run "$@"
      ;;
    refFlat)
      # main_refFlat reads the GTF path from $2, so the subcommand name is
      # deliberately left in place as $1.
      main_refFlat "$@"
      ;;
    rRNA)
      # main_rRNA reads the GTF path from $2 as well.
      main_rRNA "$@"
      ;;
    *)
      echo "Usage: picardmetrics COMMAND"
      echo "  run         Run the Picard tools on a given BAM file."
      echo "  collate     Collate metrics files for multiple BAM files."
      echo "  refFlat     Create a .refFlat file for CollectRnaSeqMetrics."
      echo "  rRNA        Create a .rRNA.list file for CollectRnaSeqMetrics."
      ;;
  esac
}

# Run the Picard tools on a single BAM file ("picardmetrics run").
#
# Usage: main_run [-f FILE] [-o DIR] [-r] [-k] <file.bam>
# Globals written: PICARDMETRICS_CONFIG, OUT_DIR, RNASEQ, KEEP, TEMP_DIR
# Globals read:    REFERENCE_SEQUENCE, TEMP_DIR, USER
# Returns 0 after printing the usage text when called without arguments.
# Exits with status 1 on an invalid option, a missing option argument, an
# invalid BAM file, or a failed file-system operation.
main_run() {
  # Ensure that the script was called with correct arguments.
  if [[ "$1" == "" ]]; then
    echo "Usage: picardmetrics run [-f FILE] [-o DIR] [-r] <file.bam>"
    echo "  -f FILE     The configuration file. (Default: ./picardmetrics.conf)"
    echo "  -o DIR      Write output files in this directory. (Default: .)"
    echo "  -r          The BAM file has RNA-seq reads. (Default: false)"
    echo "  -k          Keep the output BAM file. (Default: false)"
    return 0
  fi

  # The configuration file for picardmetrics.
  PICARDMETRICS_CONFIG=""

  # By default, write output to the user's current working directory.
  OUT_DIR="."

  # Assume the input .bam file does not come from an RNA-seq experiment.
  RNASEQ=""

  # By default, the sorted and deduplicated .bam file will be deleted after
  # metrics are computed. This option allows you to keep it instead.
  KEEP=""

  local opt
  local OPTIND=1
  while getopts ":f:o:rk" opt; do
    case "$opt" in
      f)
        PICARDMETRICS_CONFIG="$OPTARG"
        ;;
      o)
        OUT_DIR="$OPTARG"
        ;;
      r)
        RNASEQ=1
        ;;
      k)
        KEEP=1
        ;;
      \?)
        warn "Invalid option: -$OPTARG"
        exit 1
        ;;
      :)
        # Stop here instead of continuing with a half-parsed command line.
        warn "Option -$OPTARG requires an argument."
        exit 1
        ;;
    esac
  done

  # Load the configuration variables.
  configure_run

  # Get the first positional argument after the option flags.
  local bam="${@:$OPTIND:1}"

  # Ensure the BAM file is valid.
  if ! is_bam "$bam"; then
    warn "ERROR: Not a .bam file: '$bam'"
    exit 1
  elif bam_incomplete "$bam"; then
    warn "ERROR: Missing EOF marker: '$bam'"
    exit 1
  fi

  # Get the absolute path, but do not resolve symlinks. This must happen
  # before the 'cd' below, otherwise a relative path would no longer resolve.
  bam="$(bash_realpath "$bam")"

  # Make the output directory if it does not exist.
  if ! mkdir -p "$OUT_DIR"; then
    warn "ERROR: mkdir -p '$OUT_DIR'"
    exit 1
  fi

  # Perform all actions in the output directory.
  if ! cd "$OUT_DIR"; then
    warn "ERROR: cd '$OUT_DIR'"
    exit 1
  fi

  warn -e "$(DATE)\tSTART\t$bam"

  # Temporary directory with intermediate files. The fallback is chosen
  # before the first Picard call so that every tool receives a usable
  # TMP_DIR value.
  if [[ ! -d "$TEMP_DIR" ]]; then
    warn "Setting TEMP_DIR=/tmp"
    warn "Please ensure disk space is sufficient with 'df -h /tmp'"
    TEMP_DIR=/tmp
  fi

  # Create a sequence dictionary needed for ReorderSam.
  create_sequence_dictionary "$REFERENCE_SEQUENCE"

  # Everything created by this process lives below temp_root; the working
  # directory underneath it mimics the BAM file directory.
  local temp_root="$TEMP_DIR/picardmetrics_${USER}_${$}"
  TEMP_DIR="$temp_root/$(dirname "$bam")"
  if ! mkdir -p "$TEMP_DIR"; then
    warn "ERROR: mkdir -p '$TEMP_DIR'"
    exit 1
  fi

  # Temporary BAM file that is deleted by default.
  local temp_bam="$TEMP_DIR/$(basename "$bam")"

  if ! ln -sf "$bam" "$temp_bam"; then
    warn "ERROR: ln -sf '$bam' '$temp_bam'"
    exit 1
  fi

  # Run all of the Picard tools.
  compute_metrics "$temp_bam" "$RNASEQ"

  if [[ "$KEEP" = "" ]]; then
    # Delete the temporary BAM file after computing metrics.
    if ! rm -f "$temp_bam"; then
      warn "ERROR: rm -f '$temp_bam'"
      exit 1
    fi
  else
    # Otherwise, move it to the output folder (the current directory).
    local dup_bam
    dup_bam="$(basename "${bam%.bam}.MarkDuplicates.bam")"
    if ! mv "$temp_bam" "$dup_bam"; then
      warn "ERROR: mv '$temp_bam' '$dup_bam'"
      exit 1
    fi
  fi

  # Delete the whole per-process temporary tree, not only its deepest
  # directory, so that no empty parent directories are left behind.
  if ! rm -rf "$temp_root"; then
    warn "ERROR: rm -rf '$temp_root'"
    exit 1
  fi

  warn -e "$(DATE)\tDONE\t$bam"
}

# Collate the picardmetrics output files of many samples
# ("picardmetrics collate").
#
# Arguments:
#   $1 - PREFIX of the collated files; may include a directory part, which
#        is created when missing
#   $2 - DIR that is searched for picardmetrics output files
# Returns 0 after printing the usage text when an argument is missing.
# Exits with status 1 when DIR is not a directory or the output directory
# cannot be created.
main_collate() {
  if [[ "$1" == "" || "$2" == "" ]]; then
    echo "Usage: picardmetrics collate PREFIX DIR"
    echo "  Find all picardmetrics output files in DIR and collate them"
    echo "  into a file named 'PREFIX-all-metrics.tsv'."
    return 0
  fi

  local prefix="$1"
  local dir="$2"

  if [[ ! -d "$dir" ]]; then
    warn "ERROR: Not a directory: '$dir'"
    exit 1
  fi

  # Quoted so that a prefix inside a directory with spaces creates exactly
  # one directory.
  local prefix_dir
  prefix_dir="$(dirname "$prefix")"
  if ! mkdir -p "$prefix_dir"; then
    warn "ERROR: mkdir -p '$prefix_dir'"
    exit 1
  fi

  warn -e "$(DATE)\tSTART\t$prefix"

  # CollectAlignmentMetrics
  collate_alignment_metrics "$prefix-alignment-metrics.tsv" "$dir"

  # CollectQualityMetrics
  collate_quality_metrics "$prefix-quality-histogram.tsv" "$dir"

  # CollectRnaSeqMetrics
  collate_rnaseq_metrics "$prefix-rnaseq-metrics.tsv" "$dir"
  collate_rnaseq_coverage "$prefix-rnaseq-coverage.tsv" "$dir"

  # CollectGcBiasMetrics
  collate_gc_bias_metrics "$prefix-gc-bias-metrics.tsv" "$dir"
  collate_gc_bias_histogram "$prefix-gc-bias-histogram.tsv" "$dir"

  # MarkDuplicates
  collate_duplicate_metrics "$prefix-duplicate-metrics.tsv" "$dir"

  # CollectInsertSizeMetrics
  collate_insert_size_metrics "$prefix-insert-size-metrics.tsv" "$dir"
  collate_insert_size_histogram "$prefix-insert-size-histogram.tsv" "$dir"

  # BaseDistributionByCycleMetrics
  collate_base_distribution_by_cycle \
    "$prefix-base-distribution-by-cycle-histogram.tsv" "$dir"

  # EstimateLibraryComplexity
  collate_library_complexity "$prefix-library-complexity.tsv" "$dir"
  collate_library_complexity_histogram \
    "$prefix-library-complexity-histogram.tsv" "$dir"

  # MAPQ statistics.
  collate_mapq_stats "$prefix-mapq-stats.tsv" "$dir"

  # Merge all collated files into one master file "$prefix-all-metrics.tsv".
  collate_all "$prefix"

  warn -e "$(DATE)\tDONE\t$prefix"
}

# Create 'file.refFlat' next to a GTF file ("picardmetrics refFlat").
#
# Arguments:
#   $1 - ignored (the subcommand name when called from main)
#   $2 - path to a .gtf or .gtf.gz file
# Outputs: the path of the refFlat file on stdout. An existing refFlat file
#   is reused without running any tool.
# Returns 0 after printing the usage text when $2 is not a GTF file name.
# Exits with status 1 when the GTF file is missing, gtfToGenePred is not
# installed, or the conversion fails.
main_refFlat() {
  local gtf="$2"
  # Ensure that the script was called with correct arguments.
  if [[ "$gtf" != *.gtf && "$gtf" != *.gtf.gz ]]; then
    echo "Usage: picardmetrics refFlat <file.gtf[.gz]>"
    echo "  Create 'file.refFlat' for CollectRnaSeqMetrics."
    return 0
  fi

  if [[ ! -f "$gtf" ]]; then
    warn "ERROR: File not found '$gtf'"
    exit 1
  fi

  local out="${gtf%.gtf*}.refFlat"
  if [[ -f "$out" ]]; then
    echo "$out"
    return 0
  fi

  # Check for the converter before creating any temporary file, so that
  # nothing is left behind when it is missing.
  if ! command -v gtfToGenePred &> /dev/null; then
    warn "ERROR: gtfToGenePred is not installed."
    warn "  Install gtfToGenePred to a directory in your PATH"
    warn "  http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/gtfToGenePred"
    exit 1
  fi

  # Uncompressed working copy of the GTF file.
  local tmp_gtf
  if ! tmp_gtf="$(mktemp)"; then
    warn "ERROR: mktemp failed"
    exit 1
  fi

  if [[ "$gtf" == *.gtf.gz ]]; then
    gunzip -c "$gtf" > "$tmp_gtf"
  else
    cat "$gtf" > "$tmp_gtf"
  fi

  # Convert, then duplicate the first column in place.
  gtfToGenePred -ignoreGroupsWithoutExons "$tmp_gtf" "$out" \
    && perl -i -lane 'print join "\t", ($F[0], @F)' "$out"
  local ERR=$?

  # The working copy is no longer needed, whatever the outcome.
  rm -f "$tmp_gtf"

  # Status 255 is tolerated, as before.
  if [[ "$ERR" != 0 && "$ERR" != 255 ]]; then
    # Do not leave a partial file that a later run would reuse.
    rm -f "$out"
    warn "Unexpected error $ERR from gtfToGenePred"
    exit 1
  fi

  echo "$out"
}

# Create 'file.rRNA.list' next to a GTF file ("picardmetrics rRNA").
#
# Arguments:
#   $1 - ignored (the subcommand name when called from main)
#   $2 - path to a .gtf or .gtf.gz file
# Outputs: the path of the list file on stdout. An existing list file is
#   reused. The list holds columns 1, 4, 5 and 7 of every 'transcript' line
#   that mentions "rRNA", followed by its transcript_id, sorted by the
#   first three columns.
# Returns 0 after printing the usage text when $2 is not a GTF file name.
# Exits with status 1 when the GTF file is missing or the list cannot be
# written.
main_rRNA() {
  local gtf="$2"
  # Ensure that the script was called with correct arguments.
  if [[ "$gtf" != *.gtf && "$gtf" != *.gtf.gz ]]; then
    echo "Usage: picardmetrics rRNA <file.gtf[.gz]>"
    echo "  Create 'file.rRNA.list' for CollectRnaSeqMetrics."
    return 0
  fi

  if [[ ! -f "$gtf" ]]; then
    warn "ERROR: File not found '$gtf'"
    exit 1
  fi

  local out="${gtf%.gtf*}.rRNA.list"
  if [[ -f "$out" ]]; then
    echo "$out"
    return 0
  fi

  # Uncompressed working copy of the GTF file.
  local tmp_gtf
  if ! tmp_gtf="$(mktemp)"; then
    warn "ERROR: mktemp failed"
    exit 1
  fi

  if [[ "$gtf" == *.gtf.gz ]]; then
    gunzip -c "$gtf" > "$tmp_gtf"
  else
    cat "$gtf" > "$tmp_gtf"
  fi

  # The status of the pipeline is the status of its last stage (sort).
  grep '"rRNA"' "$tmp_gtf" \
    | awk '$3 == "transcript"' \
    | cut -f1,4,5,7,9 \
    | perl -lane '
          /transcript_id "([^"]+)"/ or die "no transcript_id on $.";
          print join "\t", (@F[0,1,2,3], $1)
      ' \
    | sort -k1V -k2n -k3n \
    > "$out"
  local ERR=$?

  # The working copy is no longer needed, whatever the outcome.
  rm -f "$tmp_gtf"

  if [[ "$ERR" != 0 ]]; then
    # Do not leave a partial file that a later run would reuse.
    rm -f "$out"
    warn "ERROR: Unexpected error $ERR while making '$out'"
    exit 1
  fi

  echo "$out"
}

# Check for correct configuration. -------------------------------------------

# Load the configuration file and verify everything 'picardmetrics run'
# needs.
#
# Globals read:    PICARDMETRICS_CONFIG, RNASEQ, and the variables the
#                  configuration file may define (NICENESS, PICARD_JAR,
#                  PICARD, REFERENCE_SEQUENCE, REF_FLAT, GTF,
#                  RIBOSOMAL_INTERVALS)
# Globals written: PICARDMETRICS_CONFIG, NICENESS, NICE, REF_FLAT,
#                  RIBOSOMAL_INTERVALS
# Exits with status 1 when a required program, file or variable is missing.
configure_run() {
  # In order of preference:
  #     1. The user's specified config.
  #     2. The config in the current directory.
  #     3. The config in the home directory.
  if [[ -f "$PICARDMETRICS_CONFIG" ]]; then
    : # Keep the user's choice.
  elif [[ -f picardmetrics.conf ]]; then
    PICARDMETRICS_CONFIG=picardmetrics.conf
  elif [[ -f ~/picardmetrics.conf ]]; then
    PICARDMETRICS_CONFIG=~/picardmetrics.conf
  else
    warn "WARNING: Config file '$PICARDMETRICS_CONFIG' not found"
  fi
  warn -e "$(DATE)\tpicardmetrics config: '$PICARDMETRICS_CONFIG'"

  # Only source a file that exists; without a config file the variables
  # may still come from the environment.
  if [[ -f "$PICARDMETRICS_CONFIG" ]]; then
    local _picardmetrics_config_path="$PICARDMETRICS_CONFIG"
    # 'source' looks up a name without a slash in PATH before the current
    # directory, so anchor such names to the current directory.
    if [[ "$_picardmetrics_config_path" != */* ]]; then
      _picardmetrics_config_path="./$_picardmetrics_config_path"
    fi
    source "$_picardmetrics_config_path"
  fi

  # All jobs will be launched with niceness=20 by default.
  if [[ -z "$NICENESS" ]]; then
    warn "Setting NICENESS=20"
    NICENESS=20
  fi
  NICE="nice -n$NICENESS"

  if [[ ! -e "$PICARD_JAR" ]]; then
    warn "ERROR: Please download the 'picard.jar' file"
    warn "         https://broadinstitute.github.io/picard/"
    warn "       and define PICARD_JAR=/path/to/picard.jar"
    exit 1
  fi

  if [[ -z "$PICARD" ]]; then
    warn "ERROR: Please define the variable PICARD"
    warn '       For example: PICARD="java -jar $PICARD_JAR"'
    # Without PICARD no tool can be launched, so stop like the other
    # configuration errors do.
    exit 1
  fi

  if ! command -v samtools &> /dev/null; then
    warn "ERROR: Please install the 'samtools' program"
    warn "         https://github.com/samtools/samtools"
    exit 1
  fi

  if ! command -v stats &> /dev/null; then
    warn "ERROR: Please install the 'stats' program"
    warn "         https://github.com/arq5x/filo"
    exit 1
  fi

  if [[ ! -f "$REFERENCE_SEQUENCE" ]]; then
    warn "ERROR: FASTA file not found: '$REFERENCE_SEQUENCE'"
    warn "       Please define REFERENCE_SEQUENCE"
    warn '       For example: REFERENCE_SEQUENCE=~/data/hg19.fasta'
    exit 1
  fi

  # If the input BAM file comes from an RNA-seq experiment.
  if [[ "$RNASEQ" ]]; then
    # NOTE(review): when GTF is a file, the value derived from it replaces
    # an existing, valid REF_FLAT / RIBOSOMAL_INTERVALS. This matches the
    # previous behavior; confirm that it is intended.
    if [[ "$REF_FLAT" && ! -f "$REF_FLAT" ]]; then
      warn "ERROR: refFlat file not found: '$REF_FLAT'"
      exit 1
    elif [[ -f "$GTF" ]]; then
      # An 'exit' inside $(...) only ends the subshell, so the failure
      # has to be propagated explicitly.
      if ! REF_FLAT=$(main_refFlat 1 "$GTF"); then
        exit 1
      fi
    fi

    if [[ "$RIBOSOMAL_INTERVALS" && ! -f "$RIBOSOMAL_INTERVALS" ]]; then
      warn "ERROR: Ribosomal intervals file not found: '$RIBOSOMAL_INTERVALS'"
      exit 1
    elif [[ -f "$GTF" ]]; then
      if ! RIBOSOMAL_INTERVALS=$(main_rRNA 1 "$GTF"); then
        exit 1
      fi
    fi
  fi
}

# Call the Picard tools. -----------------------------------------------------

# Run every metrics step on one BAM file, writing the metrics files into
# the current directory.
#
# Arguments:
#   $1 - path to the BAM file (it is rewritten in place by the reorder,
#        sort and duplicate-marking steps)
#   $2 - non-empty when the reads come from an RNA-seq experiment
# Globals read: NICE
# Returns 1 when the BAM file does not exist.
#
# A metrics step is skipped when its output file exists and is not older
# than the BAM file.
compute_metrics() {
  local bam="$1" rnaseq="$2"
  if [[ ! -f "$bam" ]]; then
    warn "BAM file not found '$bam'"
    return 1
  fi

  # Reorder the BAM header to match the header of the reference sequence.
  reorder_sam "$bam"

  # Sort the reads by coordinate.
  sort_sam "$bam"

  local prefix
  prefix="$(basename "$bam" .bam)"

  # Kept local so that the names do not leak into the rest of the script.
  local mapq_stats metrics

  # Check if the BAM file is newer or if the metrics file is absent.
  mapq_stats="${prefix}.mapq_stats"
  if [[ "$bam" -nt "$mapq_stats" || ! -f "$mapq_stats" ]]; then
    # $NICE is unquoted on purpose: it holds a command plus its option.
    if ! ( $NICE samtools view "$bam" | cut -f5 | stats > "$mapq_stats" ); then
      warn "ERROR: mapq_stats failed"
    fi
  fi

  metrics="${prefix}.duplicate_metrics"
  if [[ "$bam" -nt "$metrics" || ! -f "$metrics" ]]; then
    # featureCounts can ignore the marked duplicate reads with --ignoreDup
    mark_duplicates "$bam"
  fi

  metrics="${prefix}.alignment_summary_metrics"
  if [[ "$bam" -nt "$metrics" || ! -f "$metrics" ]]; then
    multiple_metrics "$bam"
  fi

  if [[ "$rnaseq" ]]; then
    metrics="${prefix}.rnaseq_metrics"
    if [[ "$bam" -nt "$metrics" || ! -f "$metrics" ]]; then
      rnaseq_metrics "$bam"
    fi
  fi

  metrics="${prefix}.gc_bias_metrics"
  if [[ "$bam" -nt "$metrics" || ! -f "$metrics" ]]; then
    gcbias_metrics "$bam"
  fi

  metrics="${prefix}.library_complexity"
  if [[ "$bam" -nt "$metrics" || ! -f "$metrics" ]]; then
    library_complexity "$bam"
  fi
}

# Picard tools. --------------------------------------------------------------

# Run one Picard tool and capture its output in a log file.
#
# Arguments:
#   $1   - Picard tool name, e.g. MarkDuplicates
#   $2   - first tool argument; the text after its '=' names the BAM (or
#          reference) file and determines the log file name
#   $3.. - remaining KEY=VALUE arguments of the tool
# Globals read: NICE, PICARD
# Side effects: writes '<name>.<tool>.log' in the current directory, where
#   <name> is the base name of $2's value without '.bam'.
# Returns 1 (after copying the log to stderr) when the tool fails.
run_picard() {
  local tool="$1"; shift
  # Keep every argument as one array element so that values containing
  # spaces are passed to Picard unchanged.
  local args=("$@")
  local bam="${args[0]}"
  local log
  log="$(basename "${bam#*=}" .bam).${tool}.log"
  warn -e "$(DATE)\t$tool"
  # $NICE and $PICARD are unquoted on purpose: each holds a command
  # together with its options (e.g. PICARD="java -jar picard.jar").
  if ! $NICE $PICARD "$tool" "${args[@]}" &> "$log"; then
    warn "ERROR: $tool failed. See: $(pwd)/$log"
    cat "$(pwd)/$log" >&2
    return 1
  fi
}

# Create the Picard sequence dictionary of a reference FASTA file unless it
# already exists.
#
# Arguments:
#   $1 - path to the reference sequence; the dictionary path is the same
#        path with its last extension replaced by '.dict'
# Globals read: TEMP_DIR
create_sequence_dictionary() {
  local ref="$1"
  local dict="${ref%.*}.dict"
  if [[ ! -e "$dict" ]]; then
    local args=(
      CreateSequenceDictionary
      REFERENCE="$ref"
      OUTPUT="$dict"
      TMP_DIR="$TEMP_DIR"
    )
    # "${args[@]}" keeps each KEY=VALUE pair as one word, even when the
    # value contains spaces.
    run_picard "${args[@]}"
  fi
}

# Reorder a BAM file to match the reference sequence, replacing the file in
# place.
#
# Arguments:
#   $1 - path to the BAM file
# Globals read: REFERENCE_SEQUENCE, TEMP_DIR
# Exits with status 1 when the reordered file cannot be moved over $1.
reorder_sam() {
  local bam="$1"
  local out="${bam%.bam}.ReorderSam.bam"
  local args=(
    ReorderSam
    INPUT="$bam"
    OUTPUT="$out"
    REFERENCE="${REFERENCE_SEQUENCE}"
    VALIDATION_STRINGENCY=LENIENT
    TMP_DIR="$TEMP_DIR"
  )
  # "${args[@]}" keeps each KEY=VALUE pair as one word, even when the
  # value contains spaces.
  run_picard "${args[@]}"
  if ! mv -f "$out" "$bam"; then
    warn "ERROR: Failed to move '$out' to '$bam'"
    exit 1
  fi
}

# Sort a BAM file by coordinate, replacing the file in place. Nothing is
# done when bam_sorted reports that the file is already sorted.
#
# Arguments:
#   $1 - path to the BAM file
# Globals read: TEMP_DIR
# Exits with status 1 when the sorted file cannot be moved over $1.
sort_sam() {
  local bam="$1"
  local out="${bam%.bam}.SortSam.bam"
  if ! bam_sorted "$bam"; then
    local args=(
      SortSam
      INPUT="$bam"
      OUTPUT="$out"
      SORT_ORDER=coordinate
      VALIDATION_STRINGENCY=LENIENT
      TMP_DIR="$TEMP_DIR"
    )
    # "${args[@]}" keeps each KEY=VALUE pair as one word, even when the
    # value contains spaces.
    run_picard "${args[@]}"
    if ! mv -f "$out" "$bam"; then
      warn "ERROR: Failed to move '$out' to '$bam'"
      exit 1
    fi
  fi
}

# Mark (but do not remove) duplicate reads, replacing the BAM file in place
# and writing '<prefix>.duplicate_metrics' in the current directory.
#
# Arguments:
#   $1 - path to the BAM file
# Globals read: TEMP_DIR
# Exits with status 1 when the marked file cannot be moved over $1.
mark_duplicates() {
  local bam="$1"
  local out="${bam%.bam}.MarkDuplicates.bam"
  local prefix
  prefix="$(basename "$bam" .bam)"
  local args=(
    MarkDuplicates
    INPUT="$bam"
    OUTPUT="$out"
    METRICS_FILE="${prefix}.duplicate_metrics"
    REMOVE_DUPLICATES=false
    READ_NAME_REGEX=null
    VALIDATION_STRINGENCY=LENIENT
    TMP_DIR="$TEMP_DIR"
  )
  # "${args[@]}" keeps each KEY=VALUE pair as one word, even when the
  # value contains spaces.
  run_picard "${args[@]}"
  if ! mv -f "$out" "$bam"; then
    warn "ERROR: Failed to move '$out' to '$bam'"
    exit 1
  fi
}

# Run CollectMultipleMetrics (alignment summary, insert size and quality
# score distribution) on a BAM file. Output files are named after the BAM
# file's base name and written to the current directory.
#
# Arguments:
#   $1 - path to the BAM file
# Globals read: REFERENCE_SEQUENCE, TEMP_DIR
multiple_metrics() {
  local bam="$1"
  local prefix
  prefix="$(basename "$bam" .bam)"
  local args=(
    CollectMultipleMetrics
    INPUT="$bam"
    OUTPUT="$prefix"
    REFERENCE_SEQUENCE="$REFERENCE_SEQUENCE"
    VALIDATION_STRINGENCY=LENIENT
    PROGRAM=CollectAlignmentSummaryMetrics
    PROGRAM=CollectInsertSizeMetrics
    PROGRAM=QualityScoreDistribution
    TMP_DIR="$TEMP_DIR"
  )
  # "${args[@]}" keeps each KEY=VALUE pair as one word, even when the
  # value contains spaces.
  run_picard "${args[@]}"
}

# Run CollectRnaSeqMetrics on a BAM file, writing '<prefix>.rnaseq_metrics'
# and its PDF chart to the current directory.
#
# Arguments:
#   $1 - path to the BAM file
# Globals read: TEMP_DIR, RIBOSOMAL_INTERVALS, REFERENCE_SEQUENCE, REF_FLAT
# Exits with status 1 when the temporary interval list cannot be built.
rnaseq_metrics() {
  local bam="$1"
  local prefix
  prefix="$(basename "$bam" .bam)"
  # Ignore the sequence dictionary present in the .list file,
  # and instead take the sequence dictionary from the BAM file.
  local t="$TEMP_DIR/${prefix}.list"
  if ! (
    samtools view -H "$bam" | grep '^@SQ' > "$t" \
      && grep -v '^@SQ' "$RIBOSOMAL_INTERVALS" >> "$t"
  ); then
    warn "ERROR: Could not retrieve header from '$bam'"
    exit 1
  fi
  local args=(
    CollectRnaSeqMetrics
    INPUT="$bam"
    OUTPUT="${prefix}.rnaseq_metrics"
    CHART_OUTPUT="${prefix}.rnaseq_metrics.pdf"
    REFERENCE_SEQUENCE="$REFERENCE_SEQUENCE"
    REF_FLAT="$REF_FLAT"
    RIBOSOMAL_INTERVALS="$t"
    STRAND_SPECIFICITY=NONE
    VALIDATION_STRINGENCY=LENIENT
    TMP_DIR="$TEMP_DIR"
  )
  # "${args[@]}" keeps each KEY=VALUE pair as one word, even when the
  # value contains spaces.
  run_picard "${args[@]}"
  # Remove the temporary file after we're done with it.
  if ! rm -f "$t"; then
    warn "WARNING: Could not delete '$t'"
  fi
}

# Run CollectGcBiasMetrics on a BAM file, writing the histogram, its PDF
# chart and the summary metrics to the current directory.
#
# Arguments:
#   $1 - path to the BAM file
# Globals read: REFERENCE_SEQUENCE, TEMP_DIR
gcbias_metrics() {
  local bam="$1"
  local prefix
  prefix="$(basename "$bam" .bam)"
  local args=(
    CollectGcBiasMetrics
    INPUT="$bam"
    OUTPUT="${prefix}.gc_bias_histogram"
    CHART_OUTPUT="${prefix}.gc_bias_histogram.pdf"
    SUMMARY_OUTPUT="${prefix}.gc_bias_metrics"
    REFERENCE_SEQUENCE="$REFERENCE_SEQUENCE"
    VALIDATION_STRINGENCY=LENIENT
    TMP_DIR="$TEMP_DIR"
  )
  # "${args[@]}" keeps each KEY=VALUE pair as one word, even when the
  # value contains spaces.
  run_picard "${args[@]}"
}

# Run EstimateLibraryComplexity on a BAM file, writing
# '<prefix>.library_complexity' to the current directory.
#
# Arguments:
#   $1 - path to the BAM file
# Globals read: TEMP_DIR
library_complexity() {
  local bam="$1"
  local prefix
  prefix="$(basename "$bam" .bam)"
  local args=(
    EstimateLibraryComplexity
    INPUT="$bam"
    OUTPUT="${prefix}.library_complexity"
    VALIDATION_STRINGENCY=LENIENT
    TMP_DIR="$TEMP_DIR"
  )
  # "${args[@]}" keeps each KEY=VALUE pair as one word, even when the
  # value contains spaces.
  run_picard "${args[@]}"
}

# Collate functions. ---------------------------------------------------------

collate_alignment_metrics() {
  local out="$1" dir="$2"
  local files=($(find "$dir" -name '*.alignment_summary_metrics' | sort))
  [[ ${#files[@]} -eq 0 ]] && return 1
  warn -e "$(DATE)\tCollating ${#files[@]} alignment_summary_metrics files"
  (
    echo -ne "SAMPLE\t"
    grep -A1 '## METRICS' "${files[0]}" | tail -n1 | cut -f1-22
    local f
    for f in ${files[*]}
    do
      local sample=$(sample_name "$f")
      grep -A1000000 '## METRICS' "$f" | tail -n+3 | \
        grep -Pv '^$' | cut -f1-22 | perl -pe 's{^}{'$sample'\t}'
    done
  ) > "$out"
}

collate_rnaseq_metrics() {
  local out="$1" dir="$2"
  local files=($(find "$dir" -name '*.rnaseq_metrics' | sort))
  [[ ${#files[@]} -eq 0 ]] && return 1
  warn -e "$(DATE)\tCollating ${#files[@]} rnaseq_metrics files (summary)"
  (
    echo -ne "SAMPLE\t"
    grep -A1 '## METRICS' "${files[0]}" | tail -n1 | cut -f1-22
    local f
    for f in ${files[*]}
    do
      local sample=$(sample_name "$f")
      grep -A2 '## METRICS' "$f" | tail -n1 | cut -f1-22 \
        | perl -pe 's{^}{'$sample'\t}'
    done
  ) > "$out"
}

collate_rnaseq_coverage() {
  local out="$1" dir="$2"
  local files=($(find "$dir" -name '*.rnaseq_metrics' | sort))
  [[ ${#files[@]} -eq 0 ]] && return 1
  warn -e "$(DATE)\tCollating ${#files[@]} rnaseq_metrics files (coverage)"
  (
    echo -ne "SAMPLE\t"
    grep -A1 '## HISTOGRAM' "${files[0]}" | tail -n1 | perl -pe 's/\s+$/\n/'
    local f
    for f in ${files[*]}
    do
      local sample=$(sample_name "$f")
      grep -A1000000 '## HISTOGRAM' "$f" | tail -n+3 \
        | grep -Pv '^$' | perl -pe 's{^}{'$sample'\t}'
    done
  ) > "$out"
}

collate_quality_metrics() {
  local out="$1" dir="$2"
  local files=($(find "$dir" -name '*.quality_distribution_metrics' | sort))
  [[ ${#files[@]} -eq 0 ]] && return 1
  warn -e "$(DATE)\tCollating ${#files[@]} quality_distribution_metrics files"
  (
    echo -ne "SAMPLE\t"
    grep -A1 '## HISTOGRAM' "${files[0]}" | tail -n1 | perl -pe 's/\s+$/\n/'
    local f
    for f in ${files[*]}
    do
      local sample=$(sample_name "$f")
      grep -A1000000 '## HISTOGRAM' "$f" | tail -n+3 | grep -Pv '^$' \
        | perl -pe 's{^}{'$sample'\t}'
    done
  ) > "$out"
}

collate_gc_bias_histogram() {
  local out="$1" dir="$2"
  local files=($(find "$dir" -name '*.gc_bias_histogram' | sort))
  [[ ${#files[@]} -eq 0 ]] && return 1
  warn -e "$(DATE)\tCollating ${#files[@]} gc_bias_histogram files"
  (
    echo -ne "SAMPLE\t"
    grep -A1 '## METRICS' "${files[0]}" | tail -n1 | perl -pe 's/\s+$/\n/'
    local f
    for f in ${files[*]}
    do
      local sample=$(sample_name "$f")
      grep -A1000000 '## METRICS' "$f" | tail -n+3 | grep -Pv '^$' \
        | perl -pe 's{^}{'$sample'\t}'
    done
  ) > "$out"
}

collate_gc_bias_metrics() {
  local out="$1" dir="$2"
  local files=($(find "$dir" -name '*.gc_bias_metrics' | sort))
  [[ ${#files[@]} -eq 0 ]] && return 1
  warn -e "$(DATE)\tCollating ${#files[@]} gc_bias_metrics files"
  (
    echo -ne "SAMPLE\t"
    grep -A1 '## METRICS' "${files[0]}" | tail -n1 | perl -pe 's/\s+$/\n/'
    local f
    for f in ${files[*]}
    do
      local sample=$(sample_name "$f")
      grep -A2 '## METRICS' "$f" | tail -n1 \
        | perl -pe 's{^}{'$sample'\t};s/\s+$/\n/'
    done
  ) > "$out"
}

collate_insert_size_metrics() {
  local out="$1" dir="$2"
  local files=($(find "$dir" -name '*.insert_size_metrics' | sort))
  [[ ${#files[@]} -eq 0 ]] && return 1
  warn -e "$(DATE)\tCollating ${#files[@]} insert_size_metrics files"
  (
    echo -ne "SAMPLE\t"
    grep -A1 '## METRICS' "${files[0]}" | tail -n1 | cut -f1-18
    local f
    for f in ${files[*]}
    do
      local sample=$(sample_name "$f")
      grep -A2 '## METRICS' "$f" | tail -n1 | cut -f1-18 \
        | perl -pe 's{^}{'$sample'\t}'
    done
  ) > "$out"
}

collate_insert_size_histogram() {
  local out="$1" dir="$2"
  local files=($(find "$dir" -name '*.insert_size_metrics' | sort))
  [[ ${#files[@]} -eq 0 ]] && return 1
  warn -e "$(DATE)\tCollating ${#files[@]} insert_size_metrics files (histogram)"
  (
    echo -ne "SAMPLE\t"
    grep -A1 '## HISTOGRAM' "${files[0]}" | tail -n1 | perl -pe 's/\s+$/\n/'
    local f
    for f in ${files[*]}
    do
      local sample=$(sample_name "$f")
      grep -A1000000 '## HISTOGRAM' "$f" | tail -n+3 \
        | grep -Pv '^$' | perl -pe 's{^}{'$sample'\t}'
    done
  ) > "$out"
}

collate_base_distribution_by_cycle() {
  local out="$1" dir="$2"
  local files=($(find "$dir" -name '*.base_distribution_by_cycle_metrics' | sort))
  [[ ${#files[@]} -eq 0 ]] && return 1
  warn -e "$(DATE)\tCollating ${#files[@]} base_distribution_by_cycle files"
  (
    echo -ne "SAMPLE\t"
    grep -A1 '## METRICS' "${files[0]}" | tail -n1 | cut -f1-7
    local f
    for f in ${files[*]}
    do
      local sample=$(sample_name "$f")
      grep -A1000000 '## METRICS' "$f" | tail -n+3 | cut -f1-7 \
        | grep -Pv '^$' | perl -pe 's{^}{'$sample'\t}'
    done
  ) > "$out"
}

collate_duplicate_metrics() {
  local out="$1" dir="$2"
  local files=($(find "$dir" -name '*.duplicate_metrics' | sort))
  [[ ${#files[@]} -eq 0 ]] && return 1
  warn -e "$(DATE)\tCollating ${#files[@]} duplicate_metrics files"
  (
    echo -ne "SAMPLE\t"
    grep -A1 '## METRICS' "${files[0]}" | tail -n1 | cut -f1-9
    local f
    for f in ${files[*]}
    do
      local sample=$(sample_name "$f")
      grep -A2 '## METRICS' "$f" | tail -n1 | cut -f1-9 \
        | perl -pe 's{^}{'$sample'\t}'
    done
  ) > "$out"
}

collate_library_complexity() {
  local out="$1" dir="$2"
  local files=($(find "$dir" -name '*.library_complexity' | sort))
  [[ ${#files[@]} -eq 0 ]] && return 1
  warn -e "$(DATE)\tCollating ${#files[@]} library_complexity files"
  (
    echo -ne "SAMPLE\t"
    grep -A1 '## METRICS' "${files[0]}" | tail -n1 | cut -f1-9
    local f
    for f in ${files[*]}
    do
      local sample=$(sample_name "$f")
      grep -A2 '## METRICS' "$f" | tail -n1 | cut -f1-9 \
        | perl -pe 's{^}{'$sample'\t}'
    done
  ) > "$out"
}

collate_library_complexity_histogram() {
  local out="$1" dir="$2"
  local files=($(find "$dir" -name '*.library_complexity' | sort))
  [[ ${#files[@]} -eq 0 ]] && return 1
  warn -e "$(DATE)\tCollating ${#files[@]} library_complexity files (histogram)"
  (
    echo -ne "SAMPLE\t"
    grep -A1 '## HISTOGRAM' "${files[0]}" | tail -n1 | cut -f1 \
      | perl -pe 's/\s+$/\n/; s/(\S+)/$1\tfragments/;'
    local f
    for f in ${files[*]}
    do
      local sample=$(sample_name "$f")
      grep -A1000000 '## HISTOGRAM' "$f" | tail -n+3 \
        | grep -Pv '^$' | perl -pe 's{^}{'$sample'\t}'
    done
  ) > "$out"
}

collate_mapq_stats() {
  local out="$1" dir="$2"
  local files=($(find "$dir" -name '*.mapq_stats' | sort))
  [[ ${#files[@]} -eq 0 ]] && return 1
  warn -e "$(DATE)\tCollating ${#files[@]} mapq_stats files"
  (
    echo -e \
    "SAMPLE\tMAPQ_Mean\tMAPQ_Median\tMAPQ_SD\tMAPQ_Mode\tMAPQ_ModePercent"
    local f
    for f in ${files[*]}
    do
      local sample=$(sample_name "$f")
      echo -ne "$sample\t"
      cat $f | perl -lne '
      BEGIN{ $n = 0; $mean = 0; $median = 0; $sd = 0; $mode = 0; $m = 0; }
      /Total lines:\s+(\S+)/  && ($n = $1);
      /Ari. Mean:\s+(\S+)/  && ($mean = $1);
      /Median:\s+(\S+)/     && ($median = $1);
      /^Mode:\s+(\S+) \(N=(\d+)\)/  && ($mode = $1, $m = 100 * $2 / $n);
      /StdDev:\s+(\S+)/     && ($sd = $1);
      END{ print join "\t", ($mean, $median, $sd, $mode, $m) }
      '
    done
  ) > "$out"
}

# Join the collated per-tool tables into '<prefix>-all-metrics.tsv'.
#
# Arguments:
#   $1 - prefix shared by the collated TSV files
#
# The work is done by the embedded R script, which receives the prefix as
# its only argument. It requires '<prefix>-alignment-metrics.tsv', merges
# the optional tables into it by SAMPLE, deletes the merged intermediate
# files and writes the result.
collate_all() {
  local prefix="$1"
  warn -e "$(DATE)\tJoining all files into '$prefix-all-metrics.tsv'"
  # The prefix is quoted so that a prefix containing spaces reaches R as a
  # single argument.
  Rscript --vanilla <(echo '
main <- function() {
  args <- commandArgs(trailingOnly = TRUE)
  prefix <- args[1]
  dat <- read_metrics(prefix)
  write_tsv(x = dat, file = sprintf("%s-all-metrics.tsv", prefix))
}

write_tsv <- function(...) {
  write.table(sep = "\t", quote = FALSE, row.names = FALSE, ...)
}

read_tsv <- function(filename, ...) {
  if (!file.exists(filename)) {
    warning("File does not exist: ", filename)
    return(NULL)
  }
  dat <- read.delim(filename, stringsAsFactors = FALSE, ...)
  return(dat)
}

read_metrics <- function(prefix) {
  dat_align_metrics = read_tsv(
    sprintf("%s-alignment-metrics.tsv", prefix)
  )
  # This file must be present. The others are optional.
  if (is.null(dat_align_metrics)) {
    stop(sprintf("Failed to read %s-alignment-metrics.tsv", prefix))
  }

  dat_duplicate_metrics = read_tsv(
    sprintf("%s-duplicate-metrics.tsv", prefix)
  )
  dat_gcbias_metrics = read_tsv(
    sprintf("%s-gc-bias-metrics.tsv", prefix)
  )
  dat_insert_size_metrics = read_tsv(
    sprintf("%s-insert-size-metrics.tsv", prefix)
  )
  dat_library_complexity = read_tsv(
    sprintf("%s-library-complexity.tsv", prefix)
  )
  is_rnaseq <- file.exists(sprintf("%s-rnaseq-metrics.tsv", prefix))
  if (is_rnaseq) {
    dat_rnaseq_metrics = read_tsv(
      sprintf("%s-rnaseq-metrics.tsv", prefix)
    )
  }
  dat_mapq_stats <- read_tsv(
    sprintf("%s-mapq-stats.tsv", prefix)
  )

  # Exclude FIRST_OF_PAIR and SECOND_OF_PAIR.
  idx = !dat_align_metrics$CATEGORY %in% c("FIRST_OF_PAIR", "SECOND_OF_PAIR")
  dat_align_metrics = dat_align_metrics[idx, ]

  if (is_rnaseq && !is.null(dat_rnaseq_metrics)) {
    # alignment_metrics and rnaseq_metrics both have this column.
    idx <- colnames(dat_rnaseq_metrics) == "PF_ALIGNED_BASES"
    colnames(dat_rnaseq_metrics)[idx] <- "PF_ALIGNED_BASES_rnaseq_metrics"

    stopifnot( all(dat_align_metrics$SAMPLE == dat_rnaseq_metrics$SAMPLE) )

    dat_align_metrics <- merge(dat_align_metrics, dat_rnaseq_metrics, by = "SAMPLE")
  }

  if (!is.null(dat_duplicate_metrics)) {
    stopifnot( all(dat_align_metrics$SAMPLE == dat_duplicate_metrics$SAMPLE) )
    dat_align_metrics <- merge(dat_align_metrics, dat_duplicate_metrics, by = "SAMPLE")
  }

  if (!is.null(dat_gcbias_metrics)) {
    stopifnot( all(dat_align_metrics$SAMPLE == dat_gcbias_metrics$SAMPLE) )
    dat_align_metrics <- merge(dat_align_metrics, dat_gcbias_metrics, by = "SAMPLE")
  }

  if (!is.null(dat_insert_size_metrics)) {
    stopifnot( all(dat_align_metrics$SAMPLE == dat_insert_size_metrics$SAMPLE) )
    dat_align_metrics <- merge(dat_align_metrics, dat_insert_size_metrics, by = "SAMPLE")
  }

  if (!is.null(dat_library_complexity)) {
    # duplicate_metrics and library_complexity share columns
    cnames <- colnames(dat_library_complexity)
    cnames[2:length(cnames)] <- paste(
      cnames[2:length(cnames)], "_library_complexity",
      sep = ""
    )
    colnames(dat_library_complexity) <- cnames

    stopifnot( all(dat_align_metrics$SAMPLE == dat_library_complexity$SAMPLE) )
    dat_align_metrics <- merge(dat_align_metrics, dat_library_complexity, by = "SAMPLE")
  }

  if (!is.null(dat_mapq_stats)) {
    stopifnot( all(dat_align_metrics$SAMPLE == dat_mapq_stats$SAMPLE) )
    dat_align_metrics <- merge(dat_align_metrics, dat_mapq_stats, by = "SAMPLE")
  }

  # Delete intermediate files that are merged into "%s-all-metrics.tsv".
  filenames <- sprintf(c(
    "%s-alignment-metrics.tsv",
    "%s-duplicate-metrics.tsv",
    "%s-gc-bias-metrics.tsv",
    "%s-insert-size-metrics.tsv",
    "%s-library-complexity.tsv",
    "%s-rnaseq-metrics.tsv",
    "%s-mapq-stats.tsv"
  ), prefix)
  sapply(filenames, function(filename) {
    if (file.exists(filename)) {
      unlink(filename, force = TRUE)
    }
  })

  return(dat_align_metrics)
}

main()
  ') "$prefix"
}

# Extra functions. -----------------------------------------------------------

# Print a message on stderr.
#
# Arguments are handed to 'echo' unchanged, so callers may pass '-e' as the
# first argument to have backslash escapes such as \t interpreted. Quoting
# "$@" keeps the spacing of the message intact and prevents characters such
# as '*' in a message from being expanded to file names.
warn() {
  echo "$@" >&2
}

# Print the current local time as 'YYYY-MM-DD HH:MM:SS'.
DATE() {
  local stamp_format='+%Y-%m-%d %H:%M:%S'
  # 'command' bypasses any shell function or alias named 'date'.
  command date "$stamp_format"
}

# Print a path without its last file extension: 'a.b.c.def' -> 'a.b.c'.
#
# Arguments:
#   $1 - file name or path
# printf is used instead of echo so that a name such as '-n.txt' is printed
# literally rather than being taken for an echo option.
sample_name() {
  printf '%s\n' "${1%.*}"
}

# Succeed when $1 names an existing regular file whose name ends in '.bam'.
is_bam() {
  local candidate="$1"
  case "$candidate" in
    *.bam)
      [[ -f "$candidate" ]]
      ;;
    *)
      return 1
      ;;
  esac
}

# Succeed when 'samtools view -H' reports 'EOF marker is absent' for the
# file $1. A missing or empty file is never reported as incomplete
# (status 1).
bam_incomplete() {
  local candidate="$1"
  [[ -s "$candidate" ]] || return 1
  # stderr is merged into stdout so that the message can be searched for
  # wherever samtools prints it; the function's status is that of grep.
  samtools view -H "$candidate" 2>&1 | grep -q 'EOF marker is absent'
}

# Succeed when the output of 'samtools view -H' for the file $1 contains
# 'SO:coordinate'. A path that is not a regular file yields status 1.
bam_sorted() {
  local candidate="$1"
  [[ -f "$candidate" ]] || return 1
  # The function's status is that of grep.
  samtools view -H "$candidate" 2>&1 | grep -q 'SO:coordinate'
}

# Print the absolute path of an existing file or directory without
# resolving symlinks. Nothing is printed for a path that is neither.
#
# Arguments:
#   $1 - path to a file or directory
# Exits with status 1 when the containing directory cannot be entered.
bash_realpath() {
  local target="$1"
  if [[ -f "$target" ]]; then
    local name parent
    name=$(basename "$target")
    parent=$(dirname "$target")
    # The subshell keeps the caller's working directory unchanged.
    if ! (cd "$parent" && echo "$PWD/$name"); then
      warn "ERROR: Failed to cd '$parent'"
      exit 1
    fi
  elif [[ -d "$target" ]]; then
    if ! (cd "$target" && echo "$PWD"); then
      warn "ERROR: Failed to cd '$target'"
      exit 1
    fi
  fi
}

# Script entry point: hand all command-line arguments to main unchanged.
main "$@"